From 1aa0aac9f98d33c113a8baba2a5e3ca4f5243a86 Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Tue, 25 Sep 2018 21:02:23 +0100 Subject: [PATCH] Create Application class; scrape into Application objects. #8 --- lib/uk_planning_scraper.rb | 1 + lib/uk_planning_scraper/application.rb | 46 +++++++++++++++++ lib/uk_planning_scraper/authority.rb | 17 ++++--- lib/uk_planning_scraper/idox.rb | 68 ++++++++++++-------------- lib/uk_planning_scraper/northgate.rb | 37 +++++--------- 5 files changed, 102 insertions(+), 67 deletions(-) create mode 100644 lib/uk_planning_scraper/application.rb diff --git a/lib/uk_planning_scraper.rb b/lib/uk_planning_scraper.rb index bdd2b57..ece96e1 100644 --- a/lib/uk_planning_scraper.rb +++ b/lib/uk_planning_scraper.rb @@ -1,5 +1,6 @@ require "uk_planning_scraper/version" require "uk_planning_scraper/authority" +require "uk_planning_scraper/application" require 'uk_planning_scraper/idox' require 'uk_planning_scraper/northgate' require 'logger' diff --git a/lib/uk_planning_scraper/application.rb b/lib/uk_planning_scraper/application.rb new file mode 100644 index 0000000..5c17550 --- /dev/null +++ b/lib/uk_planning_scraper/application.rb @@ -0,0 +1,46 @@ +module UKPlanningScraper + class Application + attr_accessor :authority_name + attr_accessor :council_reference + attr_accessor :date_received + attr_accessor :date_validated + attr_accessor :status + attr_accessor :scraped_at + attr_accessor :info_url + attr_accessor :address + attr_accessor :description + attr_accessor :documents_count + attr_accessor :documents_url + attr_accessor :alternative_reference + attr_accessor :decision + attr_accessor :date_decision + attr_accessor :appeal_status + attr_accessor :appeal_decision + + def to_hash + { + scraped_at: @scraped_at, + authority_name: @authority_name, + council_reference: @council_reference, + date_received: @date_received, + date_validated: @date_validated, + status: @status, + decision: @decision, + date_decision: @date_decision, + info_url: @info_url, + address: @address, + description: @description, + documents_count: @documents_count, + documents_url: @documents_url, + alternative_reference: @alternative_reference, + appeal_status: @appeal_status, + appeal_decision: @appeal_decision + } + end + + def valid? + return true if @authority_name && @council_reference && @info_url + false + end + end +end diff --git a/lib/uk_planning_scraper/authority.rb b/lib/uk_planning_scraper/authority.rb index f99e311..2f4e5f8 100644 --- a/lib/uk_planning_scraper/authority.rb +++ b/lib/uk_planning_scraper/authority.rb @@ -9,6 +9,7 @@ module UKPlanningScraper @name = name @url = url @tags = tags + @applications = [] # Application objects end def scrape(params, options = {}) @@ -41,19 +42,23 @@ module UKPlanningScraper # Select which scraper to use case system when 'idox' - apps = scrape_idox(params, options) + @applications = scrape_idox(params, options) when 'northgate' - apps = scrape_northgate(params, options) + @applications = scrape_northgate(params, options) else raise SystemNotSupported.new("Planning system not supported for #{@name} at URL: #{@url}") end # Post processing - apps.each do |app| - app[:authority_name] = @name + @applications.each do |app| + app.authority_name = @name end - - apps # Single point of successful exit + + # Output as an array of hashes + output = [] + # FIXME - silently ignores invalid apps. How should we handle them? + @applications.each { |app| output << app.to_hash if app.valid? } + output # Single point of successful exit end def tagged?(tag) diff --git a/lib/uk_planning_scraper/idox.rb b/lib/uk_planning_scraper/idox.rb index fd23eb6..d7e0af5 100644 --- a/lib/uk_planning_scraper/idox.rb +++ b/lib/uk_planning_scraper/idox.rb @@ -48,8 +48,7 @@ module UKPlanningScraper # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' - - + page = form.submit if page.search('.errors').inner_text.match(/Too many results found/i) @@ -63,7 +62,7 @@ module UKPlanningScraper puts "Found #{items.size} apps on this page." items.each do |app| - data = {} + data = Application.new # Parse info line info_line = app.at("p.metaInfo").inner_text.strip @@ -71,32 +70,30 @@ module UKPlanningScraper bits.each do |bit| if matches = bit.match(/Ref\. No:\s+(.+)/) - data[:council_reference] = matches[1] + data.council_reference = matches[1] end if matches = bit.match(/(Received|Registered):\s+(.+)/) - data[:date_received] = Date.parse(matches[2]) + data.date_received = Date.parse(matches[2]) end if matches = bit.match(/Validated:\s+(.+)/) - data[:date_validated] = Date.parse(matches[1]) + data.date_validated = Date.parse(matches[1]) end if matches = bit.match(/Status:\s+(.+)/) - data[:status] = matches[1] + data.status = matches[1] end end - data.merge!({ - scraped_at: Time.now, - info_url: base_url + app.at('a')['href'], - address: app.at('p.address').inner_text.strip, - description: app.at('a').inner_text.strip, - }) + data.scraped_at = Time.now + data.info_url = base_url + app.at('a')['href'] + data.address = app.at('p.address').inner_text.strip + data.description = app.at('a').inner_text.strip apps << data end - + # Get the Next button from the pager, if there is one if next_button = page.at('a.next') next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' @@ -111,28 +108,27 @@ module UKPlanningScraper # Scrape the summary tab for each app apps.each_with_index do |app, i| sleep options[:delay] - puts "#{i + 1} of #{apps.size}: #{app[:info_url]}" - res = agent.get(app[:info_url]) + puts "#{i + 1} of #{apps.size}: #{app.info_url}" + res = agent.get(app.info_url) if res.code == '200' # That's a String not an Integer, ffs # Parse the summary tab for this app - app[:scraped_at] = Time.now + app.scraped_at = Time.now # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead) # Bradford has #tab_documents but without the document count on it - app[:documents_count] = 0 - app[:documents_url] = nil + app.documents_count = 0 if documents_link = res.at('.associateddocument a') if documents_link.inner_text.match(/\d+/) - app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i - app[:documents_url] = base_url + documents_link[:href] + app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i + app.documents_url = base_url + documents_link[:href] end elsif documents_link = res.at('#tab_documents') if documents_link.inner_text.match(/\d+/) - app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i - app[:documents_url] = base_url + documents_link[:href] + app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i + app.documents_url = base_url + documents_link[:href] end end @@ -145,31 +141,31 @@ module UKPlanningScraper case key when 'Reference' - app[:council_reference] = value + app.council_reference = value when 'Alternative Reference' - app[:alternative_reference] = value + app.alternative_reference = value unless value.empty? when 'Planning Portal Reference' - app[:alternative_reference] = value + app.alternative_reference = value unless value.empty? when 'Application Received' - app[:date_received] = Date.parse(value) if value.match(/\d/) + app.date_received = Date.parse(value) if value.match(/\d/) when 'Application Registered' - app[:date_received] = Date.parse(value) if value.match(/\d/) + app.date_received = Date.parse(value) if value.match(/\d/) when 'Application Validated' - app[:date_validated] = Date.parse(value) if value.match(/\d/) + app.date_validated = Date.parse(value) if value.match(/\d/) when 'Address' - app[:address] = value + app.address = value unless value.empty? when 'Proposal' - app[:description] = value + app.description = value unless value.empty? when 'Status' - app[:status] = value + app.status = value unless value.empty? when 'Decision' - app[:decision] = value + app.decision = value unless value.empty? when 'Decision Issued Date' - app[:date_decision] = Date.parse(value) if value.match(/\d/) + app.date_decision = Date.parse(value) if value.match(/\d/) when 'Appeal Status' - app[:appeal_status] = value + app.appeal_status = value unless value.empty? when 'Appeal Decision' - app[:appeal_decision] = value + app.appeal_decision = value unless value.empty? else puts "Error: key '#{key}' not found" end # case diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb index 56692a5..9735fb7 100644 --- a/lib/uk_planning_scraper/northgate.rb +++ b/lib/uk_planning_scraper/northgate.rb @@ -50,9 +50,6 @@ module UKPlanningScraper form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD end - - # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS'] - logger.info "Form variables: #{form_vars.to_s}" headers = { @@ -110,29 +107,19 @@ module UKPlanningScraper rows.each do |row| if row.at("td") # skip header row which only has th's cells = row.search("td") - ref = cells[0].inner_text.strip - - app = { - scraped_at: Time.now, - # date_scraped: Date.today # FIXME - Planning Alerts compatibility? - } - - app[:council_reference] = ref - app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip) - app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this? - app[:address] = cells[1].inner_text.strip - app[:description] = cells[2].inner_text.strip - app[:status] = cells[3].inner_text.strip - + + app = Application.new + app.scraped_at = Time.now + app.council_reference = cells[0].inner_text.strip + app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip) + app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this? + app.address = cells[1].inner_text.strip + app.description = cells[2].inner_text.strip + app.status = cells[3].inner_text.strip raw_date_received = cells[4].inner_text.strip - - if raw_date_received != '--' - app[:date_received] = Date.parse(raw_date_received) - else - app[:date_received] = nil - end - - app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney + app.date_received = Date.parse(raw_date_received) if raw_date_received != '--' + app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney + apps << app end end