# Reconstructed from a git format-patch whose line breaks had been collapsed.
#
#   Commit:  1efb7c57d44ccd3718138dc0b7d6a030b359102f
#   Author:  Graeme Porteous
#   Date:    Mon, 15 Apr 2019 09:56:12 +0100
#   Subject: Extract summary scraping into separate method
#   File:    lib/uk_planning_scraper/idox.rb
#            (1 file changed, 75 insertions, 63 deletions)
#
# The patch turns the `base_url` and `agent` locals of scrape_idox into
# memoized private helpers and moves the per-application summary scraping
# into parse_info_url. Only the methods the patch shows in full are defined
# below.
#
# scrape_idox itself is only partly visible (its middle lies between the two
# diff hunks), so it is NOT redefined here. Its visible post-patch head and
# tail read:
#
#   def scrape_idox(params, options)
#     puts "Using Idox scraper."
#     apps = []
#
#     puts "Getting: #{@url}"
#     page = agent.get(@url) # load the search form page
#     # ... lines outside the patch hunks ...
#     apps.each_with_index do |app, i|
#       sleep options[:delay]
#       puts "#{i + 1} of #{apps.size}: #{app.info_url}"
#       parse_info_url(app) if app.info_url
#     end # scrape summary tab for apps
#     apps
#   end # scrape_idox
#
# NOTE(review): Mechanize and Date are assumed to be loaded by the rest of the
# library; the patch context only shows `require 'pp'`. Confirm before use.
module UKPlanningScraper
  class Authority
    private

    # Scheme and host part of @url (e.g. "https://example.gov.uk"), memoized.
    #
    # Unlike the previous `match(...)[1]` form, this also works when @url has
    # no path component, and fails with a descriptive error instead of a
    # NoMethodError on nil when @url is not an http(s) URL.
    #
    # @return [String]
    # @raise [ArgumentError] if no http(s) origin can be found in @url
    def base_url
      @base_url ||= begin
        origin = @url.to_s[%r{https?://[^/?#]+}]
        raise ArgumentError, "Cannot derive base URL from #{@url.inspect}" unless origin

        origin
      end
    end

    # Shared, lazily created Mechanize agent used for every request.
    def agent
      @agent ||= Mechanize.new
    end

    # Fetches app.info_url and copies the summary tab details onto app.
    #
    # On a non-200 response an error line is printed and app is left untouched.
    # The return value is not meaningful to callers.
    #
    # @param app [#info_url] application record; its attribute writers are
    #   called for every recognised row of the summary table
    def parse_info_url(app)
      res = agent.get(app.info_url)

      unless res.code == '200' # the status code is a String, not an Integer
        puts "Error: HTTP #{res.code}"
        return
      end

      app.scraped_at = Time.now
      scrape_idox_documents_link(app, res)

      # We need to find values in the table by using the th labels.
      # The row indexes/positions change from site to site (or even app to
      # app) so we can't rely on that.
      res.search('#simpleDetailsTable tr').each do |row|
        heading = row.at('th')
        cell = row.at('td')
        # A row without both cells carries no key/value pair; skip it rather
        # than crash on nil.
        next unless heading && cell

        assign_idox_summary_field(app, heading.inner_text.strip, cell.inner_text.strip)
      end
    end

    # Sets app.documents_count (0 by default) and, when a count is shown,
    # app.documents_url from the documents link on the summary page.
    #
    # The Documents tab doesn't show if there are no documents (we get
    # li.nodocuments instead). Bradford has #tab_documents but without the
    # document count on it, in which case the count stays 0 and no URL is set.
    def scrape_idox_documents_link(app, res)
      app.documents_count = 0

      link = res.at('.associateddocument a') || res.at('#tab_documents')
      return unless link

      count = link.inner_text[/\d+/]
      return unless count

      app.documents_count = count.to_i
      app.documents_url = base_url + link[:href]
    end

    # Stores one summary-table value on app according to its row label.
    # Text fields are only set when non-empty; date fields only when the value
    # contains a digit. Unknown labels are reported on stdout.
    def assign_idox_summary_field(app, key, value)
      case key
      when 'Reference'
        app.council_reference = value
      when 'Alternative Reference', 'Planning Portal Reference'
        app.alternative_reference = value unless value.empty?
      when 'Application Received', 'Application Registered'
        app.date_received = Date.parse(value) if value.match(/\d/)
      when 'Application Validated'
        app.date_validated = Date.parse(value) if value.match(/\d/)
      when 'Address'
        app.address = value unless value.empty?
      when 'Proposal'
        app.description = value unless value.empty?
      when 'Status'
        app.status = value unless value.empty?
      when 'Decision'
        app.decision = value unless value.empty?
      when 'Decision Issued Date'
        app.date_decision = Date.parse(value) if value.match(/\d/)
      when 'Appeal Status'
        app.appeal_status = value unless value.empty?
      when 'Appeal Decision'
        app.appeal_decision = value unless value.empty?
      else
        puts "Error: key '#{key}' not found"
      end
    end
  end # class
end