From ddc81dd838d16ea6ce679624b2ec6d920a8b0da5 Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Sun, 16 Sep 2018 10:44:23 +0100 Subject: [PATCH] Scrape and parse summary tab for each app --- lib/uk_planning_scraper.rb | 62 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/lib/uk_planning_scraper.rb b/lib/uk_planning_scraper.rb index 784c8a5..af5410b 100644 --- a/lib/uk_planning_scraper.rb +++ b/lib/uk_planning_scraper.rb @@ -90,6 +90,64 @@ module UKPlanningScraper break end end + + # Scrape the summary tab for each app + apps.each_with_index do |app, i| + sleep @options[:delay] + puts "#{i + 1} of #{apps.size}: #{app[:info_url]}" + res = agent.get(app[:info_url]) + + if res.code == '200' # That's a String not an Integer, ffs + # Parse the summary tab for this app + + app[:scraped_at] = Time.now + # Does the Documents tab show if there are no documents? + app[:documents_count] = res.at('#tab_documents').inner_text.match(/\d+/)[0].to_i + app[:documents_url] = @base_url + res.at('#tab_documents')[:href] + + # We need to find values in the table by using the th labels. + # The row indexes/positions change from site to site (or even app to app) so we can't rely on that. + + res.search('#simpleDetailsTable tr').each do |row| + key = row.at('th').inner_text.strip + value = row.at('td').inner_text.strip + + case key + when 'Reference' + app[:council_reference] = value + when 'Alternative Reference' + app[:alternative_reference] = value + when 'Planning Portal Reference' + app[:alternative_reference] = value + when 'Application Received' + app[:date_received] = Date.parse(value) if value != '' + when 'Application Registered' + app[:date_received] = Date.parse(value) if value != '' + when 'Application Validated' + app[:date_validated] = Date.parse(value) if value != '' + when 'Address' + app[:address] = value + when 'Proposal' + app[:description] = value + when 'Status' + app[:status] = value + when 'Decision' + app[:decision] = value + when 'Decision Issued Date' + app[:date_decision] = Date.parse(value) if value != '' + when 'Appeal Status' + app[:appeal_status] = value + when 'Appeal Decision' + app[:appeal_decision] = value + else + puts "Error: key '#{key}' not found" + end # case + end # each row + else + puts "Error: HTTP #{res.code}" + end # if + pp app + end # scrape summary tab for apps apps - end -end + end # self.search +end # module