From 2e100bc3c88964a08ef05691887a07cbc1d60e82 Mon Sep 17 00:00:00 2001
From: Adrian Short
Date: Thu, 20 Sep 2018 20:47:29 +0100
Subject: [PATCH] Make scrapers private instance methods in Authority

---
 lib/uk_planning_scraper/authority.rb |   4 +-
 lib/uk_planning_scraper/idox.rb      | 311 ++++++++++++++-------------
 lib/uk_planning_scraper/northgate.rb | 231 ++++++++++----------
 3 files changed, 276 insertions(+), 270 deletions(-)

diff --git a/lib/uk_planning_scraper/authority.rb b/lib/uk_planning_scraper/authority.rb
index 0ff6396..cde35e9 100644
--- a/lib/uk_planning_scraper/authority.rb
+++ b/lib/uk_planning_scraper/authority.rb
@@ -40,9 +40,9 @@ module UKPlanningScraper
 
     # Select which scraper to use based on the URL
     if @url.match(/search\.do\?action=advanced/i)
-      apps = UKPlanningScraper.scrape_idox(@url, params, options)
+      apps = scrape_idox(params, options)
     elsif @url.match(/generalsearch\.aspx/i)
-      apps = UKPlanningScraper.scrape_northgate(@url, params, options)
+      apps = scrape_northgate(params, options)
     else # Not supported
       raise SystemNotSupportedError.new("Planning system not supported for #{@name} at URL: #{@url}")
     end

diff --git a/lib/uk_planning_scraper/idox.rb b/lib/uk_planning_scraper/idox.rb
index 9810624..85d8048 100644
--- a/lib/uk_planning_scraper/idox.rb
+++ b/lib/uk_planning_scraper/idox.rb
@@ -2,170 +2,173 @@ require 'mechanize'
 require 'pp'
 
 module UKPlanningScraper
-  def self.scrape_idox(search_url, params, options)
-    puts "Using Idox scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
-
-    apps = []
-
-    agent = Mechanize.new
-    puts "Getting: #{search_url}"
-    page = agent.get(search_url) # load the search form page
-
-    # Check that the search form is actually present.
-    # When Idox has an internal error it returns an error page with HTTP 200.
-    unless form = page.form('searchCriteriaForm')
-      puts "Error: Search form page failed to load due to Idox internal error."
-      return []
-    end
-    # form.action = form.action + '&searchCriteria.resultsPerPage=100'
-
-    # Fill out and submit search form
-
-    # Some councils don't have the received from/to dates on their form, eg Newham
-    form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
-    form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
-
-    form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
-    form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
-
-    form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
-    form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]
-
-    form.send(:"searchCriteria\.description", params[:keywords])
-
-    # Some councils don't have the applicant name on their form, eg Bexley
-    form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
-
-    form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
-    # Some Idox sites (eg Bolton) call this 'searchCriteria.developmentType'
-    form.send(:"searchCriteria\.developmentType", params[:application_type]) if form.has_field? 'searchCriteria.developmentType'
-
-
-    page = form.submit
-
-    loop do
-      # Parse search results
-      items = page.search('li.searchresult')
-
-      puts "Found #{items.size} apps on this page."
-
-      items.each do |app|
-        data = {}
-
-        # Parse info line
-        info_line = app.at("p.metaInfo").inner_text.strip
-        bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
-
-        bits.each do |bit|
-          if matches = bit.match(/Ref\. No:\s+(.+)/)
-            data[:council_reference] = matches[1]
-          end
-
-          if matches = bit.match(/(Received|Registered):\s+(.+)/)
-            data[:date_received] = Date.parse(matches[2])
-          end
-
-          if matches = bit.match(/Validated:\s+(.+)/)
-            data[:date_validated] = Date.parse(matches[1])
-          end
-
-          if matches = bit.match(/Status:\s+(.+)/)
-            data[:status] = matches[1]
-          end
-        end
-
-        data.merge!({
-          scraped_at: Time.now,
-          info_url: base_url + app.at('a')['href'],
-          address: app.at('p.address').inner_text.strip,
-          description: app.at('a').inner_text.strip,
-        })
-
-        apps << data
-      end
-
-      # Get the Next button from the pager, if there is one
-      if next_button = page.at('a.next')
-        next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
-        sleep options[:delay]
-        puts "Getting: #{next_url}"
-        page = agent.get(next_url)
-      else
-        break
-      end
-    end
-
-    # Scrape the summary tab for each app
-    apps.each_with_index do |app, i|
-      sleep options[:delay]
-      puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
-      res = agent.get(app[:info_url])
-
-      if res.code == '200' # That's a String not an Integer, ffs
-        # Parse the summary tab for this app
-
-        app[:scraped_at] = Time.now
-
-        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
-        # Bradford has #tab_documents but without the document count on it
-        app[:documents_count] = 0
-        app[:documents_url] = nil
-
-        if documents_link = res.at('.associateddocument a')
-          if documents_link.inner_text.match(/\d+/)
-            app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
-            app[:documents_url] = base_url + documents_link[:href]
-          end
-        elsif documents_link = res.at('#tab_documents')
-          if documents_link.inner_text.match(/\d+/)
-            app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
-            app[:documents_url] = base_url + documents_link[:href]
-          end
-        end
-
-        # We need to find values in the table by using the th labels.
-        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
-
-        res.search('#simpleDetailsTable tr').each do |row|
-          key = row.at('th').inner_text.strip
-          value = row.at('td').inner_text.strip
-
-          case key
-          when 'Reference'
-            app[:council_reference] = value
-          when 'Alternative Reference'
-            app[:alternative_reference] = value
-          when 'Planning Portal Reference'
-            app[:alternative_reference] = value
-          when 'Application Received'
-            app[:date_received] = Date.parse(value) if value.match(/\d/)
-          when 'Application Registered'
-            app[:date_received] = Date.parse(value) if value.match(/\d/)
-          when 'Application Validated'
-            app[:date_validated] = Date.parse(value) if value.match(/\d/)
-          when 'Address'
-            app[:address] = value
-          when 'Proposal'
-            app[:description] = value
-          when 'Status'
-            app[:status] = value
-          when 'Decision'
-            app[:decision] = value
-          when 'Decision Issued Date'
-            app[:date_decision] = Date.parse(value) if value.match(/\d/)
-          when 'Appeal Status'
-            app[:appeal_status] = value
-          when 'Appeal Decision'
-            app[:appeal_decision] = value
-          else
-            puts "Error: key '#{key}' not found"
-          end # case
-        end # each row
-      else
-        puts "Error: HTTP #{res.code}"
-      end # if
-    end # scrape summary tab for apps
-    apps
-  end # scrape_idox
+  class Authority
+    private
+    def scrape_idox(params, options)
+      puts "Using Idox scraper."
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
+
+      apps = []
+
+      agent = Mechanize.new
+      puts "Getting: #{@url}"
+      page = agent.get(@url) # load the search form page
+
+      # Check that the search form is actually present.
+      # When Idox has an internal error it returns an error page with HTTP 200.
+      unless form = page.form('searchCriteriaForm')
+        puts "Error: Search form page failed to load due to Idox internal error."
+        return []
+      end
+      # form.action = form.action + '&searchCriteria.resultsPerPage=100'
+
+      # Fill out and submit search form
+
+      # Some councils don't have the received from/to dates on their form, eg Newham
+      form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
+      form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
+
+      form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
+      form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
+
+      form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
+      form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]
+
+      form.send(:"searchCriteria\.description", params[:keywords])
+
+      # Some councils don't have the applicant name on their form, eg Bexley
+      form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
+
+      form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
+      # Some Idox sites (eg Bolton) call this 'searchCriteria.developmentType'
+      form.send(:"searchCriteria\.developmentType", params[:application_type]) if form.has_field? 'searchCriteria.developmentType'
+
+
+      page = form.submit
+
+      loop do
+        # Parse search results
+        items = page.search('li.searchresult')
+
+        puts "Found #{items.size} apps on this page."
+
+        items.each do |app|
+          data = {}
+
+          # Parse info line
+          info_line = app.at("p.metaInfo").inner_text.strip
+          bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
+
+          bits.each do |bit|
+            if matches = bit.match(/Ref\. No:\s+(.+)/)
+              data[:council_reference] = matches[1]
+            end
+
+            if matches = bit.match(/(Received|Registered):\s+(.+)/)
+              data[:date_received] = Date.parse(matches[2])
+            end
+
+            if matches = bit.match(/Validated:\s+(.+)/)
+              data[:date_validated] = Date.parse(matches[1])
+            end
+
+            if matches = bit.match(/Status:\s+(.+)/)
+              data[:status] = matches[1]
+            end
+          end
+
+          data.merge!({
+            scraped_at: Time.now,
+            info_url: base_url + app.at('a')['href'],
+            address: app.at('p.address').inner_text.strip,
+            description: app.at('a').inner_text.strip,
+          })
+
+          apps << data
+        end
+
+        # Get the Next button from the pager, if there is one
+        if next_button = page.at('a.next')
+          next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
+          sleep options[:delay]
+          puts "Getting: #{next_url}"
+          page = agent.get(next_url)
+        else
+          break
+        end
+      end
+
+      # Scrape the summary tab for each app
+      apps.each_with_index do |app, i|
+        sleep options[:delay]
+        puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
+        res = agent.get(app[:info_url])
+
+        if res.code == '200' # That's a String not an Integer, ffs
+          # Parse the summary tab for this app
+
+          app[:scraped_at] = Time.now
+
+          # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
+          # Bradford has #tab_documents but without the document count on it
+          app[:documents_count] = 0
+          app[:documents_url] = nil
+
+          if documents_link = res.at('.associateddocument a')
+            if documents_link.inner_text.match(/\d+/)
+              app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
+              app[:documents_url] = base_url + documents_link[:href]
+            end
+          elsif documents_link = res.at('#tab_documents')
+            if documents_link.inner_text.match(/\d+/)
+              app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
+              app[:documents_url] = base_url + documents_link[:href]
+            end
+          end
+
+          # We need to find values in the table by using the th labels.
+          # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
+
+          res.search('#simpleDetailsTable tr').each do |row|
+            key = row.at('th').inner_text.strip
+            value = row.at('td').inner_text.strip
+
+            case key
+            when 'Reference'
+              app[:council_reference] = value
+            when 'Alternative Reference'
+              app[:alternative_reference] = value
+            when 'Planning Portal Reference'
+              app[:alternative_reference] = value
+            when 'Application Received'
+              app[:date_received] = Date.parse(value) if value.match(/\d/)
+            when 'Application Registered'
+              app[:date_received] = Date.parse(value) if value.match(/\d/)
+            when 'Application Validated'
+              app[:date_validated] = Date.parse(value) if value.match(/\d/)
+            when 'Address'
+              app[:address] = value
+            when 'Proposal'
+              app[:description] = value
+            when 'Status'
+              app[:status] = value
+            when 'Decision'
+              app[:decision] = value
+            when 'Decision Issued Date'
+              app[:date_decision] = Date.parse(value) if value.match(/\d/)
+            when 'Appeal Status'
+              app[:appeal_status] = value
+            when 'Appeal Decision'
+              app[:appeal_decision] = value
+            else
+              puts "Error: key '#{key}' not found"
+            end # case
+          end # each row
+        else
+          puts "Error: HTTP #{res.code}"
+        end # if
+      end # scrape summary tab for apps
+      apps
+    end # scrape_idox
+  end # class
 end

diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb
index 6d44236..56692a5 100644
--- a/lib/uk_planning_scraper/northgate.rb
+++ b/lib/uk_planning_scraper/northgate.rb
@@ -3,137 +3,140 @@ require 'nokogiri'
 require 'logger'
 
 module UKPlanningScraper
-  def self.scrape_northgate(search_url, params, options)
-    puts "Using Northgate scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
-
-    # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
-    generic_url = search_url.match(/.+\//)[0] + 'Generic/'
-
-    apps = []
-
-    $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
-    logger = Logger.new($stdout)
-    logger.level = Logger::DEBUG
-
-    date_regex = /\d{2}-\d{2}-\d{4}/
-
-    form_vars = {
-      'csbtnSearch' => 'Search' # required
-    }
-
-    form_vars['txtProposal'] = params[:keywords]
-
-    # Date received from and to
-    if params[:received_from] || params[:received_to]
-      form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
-      form_vars['rbGroup'] = 'rbRange'
-      form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
-      form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
-    end
-
-    # Date validated from and to
-    if params[:validated_from] || params[:validated_to]
-      form_vars['cboSelectDateValue'] = 'DATE_VALID'
-      form_vars['rbGroup'] = 'rbRange'
-      form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
-      form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
-    end
-
-    # Date decided from and to
-    if params[:decided_from] || params[:decided_to]
-      form_vars['cboSelectDateValue'] = 'DATE_DECISION'
-      form_vars['rbGroup'] = 'rbRange'
-      form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
-      form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
-    end
-
-
-    # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']
-
-    logger.info "Form variables: #{form_vars.to_s}"
-
-    headers = {
-      'Origin' => base_url,
-      'Referer' => search_url,
-    }
-
-    logger.debug "HTTP request headers:"
-    logger.debug(headers.to_s)
-
-    logger.debug "GET: " + search_url
-    response = HTTP.headers(headers).get(search_url)
-    logger.debug "Response code: HTTP " + response.code.to_s
-
-    if response.code == 200
-      doc = Nokogiri::HTML(response.to_s)
-      asp_vars = {
-        '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
-        '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
-      }
-    else
-      logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
-      exit 1
-    end
-
-    cookies = {}
-    response.cookies.each { |c| cookies[c.name] = c.value }
-
-    form_vars.merge!(asp_vars)
-
-    logger.debug "POST: " + search_url
-    response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars)
-    logger.debug "Response code: HTTP " + response2.code.to_s
-
-    if response2.code == 302
-      # Follow the redirect manually
-      # Set the page size (PS) to max so we don't have to page through search results
-      logger.debug "Location: #{response2.headers['Location']}"
-      # exit
-      results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
-
-      logger.debug "GET: " + results_url
-      response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
-      logger.debug "Response code: HTTP " + response3.code.to_s
-      doc = Nokogiri::HTML(response3.to_s)
-    else
-      logger.fatal "Didn't get redirected from search. Exiting."
-      exit 1
-    end
-
-    rows = doc.search("table.display_table tr")
-    logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row
-
-    # Iterate over search results
-    rows.each do |row|
-      if row.at("td") # skip header row which only has th's
-        cells = row.search("td")
-        ref = cells[0].inner_text.strip
-
-        app = {
-          scraped_at: Time.now,
-          # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
-        }
-
-        app[:council_reference] = ref
-        app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
-        app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
-        app[:address] = cells[1].inner_text.strip
-        app[:description] = cells[2].inner_text.strip
-        app[:status] = cells[3].inner_text.strip
-
-        raw_date_received = cells[4].inner_text.strip
-
-        if raw_date_received != '--'
-          app[:date_received] = Date.parse(raw_date_received)
-        else
-          app[:date_received] = nil
-        end
-
-        app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
-        apps << app
-      end
-    end
-    apps
-  end
+  class Authority
+    private
+    def scrape_northgate(params, options)
+      puts "Using Northgate scraper."
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
+
+      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
+      generic_url = @url.match(/.+\//)[0] + 'Generic/'
+
+      apps = []
+
+      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
+      logger = Logger.new($stdout)
+      logger.level = Logger::DEBUG
+
+      date_regex = /\d{2}-\d{2}-\d{4}/
+
+      form_vars = {
+        'csbtnSearch' => 'Search' # required
+      }
+
+      form_vars['txtProposal'] = params[:keywords]
+
+      # Date received from and to
+      if params[:received_from] || params[:received_to]
+        form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
+        form_vars['rbGroup'] = 'rbRange'
+        form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
+        form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
+      end
+
+      # Date validated from and to
+      if params[:validated_from] || params[:validated_to]
+        form_vars['cboSelectDateValue'] = 'DATE_VALID'
+        form_vars['rbGroup'] = 'rbRange'
+        form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
+        form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
+      end
+
+      # Date decided from and to
+      if params[:decided_from] || params[:decided_to]
+        form_vars['cboSelectDateValue'] = 'DATE_DECISION'
+        form_vars['rbGroup'] = 'rbRange'
+        form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
+        form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
+      end
+
+
+      # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']
+
+      logger.info "Form variables: #{form_vars.to_s}"
+
+      headers = {
+        'Origin' => base_url,
+        'Referer' => @url,
+      }
+
+      logger.debug "HTTP request headers:"
+      logger.debug(headers.to_s)
+
+      logger.debug "GET: " + @url
+      response = HTTP.headers(headers).get(@url)
+      logger.debug "Response code: HTTP " + response.code.to_s
+
+      if response.code == 200
+        doc = Nokogiri::HTML(response.to_s)
+        asp_vars = {
+          '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
+          '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
+        }
+      else
+        logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
+        exit 1
+      end
+
+      cookies = {}
+      response.cookies.each { |c| cookies[c.name] = c.value }
+
+      form_vars.merge!(asp_vars)
+
+      logger.debug "POST: " + @url
+      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
+      logger.debug "Response code: HTTP " + response2.code.to_s
+
+      if response2.code == 302
+        # Follow the redirect manually
+        # Set the page size (PS) to max so we don't have to page through search results
+        logger.debug "Location: #{response2.headers['Location']}"
+        # exit
+        results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
+
+        logger.debug "GET: " + results_url
+        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
+        logger.debug "Response code: HTTP " + response3.code.to_s
+        doc = Nokogiri::HTML(response3.to_s)
+      else
+        logger.fatal "Didn't get redirected from search. Exiting."
+        exit 1
+      end
+
+      rows = doc.search("table.display_table tr")
+      logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row
+
+      # Iterate over search results
+      rows.each do |row|
+        if row.at("td") # skip header row which only has th's
+          cells = row.search("td")
+          ref = cells[0].inner_text.strip
+
+          app = {
+            scraped_at: Time.now,
+            # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
+          }
+
+          app[:council_reference] = ref
+          app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
+          app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
+          app[:address] = cells[1].inner_text.strip
+          app[:description] = cells[2].inner_text.strip
+          app[:status] = cells[3].inner_text.strip
+
+          raw_date_received = cells[4].inner_text.strip
+
+          if raw_date_received != '--'
+            app[:date_received] = Date.parse(raw_date_received)
+          else
+            app[:date_received] = nil
+          end
+
+          app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
+          apps << app
+        end
+      end
+      apps
+    end
+  end
 end