diff --git a/lib/uk_planning_scraper.rb b/lib/uk_planning_scraper.rb
index 8234580..eef7b9a 100644
--- a/lib/uk_planning_scraper.rb
+++ b/lib/uk_planning_scraper.rb
@@ -1,9 +1,7 @@
 require "uk_planning_scraper/version"
 require 'uk_planning_scraper/idox'
-require 'mechanize'
-require 'time'
+require 'uk_planning_scraper/northgate'
 require 'logger'
-require 'pp'
 
 module UKPlanningScraper
   def self.search(search_url, params, options = {})
@@ -13,9 +11,10 @@ module UKPlanningScraper
     options = default_options.merge(options) # The user-supplied options override the defaults
 
     # Select which scraper to use based on the URL
-    if search_url.match(/search.do\?action=advanced/i)
-      # Idox
+    if search_url.match(/search\.do\?action=advanced/i)
       return self.scrape_idox(search_url, params, options)
+    elsif search_url.match(/generalsearch\.aspx/i)
+      return self.scrape_northgate(search_url, params, options)
     else
       # Not supported
       raise "Planning system not supported for URL: #{search_url}"
diff --git a/lib/uk_planning_scraper/idox.rb b/lib/uk_planning_scraper/idox.rb
index 21ab598..f565b62 100644
--- a/lib/uk_planning_scraper/idox.rb
+++ b/lib/uk_planning_scraper/idox.rb
@@ -1,3 +1,6 @@
+require 'mechanize'
+require 'pp'
+
 module UKPlanningScraper
   def self.scrape_idox(search_url, params, options)
     puts "Using Idox scraper."
diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb
new file mode 100644
index 0000000..5908996
--- /dev/null
+++ b/lib/uk_planning_scraper/northgate.rb
@@ -0,0 +1,133 @@
+require 'http'
+require 'nokogiri'
+require 'logger'
+require 'date' # for Date.parse below; the date library isn't loaded by default
+
+module UKPlanningScraper
+  def self.scrape_northgate(search_url, params, options)
+    puts "Using Northgate scraper."
+    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
+
+    # Strip 'generalsearch.aspx' from the end of the URL and append 'Generic/'. Is this path case sensitive?
+    generic_url = search_url.match(/.+\//)[0] + 'Generic/'
+
+    apps = []
+
+    $stdout.sync = true # Flush output buffer after every write so log messages appear immediately
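+
+    # Overall flow: GET the search page to pick up the ASP.NET session
+    # cookie and form state, POST the search, then follow the redirect
+    # to the results list and parse the results table with Nokogiri.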
+    logger = Logger.new($stdout)
+    logger.level = Logger::DEBUG
+
+    form_vars = {
+      'csbtnSearch' => 'Search' # required
+    }
+
+    form_vars['txtProposal'] = params[:description] if params[:description]
+
+    # Date received from and to
+    if params[:received_from] || params[:received_to]
+      form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
+      form_vars['rbGroup'] = 'rbRange'
+      form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
+      form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
+    end
+
+    # Date validated from and to
+    if params[:validated_from] || params[:validated_to]
+      form_vars['cboSelectDateValue'] = 'DATE_VALID'
+      form_vars['rbGroup'] = 'rbRange'
+      form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
+      form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
+    end
+
+    logger.info "Form variables: #{form_vars}"
+
+    headers = {
+      'Origin' => base_url,
+      'Referer' => search_url,
+    }
+
+    logger.debug "HTTP request headers:"
+    logger.debug(headers.to_s)
+
+    logger.debug "GET: " + search_url
+    response = HTTP.headers(headers).get(search_url)
+    logger.debug "Response code: HTTP " + response.code.to_s
+
+    if response.code == 200
+      doc = Nokogiri::HTML(response.to_s)
+      asp_vars = {
+        '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
+        '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
+        '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
+      }
+    else
+      logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
+      exit 1
+    end
+
+    cookies = {}
+    response.cookies.each { |c| cookies[c.name] = c.value }
+
+    form_vars.merge!(asp_vars)
+
+    logger.debug "POST: " + search_url
+    response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars)
+    logger.debug "Response code: HTTP " + response2.code.to_s
+
+    if response2.code == 302
+      # Follow the redirect manually
+      # Set the page size (PS) to max so we don't have to page through search results
+      logger.debug "Location: #{response2.headers['Location']}"
+      results_url = URI::encode(base_url + response2.headers['Location'].gsub('PS=10', 'PS=99999')) # gsub, not gsub!, which returns nil when nothing matches
+
+      logger.debug "GET: " + results_url
+      response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
+      logger.debug "Response code: HTTP " + response3.code.to_s
+      doc = Nokogiri::HTML(response3.to_s)
+    else
+      logger.fatal "Didn't get redirected from search. Exiting."
+      exit 1
+    end
+
+    rows = doc.search("table.display_table tr")
+    logger.info "Found #{rows.size - 1} applications in search results."
+
+    # Iterate over search results
+    rows.each do |row|
+      if row.at("td") # skip header row which only has th's
+        cells = row.search("td")
+        ref = cells[0].inner_text.strip
+
+        app = {
+          scraped_at: Time.now,
+          # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
+        }
+
+        app[:council_reference] = ref
+        app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
+        app[:info_url].gsub!(/%0./, '') # FIXME. Strips encoded control chars (e.g. %0A) from the URL - how can we prevent them in the first place?
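+        # Remaining columns: 1 = address, 2 = proposal, 3 = status,
+        # 4 = date received ('--' when empty), 5 = decision.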
+        app[:address] = cells[1].inner_text.strip
+        app[:description] = cells[2].inner_text.strip
+        app[:status] = cells[3].inner_text.strip
+
+        raw_date_received = cells[4].inner_text.strip
+
+        if raw_date_received != '--'
+          app[:date_received] = Date.parse(raw_date_received)
+        else
+          app[:date_received] = nil
+        end
+
+        app[:decision] = cells[5].inner_text.strip
+        apps << app
+      end
+    end
+    apps
+  end
+end
diff --git a/uk_planning_scraper.gemspec b/uk_planning_scraper.gemspec
index e8f1913..025e84a 100644
--- a/uk_planning_scraper.gemspec
+++ b/uk_planning_scraper.gemspec
@@ -32,4 +32,6 @@
   spec.add_development_dependency "rake", "~> 10.0"
 
   spec.add_runtime_dependency "mechanize", "~> 2.7"
+  spec.add_runtime_dependency "http"
+  spec.add_runtime_dependency "nokogiri" # required directly by the Northgate scraper
 end
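
Usage sketch (illustrative, not part of the patch): with this change, search()
routes any URL matching 'generalsearch.aspx' to the new Northgate scraper,
while Idox URLs behave as before. A minimal example against the entry point
shown above; the URL is a placeholder for any Northgate Planning Explorer
site, not a specific council:

    require 'uk_planning_scraper'

    apps = UKPlanningScraper.search(
      'https://planning.example.gov.uk/PlanningExplorer/generalsearch.aspx',
      {
        description: 'extension',      # becomes form field txtProposal
        received_from: '2018-08-01',   # sent as a YYYY-MM-DD date range
        received_to: '2018-08-31'
      }
    )

    apps.each do |app|
      puts [app[:council_reference], app[:status], app[:address]].join(' | ')
    end

Each result is a plain hash, so the caller decides how to store or display it.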