
Add core Northgate scraper for search results page

tags/v0.4.5
Adrian Short 5 years ago
commit dbfd10b33c
4 files changed with 139 additions and 5 deletions

  1. +4   -5   lib/uk_planning_scraper.rb
  2. +3   -0   lib/uk_planning_scraper/idox.rb
  3. +131 -0   lib/uk_planning_scraper/northgate.rb
  4. +1   -0   uk_planning_scraper.gemspec

+4 -5  lib/uk_planning_scraper.rb

@@ -1,9 +1,7 @@
require "uk_planning_scraper/version"
require 'uk_planning_scraper/idox'
require 'mechanize'
require 'time'
require 'uk_planning_scraper/northgate'
require 'logger'
require 'pp'

module UKPlanningScraper
def self.search(search_url, params, options = {})
@@ -13,9 +11,10 @@ module UKPlanningScraper
     options = default_options.merge(options) # The user-supplied options override the defaults
 
     # Select which scraper to use based on the URL
-    if search_url.match(/search.do\?action=advanced/i)
-      # Idox
+    if search_url.match(/search\.do\?action=advanced/i)
       return self.scrape_idox(search_url, params, options)
+    elsif search_url.match(/generalsearch\.aspx/i)
+      return self.scrape_northgate(search_url, params, options)
     else
       # Not supported
       raise "Planning system not supported for URL: #{search_url}"


+3 -0  lib/uk_planning_scraper/idox.rb

@@ -1,3 +1,6 @@
+require 'mechanize'
+require 'pp'
+
 module UKPlanningScraper
   def self.scrape_idox(search_url, params, options)
     puts "Using Idox scraper."


+131 -0  lib/uk_planning_scraper/northgate.rb

@@ -0,0 +1,131 @@
require 'http'
require 'nokogiri'
require 'logger'

module UKPlanningScraper
  def self.scrape_northgate(search_url, params, options)
    puts "Using Northgate scraper."
    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
    # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
    generic_url = search_url.match(/.+\//)[0] + 'Generic/'
    apps = []

    $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
    logger = Logger.new($stdout)
    logger.level = Logger::DEBUG

    date_regex = /\d{2}-\d{2}-\d{4}/

    form_vars = {
      'csbtnSearch' => 'Search' # required
    }

    form_vars['txtProposal'] = params[:description]

    # Date received from and to
    if params[:received_from] || params[:received_to]
      form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
      form_vars['rbGroup'] = 'rbRange'
      form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
      form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
    end

    # Date validated from and to
    if params[:validated_from] || params[:validated_to]
      form_vars['cboSelectDateValue'] = 'DATE_VALID'
      form_vars['rbGroup'] = 'rbRange'
      form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
      form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
    end
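    # Note: both date blocks write the same cboSelectDateValue, dateStart and
    # dateEnd fields, so if received and validated ranges are both supplied,
    # the validated range silently overwrites the received one.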

    # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']

    logger.info "Form variables: #{form_vars.to_s}"

    headers = {
      'Origin' => base_url,
      'Referer' => search_url,
    }
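    # Assumption: these browser-like headers are sent because some ASP.NET
    # apps reject form posts that don't appear to come from their own search
    # page; they make the request look like an in-browser submission.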

logger.debug "HTTP request headers:"
logger.debug(headers.to_s)

logger.debug "GET: " + search_url
response = HTTP.headers(headers).get(search_url)
logger.debug "Response code: HTTP " + response.code.to_s

if response.code == 200
doc = Nokogiri::HTML(response.to_s)
asp_vars = {
'__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
'__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
'__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
}
else
logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
exit 1
end
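    # __VIEWSTATE, __VIEWSTATEGENERATOR and __EVENTVALIDATION are ASP.NET
    # WebForms hidden fields; the server typically rejects a form post unless
    # they are echoed back as issued, which is why the POST below merges them
    # into the form variables.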

    cookies = {}
    response.cookies.each { |c| cookies[c.name] = c.value }

    form_vars.merge!(asp_vars)

    logger.debug "POST: " + search_url
    response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars)
    logger.debug "Response code: HTTP " + response2.code.to_s

    if response2.code == 302
      # Follow the redirect manually
      # Set the page size (PS) to max so we don't have to page through search results
      logger.debug "Location: #{response2.headers['Location']}"
      results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
      logger.debug "GET: " + results_url
      response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
      logger.debug "Response code: HTTP " + response3.code.to_s
      doc = Nokogiri::HTML(response3.to_s)
    else
      logger.fatal "Didn't get redirected from search. Exiting."
      exit 1
    end

    rows = doc.search("table.display_table tr")
    logger.info "Found #{rows.size - 1} applications in search results."

    # Iterate over search results
    rows.each do |row|
      if row.at("td") # skip header row which only has th's
        cells = row.search("td")
        ref = cells[0].inner_text.strip

        app = {
          scraped_at: Time.now,
          # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
        }

        app[:council_reference] = ref
        app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
        app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
        app[:address] = cells[1].inner_text.strip
        app[:description] = cells[2].inner_text.strip
        app[:status] = cells[3].inner_text.strip
        raw_date_received = cells[4].inner_text.strip
        if raw_date_received != '--'
          app[:date_received] = Date.parse(raw_date_received)
        else
          app[:date_received] = nil
        end
        app[:decision] = cells[5].inner_text.strip
        apps << app
      end
    end
    apps
  end
end
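
With the dispatch change above in place, the new scraper is reached through UKPlanningScraper.search. A minimal usage sketch, assuming a council whose Northgate search page lives at a generalsearch.aspx URL (the URL is illustrative, not a real endpoint):

    require 'date'
    require 'uk_planning_scraper'

    url = 'https://planning.example.gov.uk/PlanningExplorer/generalsearch.aspx'

    # :received_from/:received_to become dateStart/dateEnd (Date#to_s gives YYYY-MM-DD)
    apps = UKPlanningScraper.search(url, {
      received_from: Date.new(2018, 1, 1),
      received_to:   Date.new(2018, 1, 31)
    })

    apps.each do |app|
      puts "#{app[:council_reference]} | #{app[:status]} | #{app[:address]}"
    end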

+1 -0  uk_planning_scraper.gemspec

@@ -32,4 +32,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_runtime_dependency "mechanize", "~> 2.7"
+  spec.add_runtime_dependency "http"
 end
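
The single new line records the runtime dependency behind require 'http' in northgate.rb: the http gem (http.rb), used for its chainable request interface. A minimal sketch of the calls the scraper relies on (URL and cookie value are illustrative):

    require 'http'

    resp = HTTP.headers('Referer' => 'https://example.com/search')
               .cookies('ASP.NET_SessionId' => 'abc123')
               .get('https://example.com/search')

    resp.code    # => Integer status, e.g. 200
    resp.to_s    # => response body as a String
    resp.cookies # => cookies set by the server, reused for the POST above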
