
Make scrapers private instance methods in Authority

tags/v0.4.5
Adrian Short committed 5 years ago
commit 2e100bc3c8
3 changed files with 276 additions and 270 deletions
  1. lib/uk_planning_scraper/authority.rb  +2 -2
  2. lib/uk_planning_scraper/idox.rb  +157 -154
  3. lib/uk_planning_scraper/northgate.rb  +117 -114
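
The change is purely structural: each scraper moves from a module-level function that takes the search URL as an argument to a private instance method on Authority that reads @url from the instance. A minimal sketch of the before and after shapes (simplified; the full methods appear in the diffs below):

# Before: module functions, with the URL passed in explicitly.
module UKPlanningScraper
  def self.scrape_idox(search_url, params, options)
    # ... uses search_url ...
  end
end

# After: private instance methods on Authority, reading @url from the instance.
module UKPlanningScraper
  class Authority
    private
    def scrape_idox(params, options)
      # ... uses @url ...
    end
  end
end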

+2 -2  lib/uk_planning_scraper/authority.rb

@@ -40,9 +40,9 @@ module UKPlanningScraper
       # Select which scraper to use based on the URL
       if @url.match(/search\.do\?action=advanced/i)
-        apps = UKPlanningScraper.scrape_idox(@url, params, options)
+        apps = scrape_idox(params, options)
       elsif @url.match(/generalsearch\.aspx/i)
-        apps = UKPlanningScraper.scrape_northgate(@url, params, options)
+        apps = scrape_northgate(params, options)
       else
         # Not supported
         raise SystemNotSupportedError.new("Planning system not supported for #{@name} at URL: #{@url}")
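
With the scrapers private, external callers reach them only through the public scrape method, which dispatches on the search URL as shown above. A hypothetical usage sketch (Authority.named and the exact params/options hashes are illustrative assumptions, not part of this diff):

# Hypothetical caller: the lookup helper and hash keys are assumptions.
authority = UKPlanningScraper::Authority.named("Westminster")
# scrape inspects @url and calls the now-private scrape_idox or
# scrape_northgate; calling either directly raises NoMethodError.
apps = authority.scrape({ keywords: "extension" }, { delay: 10 })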


+157 -154  lib/uk_planning_scraper/idox.rb (whitespace-only indentation changes hidden)

@@ -2,170 +2,173 @@ require 'mechanize'
 require 'pp'
 
 module UKPlanningScraper
-  def self.scrape_idox(search_url, params, options)
+  class Authority
+    private
+    def scrape_idox(params, options)
       puts "Using Idox scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
       apps = []
 
       agent = Mechanize.new
-    puts "Getting: #{search_url}"
-    page = agent.get(search_url) # load the search form page
+      puts "Getting: #{@url}"
+      page = agent.get(@url) # load the search form page
 
       # Check that the search form is actually present.
       # When Idox has an internal error it returns an error page with HTTP 200.
       unless form = page.form('searchCriteriaForm')
         puts "Error: Search form page failed to load due to Idox internal error."
         return []
       end
       # form.action = form.action + '&searchCriteria.resultsPerPage=100'
 
       # Fill out and submit search form
 
       # Some councils don't have the received from/to dates on their form, eg Newham
       form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
       form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
 
       form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
       form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
 
       form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
       form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]
 
       form.send(:"searchCriteria\.description", params[:keywords])
       # Some councils don't have the applicant name on their form, eg Bexley
       form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
       form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
       # Some Idox sites (eg Bolton) call this 'searchCriteria.developmentType'
       form.send(:"searchCriteria\.developmentType", params[:application_type]) if form.has_field? 'searchCriteria.developmentType'
       page = form.submit
 
       loop do
         # Parse search results
         items = page.search('li.searchresult')
 
         puts "Found #{items.size} apps on this page."
 
         items.each do |app|
           data = {}
 
           # Parse info line
           info_line = app.at("p.metaInfo").inner_text.strip
           bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
           bits.each do |bit|
             if matches = bit.match(/Ref\. No:\s+(.+)/)
               data[:council_reference] = matches[1]
             end
 
             if matches = bit.match(/(Received|Registered):\s+(.+)/)
               data[:date_received] = Date.parse(matches[2])
             end
             if matches = bit.match(/Validated:\s+(.+)/)
               data[:date_validated] = Date.parse(matches[1])
             end
 
             if matches = bit.match(/Status:\s+(.+)/)
               data[:status] = matches[1]
             end
           end
 
           data.merge!({
             scraped_at: Time.now,
             info_url: base_url + app.at('a')['href'],
             address: app.at('p.address').inner_text.strip,
             description: app.at('a').inner_text.strip,
           })
           apps << data
         end
 
         # Get the Next button from the pager, if there is one
         if next_button = page.at('a.next')
           next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
           sleep options[:delay]
           puts "Getting: #{next_url}"
           page = agent.get(next_url)
         else
           break
         end
       end
       # Scrape the summary tab for each app
       apps.each_with_index do |app, i|
         sleep options[:delay]
         puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
         res = agent.get(app[:info_url])
         if res.code == '200' # That's a String not an Integer, ffs
           # Parse the summary tab for this app
 
           app[:scraped_at] = Time.now
 
           # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
           # Bradford has #tab_documents but without the document count on it
           app[:documents_count] = 0
           app[:documents_url] = nil
 
           if documents_link = res.at('.associateddocument a')
             if documents_link.inner_text.match(/\d+/)
               app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
               app[:documents_url] = base_url + documents_link[:href]
             end
           elsif documents_link = res.at('#tab_documents')
             if documents_link.inner_text.match(/\d+/)
               app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
               app[:documents_url] = base_url + documents_link[:href]
             end
           end
 
           # We need to find values in the table by using the th labels.
           # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
 
           res.search('#simpleDetailsTable tr').each do |row|
             key = row.at('th').inner_text.strip
             value = row.at('td').inner_text.strip
             case key
               when 'Reference'
                 app[:council_reference] = value
               when 'Alternative Reference'
                 app[:alternative_reference] = value
               when 'Planning Portal Reference'
                 app[:alternative_reference] = value
               when 'Application Received'
                 app[:date_received] = Date.parse(value) if value.match(/\d/)
               when 'Application Registered'
                 app[:date_received] = Date.parse(value) if value.match(/\d/)
               when 'Application Validated'
                 app[:date_validated] = Date.parse(value) if value.match(/\d/)
               when 'Address'
                 app[:address] = value
               when 'Proposal'
                 app[:description] = value
               when 'Status'
                 app[:status] = value
               when 'Decision'
                 app[:decision] = value
               when 'Decision Issued Date'
                 app[:date_decision] = Date.parse(value) if value.match(/\d/)
               when 'Appeal Status'
                 app[:appeal_status] = value
               when 'Appeal Decision'
                 app[:appeal_decision] = value
               else
                 puts "Error: key '#{key}' not found"
             end # case
           end # each row
         else
           puts "Error: HTTP #{res.code}"
         end # if
       end # scrape summary tab for apps
       apps
     end # scrape_idox
+  end # class
 end
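
One practical consequence of making scrape_idox private: it can no longer be called from outside the class, including from a console or a spec. If you do need to poke at it directly, Ruby's send bypasses the access check. A minimal sketch (authority is an Authority instance as in the earlier sketch; the params and options hashes are illustrative):

# Illustrative only: send sidesteps private visibility, which is handy
# for debugging but shouldn't appear in production call sites.
authority.send(:scrape_idox,
  { decided_from: Date.today - 30, decided_to: Date.today },
  { delay: 5 })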

+117 -114  lib/uk_planning_scraper/northgate.rb (whitespace-only indentation changes hidden)

@@ -3,137 +3,140 @@ require 'nokogiri'
 require 'logger'
 
 module UKPlanningScraper
-  def self.scrape_northgate(search_url, params, options)
+  class Authority
+    private
+    def scrape_northgate(params, options)
       puts "Using Northgate scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
       # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
-    generic_url = search_url.match(/.+\//)[0] + 'Generic/'
+      generic_url = @url.match(/.+\//)[0] + 'Generic/'
       apps = []
 
       $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
       logger = Logger.new($stdout)
       logger.level = Logger::DEBUG
 
       date_regex = /\d{2}-\d{2}-\d{4}/
 
       form_vars = {
         'csbtnSearch' => 'Search' # required
       }
 
       form_vars['txtProposal'] = params[:keywords]
 
       # Date received from and to
       if params[:received_from] || params[:received_to]
         form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
       end
 
       # Date validated from and to
       if params[:validated_from] || params[:validated_to]
         form_vars['cboSelectDateValue'] = 'DATE_VALID'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
       end
 
       # Date decided from and to
       if params[:decided_from] || params[:decided_to]
         form_vars['cboSelectDateValue'] = 'DATE_DECISION'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
       end
 
       # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']
 
       logger.info "Form variables: #{form_vars.to_s}"
 
       headers = {
         'Origin' => base_url,
-      'Referer' => search_url,
+        'Referer' => @url,
       }
 
       logger.debug "HTTP request headers:"
       logger.debug(headers.to_s)
 
-    logger.debug "GET: " + search_url
-    response = HTTP.headers(headers).get(search_url)
+      logger.debug "GET: " + @url
+      response = HTTP.headers(headers).get(@url)
       logger.debug "Response code: HTTP " + response.code.to_s
 
       if response.code == 200
         doc = Nokogiri::HTML(response.to_s)
         asp_vars = {
           '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
           '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
         }
       else
         logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
         exit 1
       end
 
       cookies = {}
       response.cookies.each { |c| cookies[c.name] = c.value }
 
       form_vars.merge!(asp_vars)
 
-    logger.debug "POST: " + search_url
-    response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars)
+      logger.debug "POST: " + @url
+      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
       logger.debug "Response code: HTTP " + response2.code.to_s
 
       if response2.code == 302
         # Follow the redirect manually
         # Set the page size (PS) to max so we don't have to page through search results
         logger.debug "Location: #{response2.headers['Location']}"
         # exit
         results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
         logger.debug "GET: " + results_url
         response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
         logger.debug "Response code: HTTP " + response3.code.to_s
         doc = Nokogiri::HTML(response3.to_s)
       else
         logger.fatal "Didn't get redirected from search. Exiting."
         exit 1
       end
 
       rows = doc.search("table.display_table tr")
       logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row
 
       # Iterate over search results
       rows.each do |row|
         if row.at("td") # skip header row which only has th's
           cells = row.search("td")
           ref = cells[0].inner_text.strip
 
           app = {
             scraped_at: Time.now,
             # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
           }
 
           app[:council_reference] = ref
           app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
           app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
           app[:address] = cells[1].inner_text.strip
           app[:description] = cells[2].inner_text.strip
           app[:status] = cells[3].inner_text.strip
           raw_date_received = cells[4].inner_text.strip
 
           if raw_date_received != '--'
             app[:date_received] = Date.parse(raw_date_received)
           else
             app[:date_received] = nil
           end
 
           app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
           apps << app
         end
       end
       apps
     end
+  end
 end
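
The Northgate scraper's HTTP exchange above is the standard ASP.NET WebForms pattern: GET the search page to harvest __VIEWSTATE and __EVENTVALIDATION, POST them back with the form fields and session cookies, then follow the 302 redirect by hand. Distilled to a skeleton (a sketch using the same http and nokogiri gems; search_url stands in for @url, and the headers, extra form fields, and error handling of the real method are omitted):

require 'http'
require 'nokogiri'

base_url = search_url[%r{https?://[^/]+}] # scheme + host, as in the method above

# 1. GET the form page and pull out the ASP.NET state fields.
page = HTTP.get(search_url)
doc  = Nokogiri::HTML(page.to_s)
form = {
  'csbtnSearch'       => 'Search',
  '__VIEWSTATE'       => doc.at('#__VIEWSTATE')['value'],
  '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value'],
}

# 2. Carry the session cookies over to the postback.
cookies = {}
page.cookies.each { |c| cookies[c.name] = c.value }

# 3. POST the form; a successful search answers with a 302 to the results page.
res = HTTP.cookies(cookies).post(search_url, form: form)
results = HTTP.cookies(cookies).get(base_url + res.headers['Location']) if res.code == 302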
