|
|
@@ -1,4 +1,5 @@ |
|
|
|
require "uk_planning_scraper/version" |
|
|
|
require 'uk_planning_scraper/idox' |
|
|
|
require 'mechanize' |
|
|
|
require 'time' |
|
|
|
require 'logger' |
|
|
@@ -9,158 +10,15 @@ module UKPlanningScraper |
|
|
|
default_options = { |
|
|
|
delay: 10, |
|
|
|
} |
|
|
|
@options = default_options.merge(options) # The user-supplied options override the defaults |
|
|
|
|
|
|
|
@search_url = search_url |
|
|
|
@base_url = search_url.match(/(https?:\/\/.+?)\//)[1] |
|
|
|
|
|
|
|
apps = [] |
|
|
|
|
|
|
|
agent = Mechanize.new |
|
|
|
puts "Getting: #{@search_url}" |
|
|
|
page = agent.get(@search_url) # load the search form page |
|
|
|
|
|
|
|
options = default_options.merge(options) # The user-supplied options override the defaults |
|
|
|
|
|
|
|
# Fill out and submit search form |
|
|
|
form = page.form('searchCriteriaForm') |
|
|
|
# form.action = form.action + '&searchCriteria.resultsPerPage=100' |
|
|
|
|
|
|
|
# Some councils don't have the received from/to dates on their form, eg Newham |
|
|
|
form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from] |
|
|
|
form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to] |
|
|
|
|
|
|
|
form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from] |
|
|
|
form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to] |
|
|
|
|
|
|
|
form.send(:"searchCriteria\.description", params[:description]) |
|
|
|
|
|
|
|
# Some councils don't have the applicant name on their form, eg Bexley |
|
|
|
form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' |
|
|
|
form.send(:"searchCriteria\.caseType", params[:application_type]) |
|
|
|
page = form.submit |
|
|
|
|
|
|
|
loop do |
|
|
|
# Parse search results |
|
|
|
items = page.search('li.searchresult') |
|
|
|
|
|
|
|
puts "Found #{items.size} apps on this page." |
|
|
|
|
|
|
|
items.each do |app| |
|
|
|
data = {} |
|
|
|
|
|
|
|
# Parse info line |
|
|
|
info_line = app.at("p.metaInfo").inner_text.strip |
|
|
|
bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } |
|
|
|
|
|
|
|
bits.each do |bit| |
|
|
|
if matches = bit.match(/Ref\. No:\s+(.+)/) |
|
|
|
data[:council_reference] = matches[1] |
|
|
|
end |
|
|
|
|
|
|
|
if matches = bit.match(/(Received|Registered):\s+(.+)/) |
|
|
|
data[:date_received] = Date.parse(matches[2]) |
|
|
|
end |
|
|
|
|
|
|
|
if matches = bit.match(/Validated:\s+(.+)/) |
|
|
|
data[:date_validated] = Date.parse(matches[1]) |
|
|
|
end |
|
|
|
|
|
|
|
if matches = bit.match(/Status:\s+(.+)/) |
|
|
|
data[:status] = matches[1] |
|
|
|
end |
|
|
|
end |
|
|
|
|
|
|
|
data.merge!({ |
|
|
|
scraped_at: Time.now, |
|
|
|
info_url: @base_url + app.at('a')['href'], |
|
|
|
address: app.at('p.address').inner_text.strip, |
|
|
|
description: app.at('a').inner_text.strip, |
|
|
|
}) |
|
|
|
|
|
|
|
apps << data |
|
|
|
end |
|
|
|
|
|
|
|
# Get the Next button from the pager, if there is one |
|
|
|
if next_button = page.at('a.next') |
|
|
|
next_url = @base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' |
|
|
|
sleep @options[:delay] |
|
|
|
puts "Getting: #{next_url}" |
|
|
|
page = agent.get(next_url) |
|
|
|
else |
|
|
|
break |
|
|
|
end |
|
|
|
# Select which scraper to use based on the URL |
|
|
|
if search_url.match(/search.do\?action=advanced/i) |
|
|
|
# Idox |
|
|
|
return self.scrape_idox(search_url, params, options) |
|
|
|
else |
|
|
|
# Not supported |
|
|
|
raise "Planning system not supported for URL: #{search_url}" |
|
|
|
end |
|
|
|
|
|
|
|
# Scrape the summary tab for each app |
|
|
|
apps.each_with_index do |app, i| |
|
|
|
sleep @options[:delay] |
|
|
|
puts "#{i + 1} of #{apps.size}: #{app[:info_url]}" |
|
|
|
res = agent.get(app[:info_url]) |
|
|
|
|
|
|
|
if res.code == '200' # That's a String not an Integer, ffs |
|
|
|
# Parse the summary tab for this app |
|
|
|
|
|
|
|
app[:scraped_at] = Time.now |
|
|
|
|
|
|
|
# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead) |
|
|
|
# Bradford has #tab_documents but without the document count on it |
|
|
|
app[:documents_count] = 0 |
|
|
|
app[:documents_url] = nil |
|
|
|
|
|
|
|
if documents_link = res.at('.associateddocument a') |
|
|
|
if documents_link.inner_text.match(/\d+/) |
|
|
|
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i |
|
|
|
app[:documents_url] = @base_url + documents_link[:href] |
|
|
|
end |
|
|
|
elsif documents_link = res.at('#tab_documents') |
|
|
|
if documents_link.inner_text.match(/\d+/) |
|
|
|
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i |
|
|
|
app[:documents_url] = @base_url + documents_link[:href] |
|
|
|
end |
|
|
|
end |
|
|
|
|
|
|
|
# We need to find values in the table by using the th labels. |
|
|
|
# The row indexes/positions change from site to site (or even app to app) so we can't rely on that. |
|
|
|
|
|
|
|
res.search('#simpleDetailsTable tr').each do |row| |
|
|
|
key = row.at('th').inner_text.strip |
|
|
|
value = row.at('td').inner_text.strip |
|
|
|
|
|
|
|
case key |
|
|
|
when 'Reference' |
|
|
|
app[:council_reference] = value |
|
|
|
when 'Alternative Reference' |
|
|
|
app[:alternative_reference] = value |
|
|
|
when 'Planning Portal Reference' |
|
|
|
app[:alternative_reference] = value |
|
|
|
when 'Application Received' |
|
|
|
app[:date_received] = Date.parse(value) if value != '' |
|
|
|
when 'Application Registered' |
|
|
|
app[:date_received] = Date.parse(value) if value != '' |
|
|
|
when 'Application Validated' |
|
|
|
app[:date_validated] = Date.parse(value) if value != '' |
|
|
|
when 'Address' |
|
|
|
app[:address] = value |
|
|
|
when 'Proposal' |
|
|
|
app[:description] = value |
|
|
|
when 'Status' |
|
|
|
app[:status] = value |
|
|
|
when 'Decision' |
|
|
|
app[:decision] = value |
|
|
|
when 'Decision Issued Date' |
|
|
|
app[:date_decision] = Date.parse(value) if value != '' |
|
|
|
when 'Appeal Status' |
|
|
|
app[:appeal_status] = value |
|
|
|
when 'Appeal Decision' |
|
|
|
app[:appeal_decision] = value |
|
|
|
else |
|
|
|
puts "Error: key '#{key}' not found" |
|
|
|
end # case |
|
|
|
end # each row |
|
|
|
else |
|
|
|
puts "Error: HTTP #{res.code}" |
|
|
|
end # if |
|
|
|
end # scrape summary tab for apps |
|
|
|
apps |
|
|
|
end # self.search |
|
|
|
end # module |
|
|
|
end |
|
|
|
end |