@@ -37,7 +37,7 @@ module UKPlanningScraper | |||
appeal_decision: @appeal_decision | |||
} | |||
end | |||
def valid? | |||
return true if @authority_name && @council_reference && @info_url | |||
false | |||
@@ -3,7 +3,7 @@ require 'csv' | |||
module UKPlanningScraper | |||
class Authority | |||
attr_reader :name, :url | |||
@@authorities = [] | |||
def initialize(name, url) | |||
@@ -31,7 +31,7 @@ module UKPlanningScraper | |||
raise SystemNotSupported.new("Planning system not supported for \ | |||
#{@name} at URL: #{@url}") | |||
end | |||
# Post processing | |||
@applications.each do |app| | |||
app.authority_name = @name | |||
@@ -41,32 +41,32 @@ module UKPlanningScraper | |||
output = [] | |||
# FIXME - silently ignores invalid apps. How should we handle them? | |||
@applications.each { |app| output << app.to_hash if app.valid? } | |||
# Reset so that old params don't get used for new scrapes | |||
clear_scrape_params | |||
output # Single point of successful exit | |||
end | |||
# The tags applied to this authority, in alphabetical order.
#
# @return [Array<String>] a new, sorted array (internal state untouched)
def tags
  @tags.sort_by { |tag| tag }
end
# Add multiple tags to existing tags | |||
def add_tags(tags) | |||
tags.each { |t| add_tag(t) } | |||
end | |||
# Add a single tag to existing tags | |||
def add_tag(tag) | |||
clean_tag = tag.strip.downcase.gsub(' ', '') | |||
@tags << clean_tag unless tagged?(clean_tag) # prevent duplicates | |||
end | |||
# Is this authority already tagged with +tag+?
#
# NOTE: the argument is compared as-is; callers are expected to pass an
# already-normalised tag (see add_tag).
#
# @return [Boolean]
def tagged?(tag)
  @tags.member?(tag)
end
def system | |||
if @url.match(/search\.do\?action=advanced/i) | |||
'idox' | |||
@@ -84,18 +84,18 @@ module UKPlanningScraper | |||
# Every Authority registered so far.
#
# NOTE(review): backed by the @@authorities class variable (shared across
# the whole inheritance tree); presumably populated by the CSV loader
# further down this file — confirm nothing else mutates it.
#
# @return [Array<Authority>] the live registry (not a copy)
def self.all
@@authorities
end
# List all the tags in use | |||
def self.tags | |||
tags = [] | |||
@@authorities.each { |a| tags << a.tags } | |||
tags.flatten.uniq.sort | |||
end | |||
# Look up a registered authority by its exact name.
#
# Fixes a stray duplicated `authority` return line (dead code — the
# expression was evaluated twice with no effect).
#
# @param name [String] the authority name (exact, case-sensitive match)
# @return [Authority]
# @raise [AuthorityNotFound] when no registered authority has that name
def self.named(name)
  authority = @@authorities.find { |a| name == a.name }
  raise AuthorityNotFound if authority.nil?
  authority
end
# Tagged x | |||
@@ -125,11 +125,11 @@ module UKPlanningScraper | |||
CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \ | |||
'authorities.csv'), :headers => true) do |line| | |||
auth = Authority.new(line['authority_name'], line['url']) | |||
if line['tags'] | |||
auth.add_tags(line['tags'].split(/\s+/)) | |||
end | |||
auth.add_tag(auth.system) | |||
@@authorities << auth | |||
end | |||
@@ -4,7 +4,7 @@ module UKPlanningScraper | |||
class Authority | |||
# Parameter methods for Authority#scrape | |||
# Desgined to be method chained, eg: | |||
# | |||
# | |||
# applications = UKPlanningScraper::Authority.named("Barnet"). \ | |||
# development_type("Q22").keywords("illuminat"). \ | |||
# validated_days(30).scrape | |||
@@ -17,7 +17,7 @@ module UKPlanningScraper | |||
unless n > 0 | |||
raise ArgumentError.new("validated_days must be greater than 0") | |||
end | |||
validated_from(Date.today - (n - 1)) | |||
validated_to(Date.today) | |||
self | |||
@@ -31,7 +31,7 @@ module UKPlanningScraper | |||
unless n > 0 | |||
raise ArgumentError.new("received_days must be greater than 0") | |||
end | |||
received_from(Date.today - (n - 1)) | |||
received_to(Date.today) | |||
self | |||
@@ -45,18 +45,18 @@ module UKPlanningScraper | |||
unless n > 0 | |||
raise ArgumentError.new("decided_days must be greater than 0") | |||
end | |||
decided_from(Date.today - (n - 1)) | |||
decided_to(Date.today) | |||
self | |||
end | |||
def applicant_name(s) | |||
unless system == 'idox' | |||
raise NoMethodError.new("applicant_name is only implemented for Idox. \ | |||
This authority (#{@name}) is #{system.capitalize}.") | |||
end | |||
check_class(s, String) | |||
@scrape_params[:applicant_name] = s.strip | |||
self | |||
@@ -67,7 +67,7 @@ module UKPlanningScraper | |||
raise NoMethodError.new("application_type is only implemented for \ | |||
Idox. This authority (#{@name}) is #{system.capitalize}.") | |||
end | |||
check_class(s, String) | |||
@scrape_params[:application_type] = s.strip | |||
self | |||
@@ -78,14 +78,14 @@ module UKPlanningScraper | |||
raise NoMethodError.new("development_type is only implemented for \ | |||
Idox. This authority (#{@name}) is #{system.capitalize}.") | |||
end | |||
check_class(s, String) | |||
@scrape_params[:development_type] = s.strip | |||
self | |||
end | |||
private | |||
# Handle the simple params with this | |||
def method_missing(method_name, *args) | |||
sc_params = { | |||
@@ -97,18 +97,18 @@ module UKPlanningScraper | |||
decided_to: Date, | |||
keywords: String | |||
} | |||
value = args[0] | |||
if sc_params[method_name] | |||
check_class(value, sc_params[method_name], method_name.to_s) | |||
value.strip! if value.class == String | |||
if value.class == Date && value > Date.today | |||
raise ArgumentError.new("#{method_name} can't be a date in the " + \ | |||
"future (#{value.to_s})") | |||
end | |||
@scrape_params[method_name] = value | |||
self | |||
else | |||
@@ -119,7 +119,7 @@ module UKPlanningScraper | |||
# Reset the accumulated scrape parameters to an empty hash so that one
# scrape's chained params (keywords, date ranges, etc.) don't leak into
# the next scrape on the same Authority instance.
def clear_scrape_params
@scrape_params = {}
end
# https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method | |||
def check_class( | |||
param_value, | |||
@@ -7,7 +7,7 @@ module UKPlanningScraper | |||
def scrape_idox(params, options) | |||
puts "Using Idox scraper." | |||
base_url = @url.match(/(https?:\/\/.+?)\//)[1] | |||
apps = [] | |||
agent = Mechanize.new | |||
@@ -31,7 +31,7 @@ module UKPlanningScraper | |||
}.each { |f| form.add_field!(f) unless form.has_field?(f) } | |||
date_format = "%d/%m/%Y" | |||
form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from] | |||
form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to] | |||
@@ -42,12 +42,12 @@ module UKPlanningScraper | |||
form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to] | |||
form.send(:"searchCriteria\.description", params[:keywords]) | |||
# Some councils don't have the applicant name on their form, eg Bexley | |||
form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' | |||
form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType' | |||
# Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter | |||
form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' | |||
@@ -56,7 +56,7 @@ module UKPlanningScraper | |||
if page.search('.errors').inner_text.match(/Too many results found/i) | |||
raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.") | |||
end | |||
loop do | |||
# Parse search results | |||
items = page.search('li.searchresult') | |||
@@ -69,7 +69,7 @@ module UKPlanningScraper | |||
# Parse info line | |||
info_line = app.at("p.metaInfo").inner_text.strip | |||
bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } | |||
bits.each do |bit| | |||
if matches = bit.match(/Ref\. No:\s+(.+)/) | |||
data.council_reference = matches[1] | |||
@@ -78,7 +78,7 @@ module UKPlanningScraper | |||
if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | |||
data.date_received = Date.parse(matches[2]) | |||
end | |||
if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | |||
data.date_validated = Date.parse(matches[1]) | |||
end | |||
@@ -92,10 +92,10 @@ module UKPlanningScraper | |||
data.info_url = base_url + app.at('a')['href'] | |||
data.address = app.at('p.address').inner_text.strip | |||
data.description = app.at('a').inner_text.strip | |||
apps << data | |||
end | |||
# Get the Next button from the pager, if there is one | |||
if next_button = page.at('a.next') | |||
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' | |||
@@ -106,13 +106,13 @@ module UKPlanningScraper | |||
break | |||
end | |||
end | |||
# Scrape the summary tab for each app | |||
apps.each_with_index do |app, i| | |||
sleep options[:delay] | |||
puts "#{i + 1} of #{apps.size}: #{app.info_url}" | |||
res = agent.get(app.info_url) | |||
if res.code == '200' # That's a String not an Integer, ffs | |||
# Parse the summary tab for this app | |||
@@ -133,14 +133,14 @@ module UKPlanningScraper | |||
app.documents_url = base_url + documents_link[:href] | |||
end | |||
end | |||
# We need to find values in the table by using the th labels. | |||
# The row indexes/positions change from site to site (or even app to app) so we can't rely on that. | |||
res.search('#simpleDetailsTable tr').each do |row| | |||
key = row.at('th').inner_text.strip | |||
value = row.at('td').inner_text.strip | |||
case key | |||
when 'Reference' | |||
app.council_reference = value | |||
@@ -8,10 +8,10 @@ module UKPlanningScraper | |||
def scrape_northgate(params, options) | |||
puts "Using Northgate scraper." | |||
base_url = @url.match(/(https?:\/\/.+?)\//)[1] | |||
# Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive? | |||
generic_url = @url.match(/.+\//)[0] + 'Generic/' | |||
apps = [] | |||
$stdout.sync = true # Flush output buffer after every write so log messages appear immediately. | |||