Sfoglia il codice sorgente

Merge branch 'applicationclass'

tags/v0.4.5
Adrian Short 6 anni fa
parent
commit
2cac593c56
5 ha cambiato i file con 110 aggiunte e 73 eliminazioni
  1. +1
    -0
      lib/uk_planning_scraper.rb
  2. +46
    -0
      lib/uk_planning_scraper/application.rb
  3. +11
    -6
      lib/uk_planning_scraper/authority.rb
  4. +40
    -42
      lib/uk_planning_scraper/idox.rb
  5. +12
    -25
      lib/uk_planning_scraper/northgate.rb

+ 1
- 0
lib/uk_planning_scraper.rb Vedi File

@@ -1,5 +1,6 @@
require "uk_planning_scraper/version"
require "uk_planning_scraper/authority"
require "uk_planning_scraper/application"
require 'uk_planning_scraper/idox'
require 'uk_planning_scraper/northgate'
require 'logger'


+ 46
- 0
lib/uk_planning_scraper/application.rb Vedi File

@@ -0,0 +1,46 @@
module UKPlanningScraper
  # A single planning application record.
  #
  # Every field is a plain read/write attribute that starts out unset (nil);
  # this class performs no parsing or type coercion of its own.
  class Application
    attr_accessor :authority_name, :council_reference,
                  :date_received, :date_validated,
                  :status, :scraped_at, :info_url,
                  :address, :description,
                  :documents_count, :documents_url,
                  :alternative_reference,
                  :decision, :date_decision,
                  :appeal_status, :appeal_decision

    # Returns the record as a Hash keyed by symbol.
    #
    # All sixteen keys are always present, in the fixed order listed below;
    # attributes that were never assigned appear with a nil value.
    def to_hash
      ordered_keys = %i[
        scraped_at
        authority_name
        council_reference
        date_received
        date_validated
        status
        decision
        date_decision
        info_url
        address
        description
        documents_count
        documents_url
        alternative_reference
        appeal_status
        appeal_decision
      ]

      # Read the instance variables directly (not via the accessors).
      ordered_keys.each_with_object({}) do |key, hash|
        hash[key] = instance_variable_get("@#{key}")
      end
    end

    # True only when authority_name, council_reference and info_url are all
    # set to something other than nil/false. Empty strings count as set.
    # Always returns exactly true or false.
    def valid?
      [@authority_name, @council_reference, @info_url].all?
    end
  end
end

+ 11
- 6
lib/uk_planning_scraper/authority.rb Vedi File

@@ -9,6 +9,7 @@ module UKPlanningScraper
@name = name
@url = url
@tags = tags
@applications = [] # Application objects
end

def scrape(params, options = {})
@@ -41,19 +42,23 @@ module UKPlanningScraper
# Select which scraper to use
case system
when 'idox'
apps = scrape_idox(params, options)
@applications = scrape_idox(params, options)
when 'northgate'
apps = scrape_northgate(params, options)
@applications = scrape_northgate(params, options)
else
raise SystemNotSupported.new("Planning system not supported for #{@name} at URL: #{@url}")
end
# Post processing
apps.each do |app|
app[:authority_name] = @name
@applications.each do |app|
app.authority_name = @name
end
apps # Single point of successful exit

# Output as an array of hashes
output = []
# FIXME - silently ignores invalid apps. How should we handle them?
@applications.each { |app| output << app.to_hash if app.valid? }
output # Single point of successful exit
end
def tagged?(tag)


+ 40
- 42
lib/uk_planning_scraper/idox.rb Vedi File

@@ -30,14 +30,16 @@ module UKPlanningScraper
date(applicationReceivedEnd)
}.each { |f| form.add_field!(f) unless form.has_field?(f) }

form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
date_format = "%d/%m/%Y"
form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]

form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]
form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

form.send(:"searchCriteria\.description", params[:keywords])
@@ -48,8 +50,7 @@ module UKPlanningScraper
# Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

page = form.submit

if page.search('.errors').inner_text.match(/Too many results found/i)
@@ -63,7 +64,7 @@ module UKPlanningScraper
puts "Found #{items.size} apps on this page."

items.each do |app|
data = {}
data = Application.new

# Parse info line
info_line = app.at("p.metaInfo").inner_text.strip
@@ -71,32 +72,30 @@ module UKPlanningScraper
bits.each do |bit|
if matches = bit.match(/Ref\. No:\s+(.+)/)
data[:council_reference] = matches[1]
data.council_reference = matches[1]
end

if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
data[:date_received] = Date.parse(matches[2])
data.date_received = Date.parse(matches[2])
end
if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
data[:date_validated] = Date.parse(matches[1])
data.date_validated = Date.parse(matches[1])
end

if matches = bit.match(/Status:\s+(.+)/)
data[:status] = matches[1]
data.status = matches[1]
end
end

data.merge!({
scraped_at: Time.now,
info_url: base_url + app.at('a')['href'],
address: app.at('p.address').inner_text.strip,
description: app.at('a').inner_text.strip,
})
data.scraped_at = Time.now
data.info_url = base_url + app.at('a')['href']
data.address = app.at('p.address').inner_text.strip
data.description = app.at('a').inner_text.strip
apps << data
end
# Get the Next button from the pager, if there is one
if next_button = page.at('a.next')
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
@@ -111,28 +110,27 @@ module UKPlanningScraper
# Scrape the summary tab for each app
apps.each_with_index do |app, i|
sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
res = agent.get(app[:info_url])
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
res = agent.get(app.info_url)
if res.code == '200' # That's a String not an Integer, ffs
# Parse the summary tab for this app

app[:scraped_at] = Time.now
app.scraped_at = Time.now

# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
# Bradford has #tab_documents but without the document count on it
app[:documents_count] = 0
app[:documents_url] = nil
app.documents_count = 0

if documents_link = res.at('.associateddocument a')
if documents_link.inner_text.match(/\d+/)
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
app[:documents_url] = base_url + documents_link[:href]
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
elsif documents_link = res.at('#tab_documents')
if documents_link.inner_text.match(/\d+/)
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
app[:documents_url] = base_url + documents_link[:href]
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
end
@@ -145,31 +143,31 @@ module UKPlanningScraper
case key
when 'Reference'
app[:council_reference] = value
app.council_reference = value
when 'Alternative Reference'
app[:alternative_reference] = value
app.alternative_reference = value unless value.empty?
when 'Planning Portal Reference'
app[:alternative_reference] = value
app.alternative_reference = value unless value.empty?
when 'Application Received'
app[:date_received] = Date.parse(value) if value.match(/\d/)
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Registered'
app[:date_received] = Date.parse(value) if value.match(/\d/)
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Validated'
app[:date_validated] = Date.parse(value) if value.match(/\d/)
app.date_validated = Date.parse(value) if value.match(/\d/)
when 'Address'
app[:address] = value
app.address = value unless value.empty?
when 'Proposal'
app[:description] = value
app.description = value unless value.empty?
when 'Status'
app[:status] = value
app.status = value unless value.empty?
when 'Decision'
app[:decision] = value
app.decision = value unless value.empty?
when 'Decision Issued Date'
app[:date_decision] = Date.parse(value) if value.match(/\d/)
app.date_decision = Date.parse(value) if value.match(/\d/)
when 'Appeal Status'
app[:appeal_status] = value
app.appeal_status = value unless value.empty?
when 'Appeal Decision'
app[:appeal_decision] = value
app.appeal_decision = value unless value.empty?
else
puts "Error: key '#{key}' not found"
end # case


+ 12
- 25
lib/uk_planning_scraper/northgate.rb Vedi File

@@ -50,9 +50,6 @@ module UKPlanningScraper
form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
end


# form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']

logger.info "Form variables: #{form_vars.to_s}"

headers = {
@@ -110,29 +107,19 @@ module UKPlanningScraper
rows.each do |row|
if row.at("td") # skip header row which only has th's
cells = row.search("td")
ref = cells[0].inner_text.strip

app = {
scraped_at: Time.now,
# date_scraped: Date.today # FIXME - Planning Alerts compatibility?
}

app[:council_reference] = ref
app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
app[:address] = cells[1].inner_text.strip
app[:description] = cells[2].inner_text.strip
app[:status] = cells[3].inner_text.strip

app = Application.new
app.scraped_at = Time.now
app.council_reference = cells[0].inner_text.strip
app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
app.address = cells[1].inner_text.strip
app.description = cells[2].inner_text.strip
app.status = cells[3].inner_text.strip
raw_date_received = cells[4].inner_text.strip
if raw_date_received != '--'
app[:date_received] = Date.parse(raw_date_received)
else
app[:date_received] = nil
end
app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

apps << app
end
end


Caricamento…
Annulla
Salva