瀏覽代碼

Merge d6ff621eac into 557678ea7c

pull/38/merge
Graeme Porteous 5 年之前
committed by GitHub
父節點
當前提交
3268cd9a3d
沒有發現已知的金鑰在資料庫的簽署中 GPG Key ID: 4AEE18F83AFDEB23
共有 9 個文件被更改,包括 7459 次插入113 次删除
  1. +1
    -0
      lib/uk_planning_scraper.rb
  2. +21
    -3
      lib/uk_planning_scraper/application.rb
  3. +14
    -14
      lib/uk_planning_scraper/authority.rb
  4. +24
    -14
      lib/uk_planning_scraper/authority_scrape_params.rb
  5. +159
    -80
      lib/uk_planning_scraper/idox.rb
  6. +2
    -2
      lib/uk_planning_scraper/northgate.rb
  7. +30
    -0
      lib/uk_planning_scraper/property.rb
  8. +25
    -0
      spec/property_details_spec.rb
  9. +7183
    -0
      spec/vcr_cassettes/for_2_days_with_property_details.yml

+ 1
- 0
lib/uk_planning_scraper.rb 查看文件

@@ -2,6 +2,7 @@ require "uk_planning_scraper/version"
require "uk_planning_scraper/authority"
require "uk_planning_scraper/authority_scrape_params"
require "uk_planning_scraper/application"
require "uk_planning_scraper/property"
require 'uk_planning_scraper/idox'
require 'uk_planning_scraper/northgate'
require 'logger'


+ 21
- 3
lib/uk_planning_scraper/application.rb 查看文件

@@ -16,9 +16,13 @@ module UKPlanningScraper
attr_accessor :date_decision
attr_accessor :appeal_status
attr_accessor :appeal_decision
attr_accessor :property_count
attr_accessor :property_url
attr_accessor :property_detail_urls
attr_accessor :properties

def to_hash
{
base = {
scraped_at: @scraped_at,
authority_name: @authority_name,
council_reference: @council_reference,
@@ -34,10 +38,24 @@ module UKPlanningScraper
documents_url: @documents_url,
alternative_reference: @alternative_reference,
appeal_status: @appeal_status,
appeal_decision: @appeal_decision
appeal_decision: @appeal_decision,
property_count: @property_count
}

@property_detail_urls.each_with_index do |url, idx|
base["property_detail_url_#{idx + 1}".to_sym] = url
end if @property_detail_urls

@properties.each_with_index do |property, idx|
property_hash = property.to_hash
property_hash.transform_keys! { |k| "property_#{idx + 1}_#{k}".to_sym }

base.merge!(property_hash)
end if @properties

base
end
def valid?
return true if @authority_name && @council_reference && @info_url
false


+ 14
- 14
lib/uk_planning_scraper/authority.rb 查看文件

@@ -3,7 +3,7 @@ require 'csv'
module UKPlanningScraper
class Authority
attr_reader :name, :url
@@authorities = []

def initialize(name, url)
@@ -31,7 +31,7 @@ module UKPlanningScraper
raise SystemNotSupported.new("Planning system not supported for \
#{@name} at URL: #{@url}")
end
# Post processing
@applications.each do |app|
app.authority_name = @name
@@ -41,32 +41,32 @@ module UKPlanningScraper
output = []
# FIXME - silently ignores invalid apps. How should we handle them?
@applications.each { |app| output << app.to_hash if app.valid? }
# Reset so that old params don't get used for new scrapes
clear_scrape_params
output # Single point of successful exit
end
# Returns this authority's tags, sorted.
# The result comes from calling sort on @tags; @tags itself is not reassigned.
def tags
@tags.sort
end
# Add several tags to this authority's existing tags.
#
# Every element is handed to #add_tag in turn, so each one is cleaned up and
# checked for duplicates there rather than here.
#
# @param tags [#each] the tags to add (each element must be acceptable to #add_tag)
# @return whatever +tags.each+ returns (the collection itself for an Array)
def add_tags(tags)
  tags.each do |raw_tag|
    add_tag(raw_tag)
  end
end
# Add a single tag to this authority's existing tags.
#
# The tag is normalised before it is stored: surrounding whitespace is
# stripped, it is downcased, and every space character is removed. A tag whose
# normalised form is already present is ignored.
#
# @param tag [String] the raw tag text
# @return @tags after appending, or nil when the tag was already present
def add_tag(tag)
  normalized = tag.strip.downcase.delete(' ')
  return if tagged?(normalized) # prevent duplicates

  @tags << normalized
end
# Reports whether +tag+ is present in @tags.
# The comparison is made against the stored values as-is; no stripping or
# downcasing is applied to +tag+ here.
def tagged?(tag)
@tags.include?(tag)
end
def system
if @url.match(/search\.do\?action=advanced/i)
'idox'
@@ -84,18 +84,18 @@ module UKPlanningScraper
# Returns the class-level @@authorities collection itself (not a copy).
def self.all
@@authorities
end
# List all the tags in use across every known authority.
#
# Collects each authority's #tags, flattens them into a single list, removes
# duplicates and sorts the result.
#
# @return [Array] the distinct tags, sorted
def self.tags
  per_authority = @@authorities.map { |authority| authority.tags }
  per_authority.flatten.uniq.sort
end
# Look up an authority by its exact name.
#
# @param name the name to compare (with ==) against each authority's #name
# @return the first authority in @@authorities whose name matches
# @raise [AuthorityNotFound] when no authority has that name
def self.named(name)
  match = @@authorities.find { |candidate| name == candidate.name }
  return match unless match.nil?

  raise AuthorityNotFound
end

# Tagged x
@@ -125,11 +125,11 @@ module UKPlanningScraper
CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \
'authorities.csv'), :headers => true) do |line|
auth = Authority.new(line['authority_name'], line['url'])
if line['tags']
auth.add_tags(line['tags'].split(/\s+/))
end
auth.add_tag(auth.system)
@@authorities << auth
end


+ 24
- 14
lib/uk_planning_scraper/authority_scrape_params.rb 查看文件

@@ -4,7 +4,7 @@ module UKPlanningScraper
class Authority
# Parameter methods for Authority#scrape
# Designed to be method chained, eg:
#
#
# applications = UKPlanningScraper::Authority.named("Barnet"). \
# development_type("Q22").keywords("illuminat"). \
# validated_days(30).scrape
@@ -17,7 +17,7 @@ module UKPlanningScraper
unless n > 0
raise ArgumentError.new("validated_days must be greater than 0")
end
validated_from(Date.today - (n - 1))
validated_to(Date.today)
self
@@ -31,7 +31,7 @@ module UKPlanningScraper
unless n > 0
raise ArgumentError.new("received_days must be greater than 0")
end
received_from(Date.today - (n - 1))
received_to(Date.today)
self
@@ -45,18 +45,18 @@ module UKPlanningScraper
unless n > 0
raise ArgumentError.new("decided_days must be greater than 0")
end
decided_from(Date.today - (n - 1))
decided_to(Date.today)
self
end
def applicant_name(s)
unless system == 'idox'
raise NoMethodError.new("applicant_name is only implemented for Idox. \
This authority (#{@name}) is #{system.capitalize}.")
end
check_class(s, String)
@scrape_params[:applicant_name] = s.strip
self
@@ -67,7 +67,7 @@ module UKPlanningScraper
raise NoMethodError.new("application_type is only implemented for \
Idox. This authority (#{@name}) is #{system.capitalize}.")
end
check_class(s, String)
@scrape_params[:application_type] = s.strip
self
@@ -78,14 +78,24 @@ module UKPlanningScraper
raise NoMethodError.new("development_type is only implemented for \
Idox. This authority (#{@name}) is #{system.capitalize}.")
end
check_class(s, String)
@scrape_params[:development_type] = s.strip
self
end

# Ask the next scrape to also collect property details for each application.
#
# Chainable like the other scrape-parameter methods: it records
# :include_property => true in @scrape_params and returns self.
#
# @return [self]
# @raise [NoMethodError] when this authority's system is anything other than 'idox'
def include_property
  if system != 'idox'
    message = "include_property is only implemented for Idox. This authority (#{@name}) is #{system.capitalize}."
    raise NoMethodError.new(message)
  end

  @scrape_params[:include_property] = true
  self
end

private
# Handle the simple params with this
def method_missing(method_name, *args)
sc_params = {
@@ -97,18 +107,18 @@ module UKPlanningScraper
decided_to: Date,
keywords: String
}
value = args[0]
if sc_params[method_name]
check_class(value, sc_params[method_name], method_name.to_s)
value.strip! if value.class == String
if value.class == Date && value > Date.today
raise ArgumentError.new("#{method_name} can't be a date in the " + \
"future (#{value.to_s})")
end
@scrape_params[method_name] = value
self
else
@@ -119,7 +129,7 @@ module UKPlanningScraper
# Discards every accumulated scrape parameter by replacing @scrape_params
# with a fresh empty hash.
def clear_scrape_params
@scrape_params = {}
end
# https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
def check_class(
param_value,


+ 159
- 80
lib/uk_planning_scraper/idox.rb 查看文件

@@ -4,15 +4,32 @@ require 'pp'
module UKPlanningScraper
class Authority
private

# Scheme-and-host prefix of @url (everything before the first "/" that
# follows "http://" or "https://"), used to turn relative hrefs into full URLs.
#
# The value is computed once and memoised in @base_url.
#
# @return [String]
# @raise [NoMethodError] if @url does not match the expected pattern
def base_url
  @base_url ||= begin
    origin = @url.match(%r{(https?://.+?)/})
    origin[1]
  end
end

# Returns the Mechanize instance used for HTTP requests.
# It is created on first use and memoised in @agent, so later calls on this
# object get the same instance back.
def agent
@agent ||= Mechanize.new
end

# Fetch +url+ through #agent, logging the request to stdout.
#
# @param url the address to fetch
# @yield [response] the fetched page, only when the status code is '200'
# @return the block's result when a block is given, the response itself when
#   no block is given, or nil (after printing an error line) for any status
#   other than '200'
def get(url, &block)
  puts "Getting: #{url}"
  response = agent.get(url)

  # The status code is compared as a String, not an Integer.
  unless response.code == '200'
    puts "Error: HTTP #{response.code}"
    return nil
  end

  return response unless block_given?

  block.call(response)
end

def scrape_idox(params, options)
puts "Using Idox scraper."
base_url = @url.match(/(https?:\/\/.+?)\//)[1]

apps = []

agent = Mechanize.new
puts "Getting: #{@url}"
page = agent.get(@url) # load the search form page
page = get(@url) # load the search form page

# Check that the search form is actually present.
# When Idox has an internal error it returns an error page with HTTP 200.
@@ -31,7 +48,7 @@ module UKPlanningScraper
}.each { |f| form.add_field!(f) unless form.has_field?(f) }

date_format = "%d/%m/%Y"
form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

@@ -42,12 +59,12 @@ module UKPlanningScraper
form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

form.send(:"searchCriteria\.description", params[:keywords])
# Some councils don't have the applicant name on their form, eg Bexley
form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
# Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

@@ -56,7 +73,7 @@ module UKPlanningScraper
if page.search('.errors').inner_text.match(/Too many results found/i)
raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
end
loop do
# Parse search results
items = page.search('li.searchresult')
@@ -69,7 +86,7 @@ module UKPlanningScraper
# Parse info line
info_line = app.at("p.metaInfo").inner_text.strip
bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
bits.each do |bit|
if matches = bit.match(/Ref\. No:\s+(.+)/)
data.council_reference = matches[1]
@@ -78,7 +95,7 @@ module UKPlanningScraper
if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
data.date_received = Date.parse(matches[2])
end
if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
data.date_validated = Date.parse(matches[1])
end
@@ -92,91 +109,153 @@ module UKPlanningScraper
data.info_url = base_url + app.at('a')['href']
data.address = app.at('p.address').inner_text.strip
data.description = app.at('a').inner_text.strip
apps << data
end
# Get the Next button from the pager, if there is one
if next_button = page.at('a.next')
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
sleep options[:delay]
puts "Getting: #{next_url}"
page = agent.get(next_url)
page = get(next_url)
else
break
end
end
# Scrape the summary tab for each app
apps.each_with_index do |app, i|
sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
res = agent.get(app.info_url)
if res.code == '200' # That's a String not an Integer, ffs
# Parse the summary tab for this app

app.scraped_at = Time.now

# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
# Bradford has #tab_documents but without the document count on it
app.documents_count = 0

if documents_link = res.at('.associateddocument a')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
elsif documents_link = res.at('#tab_documents')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
puts "#{i + 1} of #{apps.size}"

parse_info_url(app) if app.info_url

next unless params[:include_property]
parse_property_url(app) if app.property_url
parse_property_detail_urls(app) if app.property_detail_urls
end # scrape summary tab for apps
apps
end # scrape_idox

# Fetch an application's summary page and copy its details onto +app+.
#
# On a successful fetch this always sets app.scraped_at and
# app.documents_count, then fills in whatever the page offers: the documents
# link, the labelled rows of #simpleDetailsTable, and the associated-property
# link (property_url / property_count).
#
# @param app the application being populated; must respond to info_url and to
#   the setters used below (presumably a UKPlanningScraper::Application — confirm)
# @return whatever #get returns for this block: the property count when an
#   associated-property link is present, otherwise nil
def parse_info_url(app)
  get(app.info_url) do |page|
    app.scraped_at = Time.now

    # The Documents tab doesn't show if there are no documents (we get
    # li.nodocuments instead), and Bradford has #tab_documents but without the
    # document count on it. Start from zero and only overwrite when the link
    # text actually contains a number.
    app.documents_count = 0
    docs_link = page.at('.associateddocument a') || page.at('#tab_documents')
    docs_total = docs_link && docs_link.inner_text.match(/\d+/)
    if docs_total
      app.documents_count = docs_total[0].to_i
      app.documents_url = base_url + docs_link[:href]
    end

    # Values are located by their th label: row positions change from site to
    # site (or even app to app), so they can't be relied on.
    page.search('#simpleDetailsTable tr').each do |row|
      label = row.at('th').inner_text.strip
      text = row.at('td').inner_text.strip

      case label
      when 'Reference'
        app.council_reference = text
      when 'Alternative Reference', 'Planning Portal Reference'
        app.alternative_reference = text unless text.empty?
      when 'Application Received', 'Application Registered'
        # Only attempt to parse when the cell contains at least one digit.
        app.date_received = Date.parse(text) if text.match(/\d/)
      when 'Application Validated'
        app.date_validated = Date.parse(text) if text.match(/\d/)
      when 'Address'
        app.address = text unless text.empty?
      when 'Proposal'
        app.description = text unless text.empty?
      when 'Status'
        app.status = text unless text.empty?
      when 'Decision'
        app.decision = text unless text.empty?
      when 'Decision Issued Date'
        app.date_decision = Date.parse(text) if text.match(/\d/)
      when 'Appeal Status'
        app.appeal_status = text unless text.empty?
      when 'Appeal Decision'
        app.appeal_decision = text unless text.empty?
      else
        puts "Error: key '#{label}' not found"
      end
    end

    # Find the associated property link; its text is read as the property count.
    property_link = page.at('p.associatedproperty a')
    if property_link
      app.property_url = base_url + property_link[:href]
      app.property_count = property_link.inner_text.to_i
    end
  end
end

# Collect the URLs of an application's property detail pages.
#
# Resets app.property_detail_urls to an empty array, fetches
# app.property_url, and appends base_url + href for each "#Property li a"
# link on that page. Only the first ten links are used.
#
# @param app the application being populated; must respond to property_url
#   and property_detail_urls / property_detail_urls=
# @return whatever #get returns for this block (callers in this file ignore it)
def parse_property_url(app)
  app.property_detail_urls = []

  get(app.property_url) do |page|
    property_links = page.search('#Property li a')
    property_links.each_with_index do |link, position|
      break unless position < 10 # stop after the tenth link

      app.property_detail_urls << base_url + link[:href]
    end
  end
end

def parse_property_detail_urls(app)
# get property details
app.properties = []

res.search('#simpleDetailsTable tr').each do |row|
app.property_detail_urls.each do |property_url|
get(property_url) do |res|
property = Property.new

res.search('#propertyAddress tr').each do |row|
key = row.at('th').inner_text.strip
value = row.at('td').inner_text.strip
case key
when 'Reference'
app.council_reference = value
when 'Alternative Reference'
app.alternative_reference = value unless value.empty?
when 'Planning Portal Reference'
app.alternative_reference = value unless value.empty?
when 'Application Received'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Registered'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Validated'
app.date_validated = Date.parse(value) if value.match(/\d/)
when 'Address'
app.address = value unless value.empty?
when 'Proposal'
app.description = value unless value.empty?
when 'Status'
app.status = value unless value.empty?
when 'Decision'
app.decision = value unless value.empty?
when 'Decision Issued Date'
app.date_decision = Date.parse(value) if value.match(/\d/)
when 'Appeal Status'
app.appeal_status = value unless value.empty?
when 'Appeal Decision'
app.appeal_decision = value unless value.empty?
else
puts "Error: key '#{key}' not found"
end # case
end # each row
else
puts "Error: HTTP #{res.code}"
end # if
end # scrape summary tab for apps
apps
end # scrape_idox
when 'UPRN:'
property.uprn = value
when 'Full Address:'
property.address = value unless value.empty?
when 'Property Number:'
property.number = value unless value.empty?
when 'Street:'
property.street = value unless value.empty?
when 'Town:'
property.town = value unless value.empty?
when 'Postcode:'
property.postcode = value unless value.empty?
when 'Ward:'
property.ward = value unless value.empty?
when 'Parish:'
property.parish = value unless value.empty?
end
end

app.properties << property
end
end
end
end # class
end

+ 2
- 2
lib/uk_planning_scraper/northgate.rb 查看文件

@@ -8,10 +8,10 @@ module UKPlanningScraper
def scrape_northgate(params, options)
puts "Using Northgate scraper."
base_url = @url.match(/(https?:\/\/.+?)\//)[1]
# Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
generic_url = @url.match(/.+\//)[0] + 'Generic/'
apps = []

$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.


+ 30
- 0
lib/uk_planning_scraper/property.rb 查看文件

@@ -0,0 +1,30 @@
module UKPlanningScraper
  # Plain data holder for the details of a single property: UPRN, full
  # address, and the individual address parts (number, street, town, postcode,
  # ward, parish). Every attribute is readable and writable and starts as nil.
  class Property
    attr_accessor :uprn, :address, :number, :street,
                  :town, :postcode, :ward, :parish

    # Snapshot of every attribute as a Hash with symbol keys, in the fixed
    # order uprn, address, number, street, town, postcode, ward, parish.
    # Attributes that were never set appear with a nil value.
    #
    # @return [Hash{Symbol => Object}]
    def to_hash
      %i[uprn address number street town postcode ward parish].each_with_object({}) do |field, snapshot|
        snapshot[field] = instance_variable_get("@#{field}")
      end
    end

    # A property counts as valid as soon as it has a UPRN.
    #
    # @return [Boolean] true when @uprn is set to a truthy value, false otherwise
    def valid?
      @uprn ? true : false
    end
  end
end

+ 25
- 0
spec/property_details_spec.rb 查看文件

@@ -0,0 +1,25 @@
require 'spec_helper'

# Smoke spec for Authority#include_property.
#
# The scrape runs inside a VCR cassette named after the enclosing example
# group's description, so HTTP traffic is replayed rather than sent live.
# NOTE(review): the example makes no expectations; it only pretty-prints the
# scraped applications, so it fails only if the scrape itself raises.
describe UKPlanningScraper::Authority do
  describe '#include_property' do
    let(:scraper) { UKPlanningScraper::Authority.named(authority_name) }

    context 'for 2 days with property details' do
      let(:authority_name) { 'Cardiff' }

      it 'returns apps' do
        cassette_name = "#{self.class.description}"

        apps = VCR.use_cassette(cassette_name) do
          scraper.include_property
                 .decided_from(Date.new(2019, 4, 8))
                 .decided_to(Date.new(2019, 4, 9))
                 .scrape(delay: 0)
        end

        pp apps
      end
    end
  end
end

+ 7183
- 0
spec/vcr_cassettes/for_2_days_with_property_details.yml
文件差異過大導致無法顯示
查看文件


Loading…
取消
儲存