소스 검색

Create Application class; scrape into Application objects. #8

tags/v0.4.5
Adrian Short 6 년 전
부모
커밋
1aa0aac9f9
5개의 변경된 파일, 102개의 추가 그리고 67개의 삭제
  1. +1
    -0
      lib/uk_planning_scraper.rb
  2. +46
    -0
      lib/uk_planning_scraper/application.rb
  3. +11
    -6
      lib/uk_planning_scraper/authority.rb
  4. +32
    -36
      lib/uk_planning_scraper/idox.rb
  5. +12
    -25
      lib/uk_planning_scraper/northgate.rb

+ 1
- 0
lib/uk_planning_scraper.rb 파일 보기

@@ -1,5 +1,6 @@
require "uk_planning_scraper/version" require "uk_planning_scraper/version"
require "uk_planning_scraper/authority" require "uk_planning_scraper/authority"
require "uk_planning_scraper/application"
require 'uk_planning_scraper/idox' require 'uk_planning_scraper/idox'
require 'uk_planning_scraper/northgate' require 'uk_planning_scraper/northgate'
require 'logger' require 'logger'


+ 46
- 0
lib/uk_planning_scraper/application.rb 파일 보기

@@ -0,0 +1,46 @@
module UKPlanningScraper
  # Value holder for one scraped planning application.
  #
  # Every attribute is a plain read/write accessor; no initializer is
  # defined, so each one reads as nil until it has been assigned.
  class Application
    # Identity and provenance
    attr_accessor :authority_name, :council_reference, :alternative_reference,
                  :scraped_at, :info_url

    # Key dates
    attr_accessor :date_received, :date_validated, :date_decision

    # Progress and outcome
    attr_accessor :status, :decision, :appeal_status, :appeal_decision

    # Descriptive details and attached documents
    attr_accessor :address, :description, :documents_count, :documents_url

    # Builds a Hash snapshot of the application's attributes.
    #
    # @return [Hash{Symbol => Object}] one entry per attribute, in the fixed
    #   order listed below; attributes that were never set map to nil.
    def to_hash
      ordered_keys = %i[
        scraped_at
        authority_name
        council_reference
        date_received
        date_validated
        status
        decision
        date_decision
        info_url
        address
        description
        documents_count
        documents_url
        alternative_reference
        appeal_status
        appeal_decision
      ]

      # Read the instance variables directly (not through the accessors);
      # an unset instance variable yields nil.
      ordered_keys.each_with_object({}) do |key, snapshot|
        snapshot[key] = instance_variable_get("@#{key}")
      end
    end

    # Tells whether the minimum identifying fields are present.
    #
    # @return [Boolean] true only when authority_name, council_reference and
    #   info_url are all neither nil nor false; otherwise false.
    def valid?
      [@authority_name, @council_reference, @info_url].all?
    end
  end
end

+ 11
- 6
lib/uk_planning_scraper/authority.rb 파일 보기

@@ -9,6 +9,7 @@ module UKPlanningScraper
@name = name @name = name
@url = url @url = url
@tags = tags @tags = tags
@applications = [] # Application objects
end end


def scrape(params, options = {}) def scrape(params, options = {})
@@ -41,19 +42,23 @@ module UKPlanningScraper
# Select which scraper to use # Select which scraper to use
case system case system
when 'idox' when 'idox'
apps = scrape_idox(params, options)
@applications = scrape_idox(params, options)
when 'northgate' when 'northgate'
apps = scrape_northgate(params, options)
@applications = scrape_northgate(params, options)
else else
raise SystemNotSupported.new("Planning system not supported for #{@name} at URL: #{@url}") raise SystemNotSupported.new("Planning system not supported for #{@name} at URL: #{@url}")
end end
# Post processing # Post processing
apps.each do |app|
app[:authority_name] = @name
@applications.each do |app|
app.authority_name = @name
end end
apps # Single point of successful exit

# Output as an array of hashes
output = []
# FIXME - silently ignores invalid apps. How should we handle them?
@applications.each { |app| output << app.to_hash if app.valid? }
output # Single point of successful exit
end end
def tagged?(tag) def tagged?(tag)


+ 32
- 36
lib/uk_planning_scraper/idox.rb 파일 보기

@@ -48,8 +48,7 @@ module UKPlanningScraper
# Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

page = form.submit page = form.submit


if page.search('.errors').inner_text.match(/Too many results found/i) if page.search('.errors').inner_text.match(/Too many results found/i)
@@ -63,7 +62,7 @@ module UKPlanningScraper
puts "Found #{items.size} apps on this page." puts "Found #{items.size} apps on this page."


items.each do |app| items.each do |app|
data = {}
data = Application.new


# Parse info line # Parse info line
info_line = app.at("p.metaInfo").inner_text.strip info_line = app.at("p.metaInfo").inner_text.strip
@@ -71,32 +70,30 @@ module UKPlanningScraper
bits.each do |bit| bits.each do |bit|
if matches = bit.match(/Ref\. No:\s+(.+)/) if matches = bit.match(/Ref\. No:\s+(.+)/)
data[:council_reference] = matches[1]
data.council_reference = matches[1]
end end


if matches = bit.match(/(Received|Registered):\s+(.+)/) if matches = bit.match(/(Received|Registered):\s+(.+)/)
data[:date_received] = Date.parse(matches[2])
data.date_received = Date.parse(matches[2])
end end
if matches = bit.match(/Validated:\s+(.+)/) if matches = bit.match(/Validated:\s+(.+)/)
data[:date_validated] = Date.parse(matches[1])
data.date_validated = Date.parse(matches[1])
end end


if matches = bit.match(/Status:\s+(.+)/) if matches = bit.match(/Status:\s+(.+)/)
data[:status] = matches[1]
data.status = matches[1]
end end
end end


data.merge!({
scraped_at: Time.now,
info_url: base_url + app.at('a')['href'],
address: app.at('p.address').inner_text.strip,
description: app.at('a').inner_text.strip,
})
data.scraped_at = Time.now
data.info_url = base_url + app.at('a')['href']
data.address = app.at('p.address').inner_text.strip
data.description = app.at('a').inner_text.strip
apps << data apps << data
end end
# Get the Next button from the pager, if there is one # Get the Next button from the pager, if there is one
if next_button = page.at('a.next') if next_button = page.at('a.next')
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
@@ -111,28 +108,27 @@ module UKPlanningScraper
# Scrape the summary tab for each app # Scrape the summary tab for each app
apps.each_with_index do |app, i| apps.each_with_index do |app, i|
sleep options[:delay] sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
res = agent.get(app[:info_url])
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
res = agent.get(app.info_url)
if res.code == '200' # That's a String not an Integer, ffs if res.code == '200' # That's a String not an Integer, ffs
# Parse the summary tab for this app # Parse the summary tab for this app


app[:scraped_at] = Time.now
app.scraped_at = Time.now


# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead) # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
# Bradford has #tab_documents but without the document count on it # Bradford has #tab_documents but without the document count on it
app[:documents_count] = 0
app[:documents_url] = nil
app.documents_count = 0


if documents_link = res.at('.associateddocument a') if documents_link = res.at('.associateddocument a')
if documents_link.inner_text.match(/\d+/) if documents_link.inner_text.match(/\d+/)
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
app[:documents_url] = base_url + documents_link[:href]
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end end
elsif documents_link = res.at('#tab_documents') elsif documents_link = res.at('#tab_documents')
if documents_link.inner_text.match(/\d+/) if documents_link.inner_text.match(/\d+/)
app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
app[:documents_url] = base_url + documents_link[:href]
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end end
end end
@@ -145,31 +141,31 @@ module UKPlanningScraper
case key case key
when 'Reference' when 'Reference'
app[:council_reference] = value
app.council_reference = value
when 'Alternative Reference' when 'Alternative Reference'
app[:alternative_reference] = value
app.alternative_reference = value unless value.empty?
when 'Planning Portal Reference' when 'Planning Portal Reference'
app[:alternative_reference] = value
app.alternative_reference = value unless value.empty?
when 'Application Received' when 'Application Received'
app[:date_received] = Date.parse(value) if value.match(/\d/)
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Registered' when 'Application Registered'
app[:date_received] = Date.parse(value) if value.match(/\d/)
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Validated' when 'Application Validated'
app[:date_validated] = Date.parse(value) if value.match(/\d/)
app.date_validated = Date.parse(value) if value.match(/\d/)
when 'Address' when 'Address'
app[:address] = value
app.address = value unless value.empty?
when 'Proposal' when 'Proposal'
app[:description] = value
app.description = value unless value.empty?
when 'Status' when 'Status'
app[:status] = value
app.status = value unless value.empty?
when 'Decision' when 'Decision'
app[:decision] = value
app.decision = value unless value.empty?
when 'Decision Issued Date' when 'Decision Issued Date'
app[:date_decision] = Date.parse(value) if value.match(/\d/)
app.date_decision = Date.parse(value) if value.match(/\d/)
when 'Appeal Status' when 'Appeal Status'
app[:appeal_status] = value
app.appeal_status = value unless value.empty?
when 'Appeal Decision' when 'Appeal Decision'
app[:appeal_decision] = value
app.appeal_decision = value unless value.empty?
else else
puts "Error: key '#{key}' not found" puts "Error: key '#{key}' not found"
end # case end # case


+ 12
- 25
lib/uk_planning_scraper/northgate.rb 파일 보기

@@ -50,9 +50,6 @@ module UKPlanningScraper
form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
end end



# form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']

logger.info "Form variables: #{form_vars.to_s}" logger.info "Form variables: #{form_vars.to_s}"


headers = { headers = {
@@ -110,29 +107,19 @@ module UKPlanningScraper
rows.each do |row| rows.each do |row|
if row.at("td") # skip header row which only has th's if row.at("td") # skip header row which only has th's
cells = row.search("td") cells = row.search("td")
ref = cells[0].inner_text.strip

app = {
scraped_at: Time.now,
# date_scraped: Date.today # FIXME - Planning Alerts compatibility?
}

app[:council_reference] = ref
app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
app[:address] = cells[1].inner_text.strip
app[:description] = cells[2].inner_text.strip
app[:status] = cells[3].inner_text.strip

app = Application.new
app.scraped_at = Time.now
app.council_reference = cells[0].inner_text.strip
app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
app.address = cells[1].inner_text.strip
app.description = cells[2].inner_text.strip
app.status = cells[3].inner_text.strip
raw_date_received = cells[4].inner_text.strip raw_date_received = cells[4].inner_text.strip
if raw_date_received != '--'
app[:date_received] = Date.parse(raw_date_received)
else
app[:date_received] = nil
end
app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

apps << app apps << app
end end
end end


불러오는 중...
취소
저장