瀏覽代碼

Extract summary scraping into separate method

pull/38/head
Graeme Porteous 5 年之前
父節點
當前提交
1efb7c57d4
共有 1 個文件被更改,包括 75 次插入63 次删除
  1. +75
    -63
      lib/uk_planning_scraper/idox.rb

+ 75
- 63
lib/uk_planning_scraper/idox.rb 查看文件

@@ -4,13 +4,20 @@ require 'pp'
module UKPlanningScraper
class Authority
private

def base_url
@base_url ||= @url.match(/(https?:\/\/.+?)\//)[1]
end

def agent
@agent ||= Mechanize.new
end

def scrape_idox(params, options)
puts "Using Idox scraper."
base_url = @url.match(/(https?:\/\/.+?)\//)[1]

apps = []

agent = Mechanize.new
puts "Getting: #{@url}"
page = agent.get(@url) # load the search form page

@@ -111,72 +118,77 @@ module UKPlanningScraper
apps.each_with_index do |app, i|
sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
res = agent.get(app.info_url)

if res.code == '200' # That's a String not an Integer, ffs
# Parse the summary tab for this app
parse_info_url(app) if app.info_url
end # scrape summary tab for apps
apps
end # scrape_idox

app.scraped_at = Time.now
def parse_info_url(app)
res = agent.get(app.info_url)

# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
# Bradford has #tab_documents but without the document count on it
app.documents_count = 0
if res.code == '200' # That's a String not an Integer, ffs
# Parse the summary tab for this app

if documents_link = res.at('.associateddocument a')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
elsif documents_link = res.at('#tab_documents')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
app.scraped_at = Time.now
# The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
# Bradford has #tab_documents but without the document count on it
app.documents_count = 0
if documents_link = res.at('.associateddocument a')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
elsif documents_link = res.at('#tab_documents')
if documents_link.inner_text.match(/\d+/)
app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
app.documents_url = base_url + documents_link[:href]
end
end

# We need to find values in the table by using the th labels.
# The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

res.search('#simpleDetailsTable tr').each do |row|
key = row.at('th').inner_text.strip
value = row.at('td').inner_text.strip

case key
when 'Reference'
app.council_reference = value
when 'Alternative Reference'
app.alternative_reference = value unless value.empty?
when 'Planning Portal Reference'
app.alternative_reference = value unless value.empty?
when 'Application Received'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Registered'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Validated'
app.date_validated = Date.parse(value) if value.match(/\d/)
when 'Address'
app.address = value unless value.empty?
when 'Proposal'
app.description = value unless value.empty?
when 'Status'
app.status = value unless value.empty?
when 'Decision'
app.decision = value unless value.empty?
when 'Decision Issued Date'
app.date_decision = Date.parse(value) if value.match(/\d/)
when 'Appeal Status'
app.appeal_status = value unless value.empty?
when 'Appeal Decision'
app.appeal_decision = value unless value.empty?
else
puts "Error: key '#{key}' not found"
end # case
end # each row
else
puts "Error: HTTP #{res.code}"
end # if
end # scrape summary tab for apps
apps
end # scrape_idox
# We need to find values in the table by using the th labels.
# The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

res.search('#simpleDetailsTable tr').each do |row|
key = row.at('th').inner_text.strip
value = row.at('td').inner_text.strip

case key
when 'Reference'
app.council_reference = value
when 'Alternative Reference'
app.alternative_reference = value unless value.empty?
when 'Planning Portal Reference'
app.alternative_reference = value unless value.empty?
when 'Application Received'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Registered'
app.date_received = Date.parse(value) if value.match(/\d/)
when 'Application Validated'
app.date_validated = Date.parse(value) if value.match(/\d/)
when 'Address'
app.address = value unless value.empty?
when 'Proposal'
app.description = value unless value.empty?
when 'Status'
app.status = value unless value.empty?
when 'Decision'
app.decision = value unless value.empty?
when 'Decision Issued Date'
app.date_decision = Date.parse(value) if value.match(/\d/)
when 'Appeal Status'
app.appeal_status = value unless value.empty?
when 'Appeal Decision'
app.appeal_decision = value unless value.empty?
else
puts "Error: key '#{key}' not found"
end # case
end # each row
else
puts "Error: HTTP #{res.code}"
end # if
end
end # class
end

Loading…
取消
儲存