Procházet zdrojové kódy

Fix #3 parsing for more Idox sites. Bump version to 0.1.1

tags/v0.4.5
Adrian Short před 6 roky
rodič
revize
b9e75b3507
2 změnil soubory, kde provedl 26 přidání a 10 odebrání
  1. +25
    -9
      lib/uk_planning_scraper.rb
  2. +1
    -1
      lib/uk_planning_scraper/version.rb

+ 25
- 9
lib/uk_planning_scraper.rb Zobrazit soubor

@@ -15,8 +15,6 @@ module UKPlanningScraper
@base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
apps = []
# Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
meta_regex = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

agent = Mechanize.new
puts "Getting: #{@search_url}"
@@ -48,18 +46,36 @@ module UKPlanningScraper
puts "Found #{items.size} apps on this page."

items.each do |app|
matches = app.at("p.metaInfo").inner_html.match(meta_regex)
data = {}

# Parse info line
info_line = app.at("p.metaInfo").inner_text.strip
bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
data = {
council_reference: matches[1].strip,
bits.each do |bit|
if matches = bit.match(/Ref\. No:\s+(.+)/)
data[:council_reference] = matches[1]
end

if matches = bit.match(/(Received|Registered):\s+(.+)/)
data[:date_received] = Date.parse(matches[2])
end
if matches = bit.match(/Validated:\s+(.+)/)
data[:date_validated] = Date.parse(matches[1])
end

if matches = bit.match(/Status:\s+(.+)/)
data[:status] = matches[1]
end
end

data.merge!({
scraped_at: Time.now,
date_received: Date.parse(matches[2]),
date_validated: Date.parse(matches[3]),
info_url: @base_url + app.at('a')['href'],
address: app.at('p.address').inner_text.strip,
description: app.at('a').inner_text.strip,
status: matches[4].strip
}
})
apps << data
end


+ 1
- 1
lib/uk_planning_scraper/version.rb Zobrazit soubor

@@ -1,3 +1,3 @@
module UKPlanningScraper
VERSION = "0.1.0"
VERSION = "0.1.1"
end

Načítá se…
Zrušit
Uložit