From b9e75b3507523d64c0a14168dbd38d967d6c4781 Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Sat, 15 Sep 2018 14:59:03 +0100 Subject: [PATCH] Fix #3 parsing for more Idox sites. Bump version to 0.1.1 --- lib/uk_planning_scraper.rb | 34 ++++++++++++++++++++++-------- lib/uk_planning_scraper/version.rb | 2 +- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/lib/uk_planning_scraper.rb b/lib/uk_planning_scraper.rb index b80e6dd..784c8a5 100644 --- a/lib/uk_planning_scraper.rb +++ b/lib/uk_planning_scraper.rb @@ -15,8 +15,6 @@ module UKPlanningScraper @base_url = search_url.match(/(https?:\/\/.+?)\//)[1] apps = [] - # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text - meta_regex = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/ agent = Mechanize.new puts "Getting: #{@search_url}" @@ -48,18 +46,36 @@ module UKPlanningScraper puts "Found #{items.size} apps on this page." items.each do |app| - matches = app.at("p.metaInfo").inner_html.match(meta_regex) + data = {} + + # Parse info line + info_line = app.at("p.metaInfo").inner_text.strip + bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } - data = { - council_reference: matches[1].strip, + bits.each do |bit| + if matches = bit.match(/Ref\. No:\s+(.+)/) + data[:council_reference] = matches[1] + end + + if matches = bit.match(/(Received|Registered):\s+(.+)/) + data[:date_received] = Date.parse(matches[2]) + end + + if matches = bit.match(/Validated:\s+(.+)/) + data[:date_validated] = Date.parse(matches[1]) + end + + if matches = bit.match(/Status:\s+(.+)/) + data[:status] = matches[1] + end + end + + data.merge!({ scraped_at: Time.now, - date_received: Date.parse(matches[2]), - date_validated: Date.parse(matches[3]), info_url: @base_url + app.at('a')['href'], address: app.at('p.address').inner_text.strip, description: app.at('a').inner_text.strip, - status: matches[4].strip - } + }) apps << data end diff --git a/lib/uk_planning_scraper/version.rb b/lib/uk_planning_scraper/version.rb index b3a61db..6801bf6 100644 --- a/lib/uk_planning_scraper/version.rb +++ b/lib/uk_planning_scraper/version.rb @@ -1,3 +1,3 @@ module UKPlanningScraper - VERSION = "0.1.0" + VERSION = "0.1.1" end