Parcourir la source

Scrape all valid apps for the last 12 * 30 days

master
Adrian Short il y a 7 ans
Parent
révision
ee8d8e7f09
1 fichiers modifiés avec 71 ajouts et 38 suppressions
  1. +71
    -38
      scraper.rb

+ 71
- 38
scraper.rb Voir le fichier

@@ -1,55 +1,88 @@
# Scraper for the planning applications published at maps.kingston.gov.uk,
# written to run on morph.io (https://morph.io)

require 'bundler'
Bundler.setup
require 'scraperwiki'
require 'mechanize'
require 'pp'
require 'time'
require 'date'
require 'active_support/all'

# Use the column names from planningalerts.org.au:
# https://www.planningalerts.org.au/how_to_write_a_scraper

# Root of the planning pages on maps.kingston.gov.uk. Search URLs are built
# from it and some relative links found on result pages are prefixed with it.
#
# The one-off fetch of a fixed Jan-Feb 2017 search and the inline
# `apps.each do |app|` loop that used to follow this constant are gone: the
# loop was never closed (the file could not be parsed) and the windowed
# fetch further down, which creates its own Mechanize agent, replaces both.
BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/"
# Parse and save a single planning application
# Parse a single planning application and save it with ScraperWiki.
#
# Exactly one row is written per call, keyed on 'council_reference'. The
# earlier instance-variable bookkeeping (@url, @description, ...) and the
# second save keyed on "id" have been removed: that save used @id/@title,
# which this method never sets.
#
# @param app [#at, #search] node for one application; it is queried with the
#   selectors "h4", "a", "span" and "dd".
# @return [Object] whatever ScraperWiki.save_sqlite returns
# @raise [NoMethodError] if the h4 text is not "<reference> - <type>", or if
#   fewer than five span elements are present
# @raise [ArgumentError] if a matched decision or closing date is not a real
#   calendar date
def parse(app)
  record = {}
  record['title'] = app.at("h4").inner_text
  title_parts = record['title'].match(/(\d+\/\d+\/\w+)\s+-\s+(.+)/)
  record['council_reference'] = title_parts[1]
  record['type'] = title_parts[2]

  app.search("a").each do |link|
    href = link['href']
    # Anchors without an href cannot be classified; skip them instead of
    # failing on nil.
    next unless href

    href = href.strip
    # Details and comment links get BASEURL prepended; map and image links
    # are stored exactly as found.
    record['info_url'] = BASEURL + href if href.match(/Details/)
    record['map_url'] = href if href.match(/\?map=/)
    record['images_url'] = href if href.match(/ImageMenu/)
    record['comment_url'] = BASEURL + href if href.match(/PlanningComments/)
  end

  spans = app.search("span")
  record['description'] = spans[0].inner_text
  record['address'] = spans[1].inner_text
  record['ward'] = spans[2].inner_text

  # Decision and decision date: only recorded when the fifth span reads
  # "<decision> <d/m/yyyy>"
  decision = spans[4].inner_text.match(/(.+?)\s+(\d{1,2}\/\d{1,2}\/\d{4})/)
  if decision
    record['decision'] = decision[1]
    record['date_decision'] = Date.parse(decision[2])
  end

  # Comments/consultation - consultation end date can change during lifetime of application
  app.search("dd").each do |dd|
    closing = dd.inner_text.match(/The current closing date for comments on this application is (\d{1,2}-[A-Z][a-z]{2}-\d{4})/)
    record['on_notice_to'] = Date.parse(closing[1]) if closing
  end

  # Date valid: keep the raw text when it is not a parseable date
  valid_text = spans[3].inner_text
  begin
    record['date_valid'] = Date.parse(valid_text)
    record['date_valid_text'] = nil
  rescue ArgumentError
    record['date_valid'] = nil
    record['date_valid_text'] = valid_text
  end

  # Scraper timestamps
  record['updated_at'] = Time.now
  record['date_scraped'] = Date.today.to_s

  ScraperWiki.save_sqlite(['council_reference'], record)
end

# Get all valid applications for the last 12 * 30 days, one 30-day search
# window per request, starting with the window that ends today.
scraper = Mechanize.new
# NOTE(review): TLS certificate verification is switched off for this agent.
scraper.verify_mode = OpenSSL::SSL::VERIFY_NONE

window_end = Date.today

12.times do
  # A window covers window_end - 29 days up to window_end, inclusive.
  range_from = (window_end - 29.days).strftime("%d/%m/%Y")
  range_to = window_end.strftime("%d/%m/%Y")
  url = "#{BASEURL}Summary?weekListType=SRCH&recFrom=#{range_from}&recTo=#{range_to}&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=500"
  puts url

  listing = scraper.get(url)
  results = listing.search("#planningApplication")
  puts results.size, ''

  results.each do |result|
    parse(result)
  end

  # Step back to the previous window and pause before the next request.
  window_end -= 30.days
  sleep 5
end

# page = Nokogiri::HTML(open("page.html"))

Chargement…
Annuler
Enregistrer