diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,9 @@
# Ignore output of scraper
data.sqlite
*.sqlite
*.db
.ruby-*
.DS_Store
*.csv
*.xls*
*.json
.env
*.txt

diff --git a/Gemfile b/Gemfile
@@ -3,8 +3,8 @@
# Find out more: https://morph.io/documentation/ruby
source "https://rubygems.org"
ruby "2.0.0"
ruby "2.3.1"
gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "breasal"
gem "http"
gem "nokogiri"

diff --git a/Gemfile.lock b/Gemfile.lock
@@ -10,38 +10,43 @@ GIT
GEM
  remote: https://rubygems.org/
  specs:
    addressable (2.5.2)
      public_suffix (>= 2.0.2, < 4.0)
    breasal (0.0.1)
    domain_name (0.5.24)
      unf (>= 0.0.5, < 1.0.0)
    http (3.3.0)
      addressable (~> 2.3)
      http-cookie (~> 1.0)
      http-form_data (~> 2.0)
      http_parser.rb (~> 0.6.0)
    http-cookie (1.0.2)
      domain_name (~> 0.5)
    http-form_data (2.1.1)
    http_parser.rb (0.6.0)
    httpclient (2.6.0.1)
    mechanize (2.7.3)
      domain_name (~> 0.5, >= 0.5.1)
      http-cookie (~> 1.0)
      mime-types (~> 2.0)
      net-http-digest_auth (~> 1.1, >= 1.1.1)
      net-http-persistent (~> 2.5, >= 2.5.2)
      nokogiri (~> 1.4)
      ntlm-http (~> 0.1, >= 0.1.1)
      webrobots (>= 0.0.9, < 0.2)
    mime-types (2.5)
    mini_portile (0.6.2)
    net-http-digest_auth (1.4)
    net-http-persistent (2.9.4)
    nokogiri (1.6.6.2)
      mini_portile (~> 0.6.0)
    ntlm-http (0.1.1)
    mini_portile2 (2.3.0)
    nokogiri (1.8.4)
      mini_portile2 (~> 2.3.0)
    public_suffix (3.0.2)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3
    unf (0.1.4)
      unf_ext
    unf_ext (0.0.7.1)
    webrobots (0.1.1)

PLATFORMS
  ruby

DEPENDENCIES
  mechanize
  breasal
  http
  nokogiri
  scraperwiki!

RUBY VERSION
   ruby 2.3.1p112

BUNDLED WITH
   1.16.4

diff --git a/README.md b/README.md
@@ -1 +1,69 @@
This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)

# Merton Council planning applications scraper

This scrapes planning applications data from [Merton Council's planning database website](http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/GeneralSearch.aspx) and puts it in an SQLite database.

Merton Council runs [Northgate Planning Explorer](https://www.northgateps.com).

This scraper is designed to run once per 24 hours.

It runs on [Morph](https://morph.io). To get started, [see the documentation](https://morph.io/documentation).

## Schema

The schema is based on the core elements from [planningalerts.org.au](https://www.planningalerts.org.au/how_to_write_a_scraper).
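
For orientation, here is a minimal sketch of a row in the `applications` table as written by `scraper.rb` (field names come from the code; the values are purely illustrative):

    {
      'council_reference' => '18/P1234',      # unique key used for upserts
      'address'           => '1 Example Road, Mitcham',
      'description'       => 'Single-storey rear extension',
      'status'            => 'Registered',
      'decision'          => nil,
      'date_received'     => '2018-09-01',
      'info_url'          => 'http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/Generic/...',
      'date_scraped'      => '2018-09-02'
    }

Further columns (key dates, case officer, easting/northing and the derived latitude/longitude, document counts and so on) are added by the details, dates and documents passes.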

## Installation

    $ git clone https://github.com/adrianshort/merton-planning-applications.git
    $ cd merton-planning-applications
    $ bundle

### Configuration

Following the principle of _one codebase, many deploys_, this scraper is [configured using environment variables](https://12factor.net/config) rather than by editing constants in the code.

|Name|Purpose|Default|Required?|
|---|---|---|---|
|SCRAPER_DELAY|Minimum delay in seconds between HTTP requests to the server.|10|No|
|SCRAPER_USER_AGENT|User agent string sent as an HTTP request header.|_None_|Yes|
|SCRAPER_LOG_LEVEL|Controls the level of detail in the output logs according to [Ruby's `Logger` class](https://ruby-doc.org/stdlib-2.1.0/libdoc/logger/rdoc/Logger.html) constants.|1 _(Logger::INFO)_|No|
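
For example (the values below are illustrative; `SCRAPER_USER_AGENT` is the only variable you must set):

    $ SCRAPER_USER_AGENT="MertonPlanningScraper/1.0 (+https://example.org/contact)" \
      SCRAPER_DELAY=5 \
      SCRAPER_LOG_LEVEL=0 \
      bundle exec ruby scraper.rb

If you run this on morph.io, these can be set as secret environment variables in the scraper's settings rather than on the command line.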

## Running

    $ bundle exec ruby scraper.rb

## Logging

[Log messages are written unbuffered to `STDOUT`.](https://12factor.net/logs) You can redirect them to a file or to the log drain of your choice:

    $ bundle exec ruby scraper.rb >> log.txt

Morph.io only shows the first 10,000 lines of log output. This constraint doesn't apply when the scraper runs elsewhere, e.g. on your local machine.

## Similar projects

- [maxharlow/scrape-planning-northgate](https://github.com/maxharlow/scrape-planning-northgate) (Node)
- [adrianshort/planningalerts](https://github.com/adrianshort/planningalerts), especially the [Python scrapers for Northgate Planning Explorer](https://github.com/adrianshort/planningalerts/blob/master/python_scrapers/PlanningExplorer.py) (not written by me; just a copy of the PlanningAlerts codebase)

## Tags

- Merton
- Merton Council
- London
- UK
- localgov
- localgovdigital
- opendata
- Morph
- ScraperWiki
- planning
- Planning Alerts
- plantech
- civictech

## Author

By [Adrian Short](https://www.adrianshort.org/).

This project is not by or affiliated with Merton Council.

diff --git a/parser.rb b/parser.rb
@@ -0,0 +1,122 @@
require 'nokogiri'
require 'breasal'
require 'date'
require 'uri'
require 'pp'

def clean_end(s)
  # Removes trailing spaces including Unicode whitespace (eg char 160) from the end of a string
  # Returns nil if the resulting string is empty
  s.strip!
  s.sub!(/\p{Zs}+$/, '')
  return nil if s == ''
  s
end
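
# Illustrative examples (not from the original source) of what clean_end does:
#   clean_end("Granted \u00A0")  #=> "Granted"
#   clean_end(" \u00A0")         #=> nil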

def cleanup(items)
  # Regex doesn't work across multiple text lines by default
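  # Strip HTML entities, the <span> label elements and any tabs/newlines from
  # each node's inner HTML, leaving just the value text.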
  items.map { |i| i.inner_html.strip.gsub(/&.+;/, '').gsub(/<span>.*<\/span>/m, '').gsub(/[\t\r\n]/m, '') }
end

def parse_details(html)
  doc = Nokogiri::HTML(html)
  app = {}

  lists = doc.search("ul.list")

  # First ul is Application Progress Summary
  items = lists[0].search("li div")
  values = cleanup(items)

  app['date_received'] = Date.parse(values[0]) if values[0].match(DATE_REGEX)
  app['status'] = clean_end(values[1])
  app['on_notice_to'] = Date.parse(values[2]) if values[2].match(DATE_REGEX)
  app['recommendation'] = clean_end(values[3])
  app['date_committee'] = Date.parse(values[4]) if values[4].match(DATE_REGEX)
  app['decision'] = clean_end(values[5])
  app['date_appeal_lodged'] = Date.parse(values[6]) if values[6].match(DATE_REGEX) # FIXME Is this actually a date or a Yes/No?
  app['appeal_decision'] = clean_end(values[7])

  # Second ul is Application Details
  items = lists[1].search("li div")
  # Regex doesn't work across multiple text lines by default
  values = items.map { |i| i.inner_html.strip.gsub(/&.+;/m, '') }

  app['council_reference'] = clean_end(items[0].children[2].inner_text)
  app['application_type'] = clean_end(items[2].children[2].inner_text)
  app['applicant_name'] = clean_end(items[5].children[2].inner_text)
  app['agent_name'] = clean_end(items[6].children[2].inner_text)
  app['wards'] = clean_end(items[7].children[2].inner_text)

  en_string = values[8].match(/Easting.+?(\d+).+?Northing.+?(\d+)/)
  app['easting'] = en_string[1].to_i
  app['northing'] = en_string[2].to_i
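
  # Breasal converts British National Grid easting/northing to WGS84 latitude/longitude.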
  en = Breasal::EastingNorthing.new(easting: app['easting'], northing: app['northing'], type: :gb)
  app['latitude'] = en.to_wgs84[:latitude]
  app['longitude'] = en.to_wgs84[:longitude]

  app['appeal_submitted'] = clean_end(items[9].children[2].inner_text)
  app['appeal_decision'] = clean_end(items[10].children[2].inner_text)

  if items[11].children[2].inner_text.match(/\d+/)
    app['case_officer_phone'] = clean_end(items[11].children[2].inner_text.gsub(/[\r\n\t]/, '')).match(/(\d+)/)[1].sub(/^44/, '0')
  end

  app['division'] = clean_end(items[12].children[2].inner_text.gsub('-', ''))
  app['case_officer_name'] = clean_end(items[13].children[2].inner_text)
  app['determination_level'] = clean_end(items[14].children[2].inner_text)
  app['existing_land_use'] = clean_end(items[15].children[2].inner_text)
  app['proposed_land_use'] = clean_end(items[16].children[2].inner_text)

  # Third ul is Other Information Available for Planning Application...
  links = doc.search("a.FooterLinks")
  app['documents_url'] = SITE_URL + links[0]['href'].gsub(/[\r\n\t]/, '')
  app['dates_url'] = URI::encode(BASE_URL + links[1]['href']).gsub(/%0./m, '')
  app['checks_url'] = URI::encode(BASE_URL + links[2]['href']).gsub(/%0./m, '')
  app['meetings_url'] = URI::encode(BASE_URL + links[3]['href']).gsub(/%0./m, '')
  app['constraints_url'] = URI::encode(BASE_URL + links[4]['href']).gsub(/%0./m, '')
  app['site_history_url'] = URI::encode(BASE_URL + links[5]['href']).gsub(/%0./m, '') if links[5]

  app
end

def parse_dates(html)
  doc = Nokogiri::HTML(html)
  app = {}
  dates = []

  doc.search(".dataview ul div").each { |row| dates << row.children[2].inner_text }

  app['date_received'] = Date.parse(dates[0]) if dates[0].match(DATE_REGEX)
  app['date_first_advertised'] = Date.parse(dates[1]) if dates[1].match(DATE_REGEX)
  app['date_registered'] = Date.parse(dates[2]) if dates[2].match(DATE_REGEX)
  app['date_first_site_notice'] = Date.parse(dates[3]) if dates[3].match(DATE_REGEX)
  app['date_valid'] = Date.parse(dates[4]) if dates[4].match(DATE_REGEX)
  app['on_notice_to'] = Date.parse(dates[5]) if dates[5].match(DATE_REGEX)
  app['date_validated'] = Date.parse(dates[6]) if dates[6].match(DATE_REGEX)
  app['target_date'] = Date.parse(dates[7]) if dates[7].match(DATE_REGEX)
  app['stat_cons_expiry_date'] = Date.parse(dates[8]) if dates[8].match(DATE_REGEX)
  app['decision_expiry_date'] = Date.parse(dates[9]) if dates[9].match(DATE_REGEX)
  app['first_consultation_date'] = Date.parse(dates[10]) if dates[10].match(DATE_REGEX)
  app['extended_expiry_date'] = Date.parse(dates[11]) if dates[11].match(DATE_REGEX)

  app
end

def parse_documents(html)
  doc = Nokogiri::HTML(html)
  docs = []

  doc.search("#tblContent td a").each do |d|
    # title = d.inner_text.strip.match(/^[\d\w]+?_\s*(.+?)\.pdf/)[1].gsub('_', ' ')
    docs << {
      'title' => d.inner_text.strip,
      'url' => URI::encode(SITE_URL + d['href']),
      'date_last_seen' => Date.today.to_s
    }
  end

  docs
end

diff --git a/scraper.rb b/scraper.rb
@@ -1,25 +1,233 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".

require 'http'
require 'nokogiri'
require 'uri'
require 'scraperwiki'
require 'pp'
require_relative './parser'
require 'date'
require 'logger'
require 'securerandom'

# Northgate Planning Explorer
SITE_URL = 'http://planning.merton.gov.uk'
BASE_URL = SITE_URL + '/Northgate/PlanningExplorerAA/Generic/'

def crawl_delay
  sleep DELAY_S
end

DELAY_S = (ENV['SCRAPER_DELAY'] || 10).to_f # seconds. Conservatively slow by default. Scrapes approx 360 pages per hour.
USER_AGENT = ENV['SCRAPER_USER_AGENT']
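# DATE_REGEX guards the Date.parse calls in parser.rb: values are only treated
# as dates when they match an NN-NN-NNNN pattern, e.g. 01-09-2018.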
DATE_REGEX = /\d{2}-\d{2}-\d{4}/

$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
logger = Logger.new($stdout)
logger.level = (ENV['SCRAPER_LOG_LEVEL'] || Logger::INFO).to_i
logger.info "Scraper starts. Let's do this."
logger.info "Delay between requests is #{DELAY_S} seconds."
logger.info "User agent is: #{USER_AGENT}"
logger.info "Log level is: #{logger.level}"

# General search
URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'
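
# These form fields drive Planning Explorer's General Search: search by date
# received ('cboSelectDateValue'), restricted to the last day ('rbGroup' and
# 'cboDays'). The commented-out keys are other filters the form accepts.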
form_vars = {
  # 'cboStatusCode' => '4', # REGISTERED
  'cboSelectDateValue' => 'DATE_RECEIVED',
  # 'cboMonths' => '12', # 1..12
  'cboDays' => 1,
  'rbGroup' => 'rbDay',
  'csbtnSearch' => 'Search' # required
}
logger.info "Form variables: #{form_vars.to_s}" | |||||
headers = { | |||||
'Origin' => SITE_URL, | |||||
'Referer' => URL, | |||||
'User-Agent' => USER_AGENT | |||||
} | |||||
logger.debug "HTTP request headers:" | |||||
logger.debug(headers.to_s) | |||||
logger.debug "GET: " + URL | |||||
response = HTTP.headers(headers).get(URL) | |||||
logger.debug "Response code: HTTP " + response.code.to_s | |||||

if response.code == 200
  doc = Nokogiri::HTML(response.to_s)
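  # Planning Explorer is an ASP.NET WebForms app: the hidden __VIEWSTATE,
  # __VIEWSTATEGENERATOR and __EVENTVALIDATION fields from the search page have
  # to be sent back with the POST for the form submission to be accepted.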
  asp_vars = {
    '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
    '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
    '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
  }
else
  logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
  exit 1
end

cookies = {}
response.cookies.each { |c| cookies[c.name] = c.value }

form_vars.merge!(asp_vars)

logger.debug "POST: " + URL
response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
logger.debug "Response code: HTTP " + response2.code.to_s

if response2.code == 302
  # Follow the redirect manually
  # Set the page size (PS) to max so we don't have to page through search results
  results_url = URI::encode(SITE_URL + response2.headers['Location'].gsub('PS=10', 'PS=99999'))
  logger.debug "GET: " + results_url
  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
  logger.debug "Response code: HTTP " + response3.code.to_s
  doc = Nokogiri::HTML(response3.to_s)
else
  logger.fatal "Didn't get redirected from search. Exiting."
  exit 1
end

rows = doc.search("table.display_table tr")
logger.info "Found #{rows.size - 1} applications in search results."

app_defaults = {
  'la_name' => 'Merton Borough Council',
  'la_slug' => 'merton',
  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
  'date_details_scraped' => nil,
  'date_documents_scraped' => nil,
  'date_dates_scraped' => nil
}
logger.debug "Application defaults: "
logger.debug app_defaults.to_s

# Iterate over search results
rows.each do |row|
  if row.at("td") # skip header row, which only has th's
    cells = row.search("td")
    ref = cells[0].inner_text.strip

    app = app_defaults.merge(
      'created_at' => Time.now.to_s,
      'uuid' => SecureRandom.uuid
    )

    begin
      res = ScraperWiki.select("* from applications where council_reference=?", ref)
    rescue # In case the table doesn't exist, which it won't on first run
      true
    end

    app = res[0] if res && res[0] # res will be nil if the table doesn't exist; [] if that record doesn't exist

    app['council_reference'] = ref
    app['info_url'] = URI::encode(BASE_URL + cells[0].at("a")['href'].strip)
    app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
    app['address'] = cells[1].inner_text.strip
    app['description'] = cells[2].inner_text.strip
    app['status'] = cells[3].inner_text.strip

    raw_date_received = cells[4].inner_text.strip
    if raw_date_received != '--'
      app['date_received'] = Date.parse(raw_date_received)
    else
      app['date_received'] = nil
    end

    app['decision'] = cells[5].inner_text.strip
    app['date_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s

    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
end

# Scrape details for all apps that don't have them
apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping details for #{apps.size} applications."

i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping details for app: #{app['council_reference']}."
  crawl_delay

  # Scrape details page
  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])

  if res.code == 200
    # Parse details page
    parsed_details = parse_details(res.to_s)
    app.merge!(parsed_details)
    app['date_details_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['info_url']} - HTTP #{res.code}" # FIXME improve message
  end
end

# Scrape dates page for apps that don't have them
apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping dates for #{apps.size} applications."

i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping dates for #{app['council_reference']}."
  crawl_delay

  # Scrape dates page
  res = HTTP.headers(headers).cookies(cookies).get(app['dates_url'])

  if res.code == 200
    # Parse dates page
    parsed_dates = parse_dates(res.to_s)
    app.merge!(parsed_dates)
    app['date_dates_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['dates_url']} - HTTP #{res.code}" # FIXME improve message
  end
end

# Scrape documents for apps that don't have them
apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping documents for #{apps.size} applications."

i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping documents for #{app['council_reference']}."
  crawl_delay

  # Scrape documents page
  res = HTTP.headers(headers).cookies(cookies).get(app['documents_url'])

  if res.code == 200
    # Parse documents page
    docs = parse_documents(res.to_s)

    docs.each do |d|
      d['council_reference'] = app['council_reference']
      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
    end

    app['documents_qty'] = docs.size
    app['date_documents_scraped'] = Date.today.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['documents_url']} - HTTP #{res.code}" # FIXME improve message
  end
end

logger.info "Scraper finishes. We did it."
logger.close