From 01b19c45074d0cbe52bc31c37fb8653d05675711 Mon Sep 17 00:00:00 2001
From: Adrian Short
Date: Thu, 6 Sep 2018 15:53:59 +0100
Subject: [PATCH] Add scraper code

---
 .gitignore   |  11 ++-
 Gemfile      |   8 +-
 Gemfile.lock |  41 ++++----
 README.md    |  70 +++++++++++++-
 parser.rb    | 122 ++++++++++++++++++++++++
 scraper.rb   | 258 ++++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 460 insertions(+), 50 deletions(-)
 create mode 100644 parser.rb

diff --git a/.gitignore b/.gitignore
index 66d464d..38bac37 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,9 @@
-# Ignore output of scraper
-data.sqlite
+*.sqlite
+*.db
+.ruby-*
+.DS_Store
+*.csv
+*.xls*
+*.json
+.env
+*.txt
diff --git a/Gemfile b/Gemfile
index 6ab45dc..3784d12 100644
--- a/Gemfile
+++ b/Gemfile
@@ -3,8 +3,8 @@
 # Find out more: https://morph.io/documentation/ruby
 
 source "https://rubygems.org"
-
-ruby "2.0.0"
-
+ruby "2.3.1"
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem "breasal"
+gem "http"
+gem "nokogiri"
diff --git a/Gemfile.lock b/Gemfile.lock
index 30fb5f3..543ff79 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -10,38 +10,43 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    breasal (0.0.1)
     domain_name (0.5.24)
       unf (>= 0.0.5, < 1.0.0)
+    http (3.3.0)
+      addressable (~> 2.3)
+      http-cookie (~> 1.0)
+      http-form_data (~> 2.0)
+      http_parser.rb (~> 0.6.0)
     http-cookie (1.0.2)
       domain_name (~> 0.5)
+    http-form_data (2.1.1)
+    http_parser.rb (0.6.0)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
-      domain_name (~> 0.5, >= 0.5.1)
-      http-cookie (~> 1.0)
-      mime-types (~> 2.0)
-      net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
-      ntlm-http (~> 0.1, >= 0.1.1)
-      webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
-    ntlm-http (0.1.1)
+    mini_portile2 (2.3.0)
+    nokogiri (1.8.4)
+      mini_portile2 (~> 2.3.0)
+    public_suffix (3.0.2)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.1)
-    webrobots (0.1.1)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  mechanize
+  breasal
+  http
+  nokogiri
   scraperwiki!
+
+RUBY VERSION
+   ruby 2.3.1p112
+
+BUNDLED WITH
+   1.16.4
diff --git a/README.md b/README.md
index e541894..5d81107 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,69 @@
-This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
+# Merton Council planning applications scraper
+
+This scrapes planning applications data from [Merton Council's planning database website](http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/GeneralSearch.aspx) and puts it in an SQLite database.
+
+Merton Council runs [Northgate Planning Explorer](https://www.northgateps.com).
+
+This scraper is designed to run once per 24 hours.
+
+It runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation).
+
+## Schema
+
+The schema is based on the core elements from [planningalerts.org.au](https://www.planningalerts.org.au/how_to_write_a_scraper).
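+
+As a rough illustration, a row in the `applications` table ends up with fields along these lines. The field names come from `scraper.rb` and `parser.rb`; the values below are invented examples:
+
+    {
+      'council_reference' => '18/P1234',
+      'address'           => '1 Example Road, Mitcham',
+      'description'       => 'Erection of a single-storey rear extension',
+      'status'            => 'Registered',
+      'date_received'     => '2018-09-01',
+      'date_scraped'      => '2018-09-06',
+      'info_url'          => 'http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/Generic/...',
+      'la_name'           => 'Merton Borough Council',
+      'la_slug'           => 'merton',
+      'la_gss'            => 'E09000024'
+    }
+
+Further columns (applicant and agent names, easting/northing and WGS84 coordinates, appeal fields, key dates and document links) are added as the details, dates and documents pages are scraped. Documents go in a separate `documents` table keyed by council reference and URL.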
+
+## Installation
+
+    $ git clone https://github.com/adrianshort/merton-planning-applications.git
+    $ cd merton-planning-applications
+    $ bundle
+
+### Configuration
+
+According to the principle of _one codebase, many deploys_, this scraper is [configured using environment variables](https://12factor.net/config) rather than by editing constants in the code.
+
+| Name | Purpose | Default | Required? |
+|------|---------|---------|-----------|
+| SCRAPER_DELAY | Minimum delay in seconds between HTTP requests to the server. | 10 | No |
+| SCRAPER_USER_AGENT | User agent string sent as an HTTP request header. | _None_ | Yes |
+| SCRAPER_LOG_LEVEL | Controls the level of detail in the output logs according to [Ruby's `Logger` class](https://ruby-doc.org/stdlib-2.1.0/libdoc/logger/rdoc/Logger.html) constants. | 1 _(Logger::INFO)_ | No |
+
+## Running
+
+    $ bundle exec ruby scraper.rb
+
+## Logging
+
+[Log messages are written unbuffered to `STDOUT`.](https://12factor.net/logs) You can redirect them to a file or the log drain of your choice.
+
+    $ bundle exec ruby scraper.rb >> log.txt
+
+Morph.io will only show the first 10,000 lines of log output. This constraint doesn't apply when running elsewhere, e.g. on your local machine.
+
+## Similar projects
+
+- [maxharlow/scrape-planning-northgate](https://github.com/maxharlow/scrape-planning-northgate) (Node)
+- [adrianshort/planningalerts](https://github.com/adrianshort/planningalerts), especially the [Python scrapers for Northgate Planning Explorer](https://github.com/adrianshort/planningalerts/blob/master/python_scrapers/PlanningExplorer.py) - not written by me; that repo is a copy of the original PlanningAlerts codebase
+
+## Tags
+
+- Merton
+- Merton Council
+- London
+- UK
+- localgov
+- localgovdigital
+- opendata
+- Morph
+- ScraperWiki
+- planning
+- Planning Alerts
+- plantech
+- civictech
+
+## Author
+
+By [Adrian Short](https://www.adrianshort.org/).
+
+This project is not by or affiliated with Merton Council.
diff --git a/parser.rb b/parser.rb
new file mode 100644
index 0000000..ad9612b
--- /dev/null
+++ b/parser.rb
@@ -0,0 +1,122 @@
+require 'nokogiri'
+require 'breasal'
+require 'date'
+require 'uri'
+require 'pp'
+
+def clean_end(s)
+  # Removes trailing spaces, including Unicode whitespace (e.g. char 160), from the end of a string.
+  # Returns nil if the resulting string is empty.
+  s.strip!
+  s.sub!(/\p{Zs}+$/, '')
+  return nil if s == ''
+  s
+end
+
+def cleanup(items)
+  # Regex doesn't work across multiple text lines by default
+  items.map { |i| i.inner_html.strip.gsub(/&.+;/, '').gsub(/.*<\/span>/m, '').gsub(/[\t\r\n]/m, '') }
+end
+
+def parse_details(html)
+  doc = Nokogiri::HTML(html)
+  app = {}
+  lists = doc.search("ul.list")
+
+  # First ul is Application Progress Summary
+  items = lists[0].search("li div")
+  values = cleanup(items)
+
+  app['date_received'] = Date.parse(values[0]) if values[0].match(DATE_REGEX)
+  app['status'] = clean_end(values[1])
+  app['on_notice_to'] = Date.parse(values[2]) if values[2].match(DATE_REGEX)
+  app['recommendation'] = clean_end(values[3])
+  app['date_committee'] = Date.parse(values[4]) if values[4].match(DATE_REGEX)
+  app['decision'] = clean_end(values[5])
+  app['date_appeal_lodged'] = Date.parse(values[6]) if values[6].match(DATE_REGEX) # FIXME Is this actually a date or a Yes/No?
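+  # Note: 'appeal_decision' is set from values[7] just below, but it is overwritten later
+  # by the value parsed from the Application Details list (items[10]).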
+  app['appeal_decision'] = clean_end(values[7])
+
+  # Second ul is Application Details
+
+  items = lists[1].search("li div")
+  # Regex doesn't work across multiple text lines by default
+  values = items.map { |i| i.inner_html.strip.gsub(/&.+;/m, '') }
+
+  app['council_reference'] = clean_end(items[0].children[2].inner_text)
+  app['application_type'] = clean_end(items[2].children[2].inner_text)
+  app['applicant_name'] = clean_end(items[5].children[2].inner_text)
+  app['agent_name'] = clean_end(items[6].children[2].inner_text)
+  app['wards'] = clean_end(items[7].children[2].inner_text)
+
+  en_string = values[8].match(/Easting.+?(\d+).+?Northing.+?(\d+)/)
+  app['easting'] = en_string[1].to_i
+  app['northing'] = en_string[2].to_i
+  en = Breasal::EastingNorthing.new(easting: app['easting'], northing: app['northing'], type: :gb)
+  app['latitude'] = en.to_wgs84[:latitude]
+  app['longitude'] = en.to_wgs84[:longitude]
+
+  app['appeal_submitted'] = clean_end(items[9].children[2].inner_text)
+  app['appeal_decision'] = clean_end(items[10].children[2].inner_text)
+
+  if items[11].children[2].inner_text.match(/\d+/)
+    app['case_officer_phone'] = clean_end(items[11].children[2].inner_text.gsub(/[\r\n\t]/, '')).match(/(\d+)/)[1].sub(/^44/, '0')
+  end
+
+  app['division'] = clean_end(items[12].children[2].inner_text.gsub('-', ''))
+  app['case_officer_name'] = clean_end(items[13].children[2].inner_text)
+  app['determination_level'] = clean_end(items[14].children[2].inner_text)
+  app['existing_land_use'] = clean_end(items[15].children[2].inner_text)
+  app['proposed_land_use'] = clean_end(items[16].children[2].inner_text)
+
+  # Third ul is Other Information Available for Planning Application...
+
+  links = doc.search("a.FooterLinks")
+  app['documents_url'] = SITE_URL + links[0]['href'].gsub(/[\r\n\t]/, '')
+  app['dates_url'] = URI::encode(BASE_URL + links[1]['href']).gsub(/%0./m, '')
+  app['checks_url'] = URI::encode(BASE_URL + links[2]['href']).gsub(/%0./m, '')
+  app['meetings_url'] = URI::encode(BASE_URL + links[3]['href']).gsub(/%0./m, '')
+  app['constraints_url'] = URI::encode(BASE_URL + links[4]['href']).gsub(/%0./m, '')
+  app['site_history_url'] = URI::encode(BASE_URL + links[5]['href']).gsub(/%0./m, '') if links[5]
+
+  app
+end
+
+def parse_dates(html)
+  doc = Nokogiri::HTML(html)
+  app = {}
+  dates = []
+  doc.search(".dataview ul div").each { |row| dates << row.children[2].inner_text }
+
+  app['date_received'] = Date.parse(dates[0]) if dates[0].match(DATE_REGEX)
+  app['date_first_advertised'] = Date.parse(dates[1]) if dates[1].match(DATE_REGEX)
+  app['date_registered'] = Date.parse(dates[2]) if dates[2].match(DATE_REGEX)
+  app['date_first_site_notice'] = Date.parse(dates[3]) if dates[3].match(DATE_REGEX)
+  app['date_valid'] = Date.parse(dates[4]) if dates[4].match(DATE_REGEX)
+  app['on_notice_to'] = Date.parse(dates[5]) if dates[5].match(DATE_REGEX)
+  app['date_validated'] = Date.parse(dates[6]) if dates[6].match(DATE_REGEX)
+  app['target_date'] = Date.parse(dates[7]) if dates[7].match(DATE_REGEX)
+  app['stat_cons_expiry_date'] = Date.parse(dates[8]) if dates[8].match(DATE_REGEX)
+  app['decision_expiry_date'] = Date.parse(dates[9]) if dates[9].match(DATE_REGEX)
+  app['first_consultation_date'] = Date.parse(dates[10]) if dates[10].match(DATE_REGEX)
+  app['extended_expiry_date'] = Date.parse(dates[11]) if dates[11].match(DATE_REGEX)
+
+  app
+end
+
+def parse_documents(html)
+  doc = Nokogiri::HTML(html)
+  docs = []
+
+  doc.search("#tblContent td a").each do |d|
+    # title = d.inner_text.strip.match(/^[\d\w]+?_\s*(.+?)\.pdf/)[1].gsub('_', ' ')
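+    # Each link is saved with its text as the title; 'date_last_seen' records the date this
+    # scrape last found the link.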
+
+    docs << {
+      'title' => d.inner_text.strip,
+      'url' => URI::encode(SITE_URL + d['href']),
+      'date_last_seen' => Date.today.to_s
+    }
+  end
+
+  docs
+end
diff --git a/scraper.rb b/scraper.rb
index 5799e98..f066649 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,25 +1,233 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+require 'http'
+require 'nokogiri'
+require 'uri'
+require 'scraperwiki'
+require 'pp'
+require_relative './parser'
+require 'date'
+require 'logger'
+require 'securerandom'
+
+# Northgate Planning Explorer
+
+SITE_URL = 'http://planning.merton.gov.uk'
+BASE_URL = SITE_URL + '/Northgate/PlanningExplorerAA/Generic/'
+
+def crawl_delay
+  sleep DELAY_S
+end
+
+DELAY_S = (ENV['SCRAPER_DELAY'] || 10).to_f # seconds. Conservatively slow by default. Scrapes approx 360 pages per hour.
+USER_AGENT = ENV['SCRAPER_USER_AGENT']
+DATE_REGEX = /\d{2}-\d{2}-\d{4}/
+
+$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
+logger = Logger.new($stdout)
+logger.level = (ENV['SCRAPER_LOG_LEVEL'] || Logger::INFO).to_i
+logger.info "Scraper starts. Let's do this."
+logger.info "Delay between requests is #{DELAY_S} seconds."
+logger.info "User agent is: #{USER_AGENT}"
+logger.info "Log level is: #{logger.level}"
+
+# General search
+URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'
+
+form_vars = {
+  # 'cboStatusCode' => '4', # REGISTERED
+  'cboSelectDateValue' => 'DATE_RECEIVED',
+  # 'cboMonths' => '12', # 1..12
+  'cboDays' => 1,
+  'rbGroup' => 'rbDay',
+  'csbtnSearch' => 'Search' # required
+}
+
+logger.info "Form variables: #{form_vars.to_s}"
+
+headers = {
+  'Origin' => SITE_URL,
+  'Referer' => URL,
+  'User-Agent' => USER_AGENT
+}
+
+logger.debug "HTTP request headers:"
+logger.debug(headers.to_s)
+
+logger.debug "GET: " + URL
+response = HTTP.headers(headers).get(URL)
+logger.debug "Response code: HTTP " + response.code.to_s
+
+if response.code == 200
+  doc = Nokogiri::HTML(response.to_s)
+  asp_vars = {
+    '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
+    '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
+    '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
+  }
+else
+  logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
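+  # Without the ASP.NET state fields (__VIEWSTATE etc.) from this page we can't submit the search form.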
+  exit 1
+end
+
+cookies = {}
+response.cookies.each { |c| cookies[c.name] = c.value }
+
+form_vars.merge!(asp_vars)
+
+logger.debug "POST: " + URL
+response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
+logger.debug "Response code: HTTP " + response2.code.to_s
+
+if response2.code == 302
+  # Follow the redirect manually
+  # Set the page size (PS) to max so we don't have to page through search results
+  results_url = URI::encode(SITE_URL + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
+
+  logger.debug "GET: " + results_url
+  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
+  logger.debug "Response code: HTTP " + response3.code.to_s
+  doc = Nokogiri::HTML(response3.to_s)
+else
+  logger.fatal "Didn't get redirected from search. Exiting."
+  exit 1
+end
+
+rows = doc.search("table.display_table tr")
+logger.info "Found #{rows.size - 1} applications in search results."
+
+app_defaults = {
+  'la_name' => 'Merton Borough Council',
+  'la_slug' => 'merton',
+  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
+  'date_details_scraped' => nil,
+  'date_documents_scraped' => nil,
+  'date_dates_scraped' => nil
+}
+logger.debug "Application defaults: "
+logger.debug app_defaults.to_s
+
+# Iterate over search results
+rows.each do |row|
+  if row.at("td") # skip header row, which only has th's
+    cells = row.search("td")
+    ref = cells[0].inner_text.strip
+
+    app = app_defaults.merge(
+      'created_at' => Time.now.to_s,
+      'uuid' => SecureRandom.uuid
+    )
+
+    begin
+      res = ScraperWiki.select("* from applications where council_reference=?", ref)
+    rescue # In case the table doesn't exist, which it won't on first run
+      true
+    end
+
+    app = res[0] if res && res[0] # res will be nil if the table doesn't exist; [] if that record doesn't exist
+
+    app['council_reference'] = ref
+    app['info_url'] = URI::encode(BASE_URL + cells[0].at("a")['href'].strip)
+    app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
+
+    app['address'] = cells[1].inner_text.strip
+    app['description'] = cells[2].inner_text.strip
+    app['status'] = cells[3].inner_text.strip
+
+    raw_date_received = cells[4].inner_text.strip
+
+    if raw_date_received != '--'
+      app['date_received'] = Date.parse(raw_date_received)
+    else
+      app['date_received'] = nil
+    end
+
+    app['decision'] = cells[5].inner_text.strip
+    app['date_scraped'] = Date.today.to_s
+
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  end
+end
+
+# Scrape details for all apps that don't have them
+apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
+
+logger.info "Scraping details for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping details for app: #{app['council_reference']}."
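+  # Pause for SCRAPER_DELAY seconds before each request so we don't hammer the council's server.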
+  crawl_delay
+
+  # Scrape details page
+  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])
+
+  if res.code == 200
+    # Parse details page
+    parsed_details = parse_details(res.to_s)
+    app.merge!(parsed_details)
+    app['date_details_scraped'] = Date.today.to_s
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['info_url']} - HTTP " + res.code.to_s # FIXME improve message
+  end
+end
+
+# Scrape dates page for apps that don't have them
+apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
+logger.info "Scraping dates for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping dates for #{app['council_reference']}."
+  crawl_delay
+
+  # Scrape dates page
+  res = HTTP.headers(headers).cookies(cookies).get(app['dates_url'])
+
+  if res.code == 200
+    # Parse dates page
+    parsed_dates = parse_dates(res.to_s)
+    app.merge!(parsed_dates)
+    app['date_dates_scraped'] = Date.today.to_s
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['dates_url']} - HTTP " + res.code.to_s
+  end
+end
+
+# Scrape documents for apps that don't have them
+apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
+logger.info "Scraping documents for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping documents for #{app['council_reference']}."
+  crawl_delay
+
+  # Scrape documents page
+  res = HTTP.headers(headers).cookies(cookies).get(app['documents_url'])
+
+  if res.code == 200
+    # Parse documents page
+    docs = parse_documents(res.to_s)
+
+    docs.each do |d|
+      d['council_reference'] = app['council_reference']
+      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
+    end
+
+    app['documents_qty'] = docs.size
+    app['date_documents_scraped'] = Date.today.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['documents_url']} - HTTP " + res.code.to_s
+  end
+end
+
+logger.info "Scraper finishes. We did it."
+logger.close