From 01b19c45074d0cbe52bc31c37fb8653d05675711 Mon Sep 17 00:00:00 2001
From: Adrian Short
Date: Thu, 6 Sep 2018 15:53:59 +0100
Subject: [PATCH] Add scraper code

---
 .gitignore   |  11 ++-
 Gemfile      |   8 +-
 Gemfile.lock |  41 ++++----
 README.md    |  70 +++++++++++++-
 parser.rb    | 122 ++++++++++++++++++++++++
 scraper.rb   | 258 ++++++++++++++++++++++++++++++++++++++++++++++-----
 6 files changed, 460 insertions(+), 50 deletions(-)
 create mode 100644 parser.rb

diff --git a/.gitignore b/.gitignore
index 66d464d..38bac37 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,9 @@
-# Ignore output of scraper
-data.sqlite
+*.sqlite
+*.db
+.ruby-*
+.DS_Store
+*.csv
+*.xls*
+*.json
+.env
+*.txt
diff --git a/Gemfile b/Gemfile
index 6ab45dc..3784d12 100644
--- a/Gemfile
+++ b/Gemfile
@@ -3,8 +3,8 @@
 # Find out more: https://morph.io/documentation/ruby
 
 source "https://rubygems.org"
-
-ruby "2.0.0"
-
+ruby "2.3.1"
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem "breasal"
+gem "http"
+gem "nokogiri"
diff --git a/Gemfile.lock b/Gemfile.lock
index 30fb5f3..543ff79 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -10,38 +10,43 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    breasal (0.0.1)
     domain_name (0.5.24)
       unf (>= 0.0.5, < 1.0.0)
+    http (3.3.0)
+      addressable (~> 2.3)
+      http-cookie (~> 1.0)
+      http-form_data (~> 2.0)
+      http_parser.rb (~> 0.6.0)
     http-cookie (1.0.2)
       domain_name (~> 0.5)
+    http-form_data (2.1.1)
+    http_parser.rb (0.6.0)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
-      domain_name (~> 0.5, >= 0.5.1)
-      http-cookie (~> 1.0)
-      mime-types (~> 2.0)
-      net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
-      ntlm-http (~> 0.1, >= 0.1.1)
-      webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
-    ntlm-http (0.1.1)
+    mini_portile2 (2.3.0)
+    nokogiri (1.8.4)
+      mini_portile2 (~> 2.3.0)
+    public_suffix (3.0.2)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.7.1)
-    webrobots (0.1.1)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  mechanize
+  breasal
+  http
+  nokogiri
   scraperwiki!
+
+RUBY VERSION
+   ruby 2.3.1p112
+
+BUNDLED WITH
+   1.16.4
diff --git a/README.md b/README.md
index e541894..5d81107 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,69 @@
-This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
+# Merton Council planning applications scraper
+
+This scrapes planning applications data from [Merton Council's planning database website](http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/GeneralSearch.aspx) and puts it in an SQLite database.
+
+Merton Council runs [Northgate Planning Explorer](https://www.northgateps.com).
+
+This scraper is designed to run once per 24 hours.
+
+It runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation).
+
+## Schema
+
+The schema is based on the core elements from [planningalerts.org.au](https://www.planningalerts.org.au/how_to_write_a_scraper).
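+
+As a rough illustration, a row in the `applications` table ends up with fields along these lines. The field names come from `scraper.rb` and `parser.rb`; the values below are invented examples:
+
+    {
+      'council_reference' => '18/P1234',
+      'address'           => '1 Example Road, Mitcham',
+      'description'       => 'Erection of a single-storey rear extension',
+      'status'            => 'Registered',
+      'date_received'     => '2018-09-01',
+      'date_scraped'      => '2018-09-06',
+      'info_url'          => 'http://planning.merton.gov.uk/Northgate/PlanningExplorerAA/Generic/...',
+      'la_name'           => 'Merton Borough Council',
+      'la_slug'           => 'merton',
+      'la_gss'            => 'E09000024'
+    }
+
+Further columns (applicant and agent names, easting/northing and WGS84 coordinates, appeal fields, key dates and document links) are added as the details, dates and documents pages are scraped. Documents go in a separate `documents` table keyed by council reference and URL.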
+
+## Installation
+
+    $ git clone https://github.com/adrianshort/merton-planning-applications.git
+    $ cd merton-planning-applications
+    $ bundle
+
+### Configuration
+
+According to the principle of _one codebase, many deploys_, this scraper is [configured using environment variables](https://12factor.net/config) rather than by editing constants in the code.
+
+| Name | Purpose | Default | Required? |
+|------|---------|---------|-----------|
+| SCRAPER_DELAY | Minimum delay in seconds between HTTP requests to the server. | 10 | No |
+| SCRAPER_USER_AGENT | User agent string sent as an HTTP request header. | _None_ | Yes |
+| SCRAPER_LOG_LEVEL | Controls the level of detail in the output logs according to [Ruby's `Logger` class](https://ruby-doc.org/stdlib-2.1.0/libdoc/logger/rdoc/Logger.html) constants. | 1 _(Logger::INFO)_ | No |
+
+## Running
+
+    $ bundle exec ruby scraper.rb
+
+## Logging
+
+[Log messages are written unbuffered to `STDOUT`.](https://12factor.net/logs) You can redirect them to a file or the log drain of your choice.
+
+    $ bundle exec ruby scraper.rb >> log.txt
+
+Morph.io will only show the first 10,000 lines of log output. This constraint doesn't apply when running elsewhere, e.g. on your local machine.
+
+## Similar projects
+
+- [maxharlow/scrape-planning-northgate](https://github.com/maxharlow/scrape-planning-northgate) (Node)
+- [adrianshort/planningalerts](https://github.com/adrianshort/planningalerts), especially the [Python scrapers for Northgate Planning Explorer](https://github.com/adrianshort/planningalerts/blob/master/python_scrapers/PlanningExplorer.py) - not written by me; that repo is a copy of the original PlanningAlerts codebase
+
+## Tags
+
+- Merton
+- Merton Council
+- London
+- UK
+- localgov
+- localgovdigital
+- opendata
+- Morph
+- ScraperWiki
+- planning
+- Planning Alerts
+- plantech
+- civictech
+
+## Author
+
+By [Adrian Short](https://www.adrianshort.org/).
+
+This project is not by or affiliated with Merton Council.
diff --git a/parser.rb b/parser.rb
new file mode 100644
index 0000000..ad9612b
--- /dev/null
+++ b/parser.rb
@@ -0,0 +1,122 @@
+require 'nokogiri'
+require 'breasal'
+require 'date'
+require 'uri'
+require 'pp'
+
+def clean_end(s)
+  # Removes trailing spaces, including Unicode whitespace (e.g. char 160), from the end of a string.
+  # Returns nil if the resulting string is empty.
+  s.strip!
+  s.sub!(/\p{Zs}+$/, '')
+  return nil if s == ''
+  s
+end
+
+def cleanup(items)
+  # Regex doesn't work across multiple text lines by default
+  items.map { |i| i.inner_html.strip.gsub(/&.+;/, '').gsub(/.*<\/span>/m, '').gsub(/[\t\r\n]/m, '') }
+end
+
+def parse_details(html)
+  doc = Nokogiri::HTML(html)
+  app = {}
+  lists = doc.search("ul.list")
+
+  # First ul is Application Progress Summary
+  items = lists[0].search("li div")
+  values = cleanup(items)
+
+  app['date_received'] = Date.parse(values[0]) if values[0].match(DATE_REGEX)
+  app['status'] = clean_end(values[1])
+  app['on_notice_to'] = Date.parse(values[2]) if values[2].match(DATE_REGEX)
+  app['recommendation'] = clean_end(values[3])
+  app['date_committee'] = Date.parse(values[4]) if values[4].match(DATE_REGEX)
+  app['decision'] = clean_end(values[5])
+  app['date_appeal_lodged'] = Date.parse(values[6]) if values[6].match(DATE_REGEX) # FIXME Is this actually a date or a Yes/No?
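+  # Note: 'appeal_decision' is set from values[7] just below, but it is overwritten later
+  # by the value parsed from the Application Details list (items[10]).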
+  app['appeal_decision'] = clean_end(values[7])
+
+  # Second ul is Application Details
+
+  items = lists[1].search("li div")
+  # Regex doesn't work across multiple text lines by default
+  values = items.map { |i| i.inner_html.strip.gsub(/&.+;/m, '') }
+
+  app['council_reference'] = clean_end(items[0].children[2].inner_text)
+  app['application_type'] = clean_end(items[2].children[2].inner_text)
+  app['applicant_name'] = clean_end(items[5].children[2].inner_text)
+  app['agent_name'] = clean_end(items[6].children[2].inner_text)
+  app['wards'] = clean_end(items[7].children[2].inner_text)
+
+  en_string = values[8].match(/Easting.+?(\d+).+?Northing.+?(\d+)/)
+  app['easting'] = en_string[1].to_i
+  app['northing'] = en_string[2].to_i
+  en = Breasal::EastingNorthing.new(easting: app['easting'], northing: app['northing'], type: :gb)
+  app['latitude'] = en.to_wgs84[:latitude]
+  app['longitude'] = en.to_wgs84[:longitude]
+
+  app['appeal_submitted'] = clean_end(items[9].children[2].inner_text)
+  app['appeal_decision'] = clean_end(items[10].children[2].inner_text)
+
+  if items[11].children[2].inner_text.match(/\d+/)
+    app['case_officer_phone'] = clean_end(items[11].children[2].inner_text.gsub(/[\r\n\t]/, '')).match(/(\d+)/)[1].sub(/^44/, '0')
+  end
+
+  app['division'] = clean_end(items[12].children[2].inner_text.gsub('-', ''))
+  app['case_officer_name'] = clean_end(items[13].children[2].inner_text)
+  app['determination_level'] = clean_end(items[14].children[2].inner_text)
+  app['existing_land_use'] = clean_end(items[15].children[2].inner_text)
+  app['proposed_land_use'] = clean_end(items[16].children[2].inner_text)
+
+  # Third ul is Other Information Available for Planning Application...
+
+  links = doc.search("a.FooterLinks")
+  app['documents_url'] = SITE_URL + links[0]['href'].gsub(/[\r\n\t]/, '')
+  app['dates_url'] = URI::encode(BASE_URL + links[1]['href']).gsub(/%0./m, '')
+  app['checks_url'] = URI::encode(BASE_URL + links[2]['href']).gsub(/%0./m, '')
+  app['meetings_url'] = URI::encode(BASE_URL + links[3]['href']).gsub(/%0./m, '')
+  app['constraints_url'] = URI::encode(BASE_URL + links[4]['href']).gsub(/%0./m, '')
+  app['site_history_url'] = URI::encode(BASE_URL + links[5]['href']).gsub(/%0./m, '') if links[5]
+
+  app
+end
+
+def parse_dates(html)
+  doc = Nokogiri::HTML(html)
+  app = {}
+  dates = []
+  doc.search(".dataview ul div").each { |row| dates << row.children[2].inner_text }
+
+  app['date_received'] = Date.parse(dates[0]) if dates[0].match(DATE_REGEX)
+  app['date_first_advertised'] = Date.parse(dates[1]) if dates[1].match(DATE_REGEX)
+  app['date_registered'] = Date.parse(dates[2]) if dates[2].match(DATE_REGEX)
+  app['date_first_site_notice'] = Date.parse(dates[3]) if dates[3].match(DATE_REGEX)
+  app['date_valid'] = Date.parse(dates[4]) if dates[4].match(DATE_REGEX)
+  app['on_notice_to'] = Date.parse(dates[5]) if dates[5].match(DATE_REGEX)
+  app['date_validated'] = Date.parse(dates[6]) if dates[6].match(DATE_REGEX)
+  app['target_date'] = Date.parse(dates[7]) if dates[7].match(DATE_REGEX)
+  app['stat_cons_expiry_date'] = Date.parse(dates[8]) if dates[8].match(DATE_REGEX)
+  app['decision_expiry_date'] = Date.parse(dates[9]) if dates[9].match(DATE_REGEX)
+  app['first_consultation_date'] = Date.parse(dates[10]) if dates[10].match(DATE_REGEX)
+  app['extended_expiry_date'] = Date.parse(dates[11]) if dates[11].match(DATE_REGEX)
+
+  app
+end
+
+def parse_documents(html)
+  doc = Nokogiri::HTML(html)
+  docs = []
+
+  doc.search("#tblContent td a").each do |d|
+    # title = d.inner_text.strip.match(/^[\d\w]+?_\s*(.+?)\.pdf/)[1].gsub('_', ' ')
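+    # Each link is saved with its text as the title; 'date_last_seen' records the date this
+    # scrape last found the link.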
+
+    docs << {
+      'title' => d.inner_text.strip,
+      'url' => URI::encode(SITE_URL + d['href']),
+      'date_last_seen' => Date.today.to_s
+    }
+  end
+
+  docs
+end
diff --git a/scraper.rb b/scraper.rb
index 5799e98..f066649 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,25 +1,233 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
-
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+require 'http'
+require 'nokogiri'
+require 'uri'
+require 'scraperwiki'
+require 'pp'
+require_relative './parser'
+require 'date'
+require 'logger'
+require 'securerandom'
+
+# Northgate Planning Explorer
+
+SITE_URL = 'http://planning.merton.gov.uk'
+BASE_URL = SITE_URL + '/Northgate/PlanningExplorerAA/Generic/'
+
+def crawl_delay
+  sleep DELAY_S
+end
+
+DELAY_S = (ENV['SCRAPER_DELAY'] || 10).to_f # seconds. Conservatively slow by default. Scrapes approx 360 pages per hour.
+USER_AGENT = ENV['SCRAPER_USER_AGENT']
+DATE_REGEX = /\d{2}-\d{2}-\d{4}/
+
+$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
+logger = Logger.new($stdout)
+logger.level = (ENV['SCRAPER_LOG_LEVEL'] || Logger::INFO).to_i
+logger.info "Scraper starts. Let's do this."
+logger.info "Delay between requests is #{DELAY_S} seconds."
+logger.info "User agent is: #{USER_AGENT}"
+logger.info "Log level is: #{logger.level}"
+
+# General search
+URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'
+
+form_vars = {
+  # 'cboStatusCode' => '4', # REGISTERED
+  'cboSelectDateValue' => 'DATE_RECEIVED',
+  # 'cboMonths' => '12', # 1..12
+  'cboDays' => 1,
+  'rbGroup' => 'rbDay',
+  'csbtnSearch' => 'Search' # required
+}
+
+logger.info "Form variables: #{form_vars.to_s}"
+
+headers = {
+  'Origin' => SITE_URL,
+  'Referer' => URL,
+  'User-Agent' => USER_AGENT
+}
+
+logger.debug "HTTP request headers:"
+logger.debug(headers.to_s)
+
+logger.debug "GET: " + URL
+response = HTTP.headers(headers).get(URL)
+logger.debug "Response code: HTTP " + response.code.to_s
+
+if response.code == 200
+  doc = Nokogiri::HTML(response.to_s)
+  asp_vars = {
+    '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
+    '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
+    '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
+  }
+else
+  logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
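+  # Without the ASP.NET state fields (__VIEWSTATE etc.) from this page we can't submit the search form.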
+  exit 1
+end
+
+cookies = {}
+response.cookies.each { |c| cookies[c.name] = c.value }
+
+form_vars.merge!(asp_vars)
+
+logger.debug "POST: " + URL
+response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
+logger.debug "Response code: HTTP " + response2.code.to_s
+
+if response2.code == 302
+  # Follow the redirect manually
+  # Set the page size (PS) to max so we don't have to page through search results
+  results_url = URI::encode(SITE_URL + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
+
+  logger.debug "GET: " + results_url
+  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
+  logger.debug "Response code: HTTP " + response3.code.to_s
+  doc = Nokogiri::HTML(response3.to_s)
+else
+  logger.fatal "Didn't get redirected from search. Exiting."
+  exit 1
+end
+
+rows = doc.search("table.display_table tr")
+logger.info "Found #{rows.size - 1} applications in search results."
+
+app_defaults = {
+  'la_name' => 'Merton Borough Council',
+  'la_slug' => 'merton',
+  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
+  'date_details_scraped' => nil,
+  'date_documents_scraped' => nil,
+  'date_dates_scraped' => nil
+}
+logger.debug "Application defaults: "
+logger.debug app_defaults.to_s
+
+# Iterate over search results
+rows.each do |row|
+  if row.at("td") # skip header row, which only has th's
+    cells = row.search("td")
+    ref = cells[0].inner_text.strip
+
+    app = app_defaults.merge(
+      'created_at' => Time.now.to_s,
+      'uuid' => SecureRandom.uuid
+    )
+
+    begin
+      res = ScraperWiki.select("* from applications where council_reference=?", ref)
+    rescue # In case the table doesn't exist, which it won't on first run
+      true
+    end
+
+    app = res[0] if res && res[0] # res will be nil if the table doesn't exist; [] if that record doesn't exist
+
+    app['council_reference'] = ref
+    app['info_url'] = URI::encode(BASE_URL + cells[0].at("a")['href'].strip)
+    app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
+
+    app['address'] = cells[1].inner_text.strip
+    app['description'] = cells[2].inner_text.strip
+    app['status'] = cells[3].inner_text.strip
+
+    raw_date_received = cells[4].inner_text.strip
+
+    if raw_date_received != '--'
+      app['date_received'] = Date.parse(raw_date_received)
+    else
+      app['date_received'] = nil
+    end
+
+    app['decision'] = cells[5].inner_text.strip
+    app['date_scraped'] = Date.today.to_s
+
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  end
+end
+
+# Scrape details for all apps that don't have them
+apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
+
+logger.info "Scraping details for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping details for app: #{app['council_reference']}."
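+  # Pause for SCRAPER_DELAY seconds before each request so we don't hammer the council's server.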
+  crawl_delay
+
+  # Scrape details page
+  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])
+
+  if res.code == 200
+    # Parse details page
+    parsed_details = parse_details(res.to_s)
+    app.merge!(parsed_details)
+    app['date_details_scraped'] = Date.today.to_s
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['info_url']} - HTTP " + res.code.to_s # FIXME improve message
+  end
+end
+
+# Scrape dates page for apps that don't have them
+apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
+logger.info "Scraping dates for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping dates for #{app['council_reference']}."
+  crawl_delay
+
+  # Scrape dates page
+  res = HTTP.headers(headers).cookies(cookies).get(app['dates_url'])
+
+  if res.code == 200
+    # Parse dates page
+    parsed_dates = parse_dates(res.to_s)
+    app.merge!(parsed_dates)
+    app['date_dates_scraped'] = Date.today.to_s
+    app['updated_at'] = Time.now.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['dates_url']} - HTTP " + res.code.to_s
+  end
+end
+
+# Scrape documents for apps that don't have them
+apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
+logger.info "Scraping documents for #{apps.size} applications."
+
+i = 0
+apps.each do |app|
+  i += 1
+  logger.info "#{i} of #{apps.size}: Scraping documents for #{app['council_reference']}."
+  crawl_delay
+
+  # Scrape documents page
+  res = HTTP.headers(headers).cookies(cookies).get(app['documents_url'])
+
+  if res.code == 200
+    # Parse documents page
+    docs = parse_documents(res.to_s)
+
+    docs.each do |d|
+      d['council_reference'] = app['council_reference']
+      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
+    end
+
+    app['documents_qty'] = docs.size
+    app['date_documents_scraped'] = Date.today.to_s
+    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
+  else
+    logger.error "Failed to get #{app['documents_url']} - HTTP " + res.code.to_s
+  end
+end
+
+logger.info "Scraper finishes. We did it."
+logger.close