123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253 |
- require 'http'
- require 'nokogiri'
- require 'uri'
- require 'scraperwiki'
- require 'pp'
- require_relative './parser'
- require 'date'
- require 'logger'
- require 'securerandom'
# Northgate Planning Explorer
# Root of the planning site. Frozen so the shared string cannot be mutated by accident.
SITE_URL = 'https://planning.merton.gov.uk'.freeze
# Prefix for the relative links found in the search results table.
BASE_URL = "#{SITE_URL}/Northgate/PlanningExplorerAA/Generic/".freeze
# Wait DELAY_S seconds. Called before each per-application page request.
# Returns whatever Kernel#sleep returns.
def crawl_delay
  Kernel.sleep(DELAY_S)
end
# Seconds to wait between requests. Conservatively slow by default:
# 10 seconds scrapes approx 360 pages per hour.
# The default is applied before `to_f` on purpose: nil.to_f is 0.0 and 0.0 is
# truthy in Ruby, so `ENV['MORPH_DELAY'].to_f || 10` can never fall back to 10.
DELAY_S = ENV.fetch('MORPH_DELAY', 10).to_f
# Matches a date written as two digits, two digits, four digits, separated by hyphens.
DATE_REGEX = /\d{2}-\d{2}-\d{4}/
# Flush output buffer after every write so log messages appear immediately.
$stdout.sync = true

logger = Logger.new($stdout)
# Default to INFO when MORPH_LOG_LEVEL is unset. The fallback has to be applied
# before `to_i`: nil.to_i is 0 (Logger::DEBUG) and 0 is truthy in Ruby, so
# `ENV['MORPH_LOG_LEVEL'].to_i || Logger::INFO` always selected DEBUG.
logger.level = (ENV['MORPH_LOG_LEVEL'] || Logger::INFO).to_i

logger.info "Scraper starts. Let's do this."
logger.info "Delay between requests is #{DELAY_S} seconds."
# NOTE(review): USER_AGENT is not defined in the visible part of this file --
# confirm it is set before this line runs.
logger.info "User agent is: #{USER_AGENT}"
logger.info "Log level is: #{logger.level}"
# General search
# The search form is fetched from, and posted back to, this URL.
URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'

# Fields submitted with the search form.
form_vars = {
  'cboSelectDateValue' => 'DATE_RECEIVED',
  'csbtnSearch' => 'Search' # required
}

# The search period is taken from the environment.
# NOTE(review): the three guard conditions below were missing from this file
# (only their bodies and dangling `end`s were present). They are reconstructed
# from the comment and the fatal log message -- confirm they match the intent.
unless ENV['MORPH_DAYS'] || ENV['MORPH_MONTHS']
  logger.fatal "Neither MORPH_MONTHS nor MORPH_DAYS set. Nothing to scrape. Exiting."
  exit 1
end

if ENV['MORPH_MONTHS']
  form_vars.merge!(
    'cboMonths' => ENV['MORPH_MONTHS'],
    'rbGroup' => 'rbMonth'
  )
end

# If both MORPH_DAYS and MORPH_MONTHS are set, MORPH_DAYS should be used.
# This merge runs last, so it overrides the month selection made above.
if ENV['MORPH_DAYS']
  form_vars.merge!(
    'cboMonths' => nil,
    'cboDays' => ENV['MORPH_DAYS'],
    'rbGroup' => 'rbDay'
  )
end

# Optional filter on application status.
form_vars['cboStatusCode'] = ENV['MORPH_STATUS'] if ENV['MORPH_STATUS']

logger.info "Form variables: #{form_vars}"
# Headers sent with every HTTP request made by this script.
headers = {
  'Origin' => SITE_URL,
  'Referer' => URL,
  'User-Agent' => USER_AGENT
}
logger.debug "HTTP request headers:"
logger.debug(headers.to_s)

# Fetch the search form first; its hidden fields are needed for the POST.
logger.debug "GET: #{URL}"
response = HTTP.headers(headers).get(URL)
logger.debug "Response code: HTTP #{response.code}"

unless response.code == 200
  logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
  exit 1
end

# Read the values of the __VIEWSTATE and __EVENTVALIDATION elements from the page.
doc = Nokogiri::HTML(response.to_s)
asp_vars = %w[__VIEWSTATE __EVENTVALIDATION].each_with_object({}) do |field, vars|
  vars[field] = doc.at("##{field}")['value']
end
# Carry the cookies from the first response over to every later request.
cookies = {}
response.cookies.each { |c| cookies[c.name] = c.value }

form_vars.merge!(asp_vars)

# Submit the search form.
logger.debug "POST: #{URL}"
response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
logger.debug "Response code: HTTP #{response2.code}"

unless response2.code == 302
  logger.fatal "Didn't get redirected from search. Exiting."
  exit 1
end

# Follow the redirect manually.
# Set the page size (PS) to max so we don't have to page through search results.
# Non-bang gsub is required here: gsub! returns nil when there is no 'PS=10'
# to replace, which would make the string concatenation below raise.
location = response2.headers['Location'].gsub('PS=10', 'PS=99999')
# NOTE(review): URI::encode was removed in Ruby 3.0 -- confirm the Ruby version
# this runs on before upgrading.
results_url = URI::encode(SITE_URL + location)

logger.debug "GET: #{results_url}"
response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
logger.debug "Response code: HTTP #{response3.code}"
doc = Nokogiri::HTML(response3.to_s)
rows = doc.search("table.display_table tr")
# Count only rows with td cells (the header row only has th's). Subtracting one
# from rows.size reported -1 when the results table was missing altogether.
result_count = rows.count { |row| row.at("td") }
logger.info "Found #{result_count} applications in search results."

# Values given to every application record that is not already in the database.
app_defaults = {
  'la_name' => 'Merton Borough Council',
  'la_slug' => 'merton',
  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
  'date_details_scraped' => nil,
  'date_documents_scraped' => nil,
  'date_dates_scraped' => nil
}
logger.debug "Application defaults: "
logger.debug app_defaults.to_s
# Walk the search results and save (or refresh) one record per data row.
rows.each do |row|
  next unless row.at("td") # skip header row which only has th's

  cells = row.search("td")
  ref = cells[0].inner_text.strip

  # Start from the defaults; replaced below if the application is already stored.
  app = app_defaults.merge(
    'created_at' => Time.now.to_s,
    'uuid' => SecureRandom.uuid
  )

  # nil if the table doesn't exist (which it won't on first run);
  # [] if that record doesn't exist.
  existing = begin
    ScraperWiki.select("* from applications where council_reference=?", ref)
  rescue
    nil
  end
  app = existing[0] if existing && existing[0]

  link = cells[0].at("a")['href'].strip
  received = cells[4].inner_text.strip

  app['council_reference'] = ref
  app['info_url'] = URI::encode(BASE_URL + link)
  app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
  app['address'] = cells[1].inner_text.strip
  app['description'] = cells[2].inner_text.strip
  app['status'] = cells[3].inner_text.strip
  # '--' is what the results table shows when there is no received date.
  app['date_received'] = received == '--' ? nil : Date.parse(received)
  app['decision'] = cells[5].inner_text.strip
  app['date_scraped'] = Date.today.to_s
  app['updated_at'] = Time.now.to_s

  ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
end
# Scrape the details page of every application whose details have never been
# scraped, or were last scraped before today. Most recently received first.
apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
total = apps.size
logger.info "Scraping details for #{total} applications."

apps.each.with_index(1) do |app, position|
  logger.info "#{position} of #{total}: Scraping details for app: #{app['council_reference']}."
  crawl_delay

  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])
  unless res.code == 200
    logger.error "Failed to get #{app['info_url']} - HTTP #{res.code}" # FIXME improve message
    next
  end

  # Fold the parsed fields into the record and stamp it as scraped today.
  app.merge!(parse_details(res.to_s))
  app['date_details_scraped'] = Date.today.to_s
  app['updated_at'] = Time.now.to_s
  ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
end
# Scrape the dates page of every application whose dates have never been
# scraped, or were last scraped before today. Most recently received first.
apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
total = apps.size
logger.info "Scraping dates for #{total} applications."

apps.each.with_index(1) do |app, position|
  logger.info "#{position} of #{total}: Scraping dates for #{app['council_reference']}."

  # Presumably dates_url is filled in by parse_details -- verify. A record whose
  # details request failed has no URL to fetch, so skip it instead of requesting nil.
  dates_url = app['dates_url']
  if dates_url.to_s.empty?
    logger.warn "No dates URL stored for #{app['council_reference']}. Skipping."
    next
  end

  crawl_delay

  res = HTTP.headers(headers).cookies(cookies).get(dates_url)
  unless res.code == 200
    logger.error "Failed to get #{dates_url} - HTTP #{res.code}"
    next
  end

  # Fold the parsed dates into the record and stamp it as scraped today.
  app.merge!(parse_dates(res.to_s))
  app['date_dates_scraped'] = Date.today.to_s
  app['updated_at'] = Time.now.to_s
  ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
end
# Scrape the documents page of every application whose documents have never
# been scraped, or were last scraped before today. Most recently received first.
apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
total = apps.size
logger.info "Scraping documents for #{total} applications."

apps.each.with_index(1) do |app, position|
  logger.info "#{position} of #{total}: Scraping documents for #{app['council_reference']}."

  # Presumably documents_url is filled in by parse_details -- verify. Without a
  # stored URL there is nothing to fetch, so skip instead of requesting nil.
  documents_url = app['documents_url']
  if documents_url.to_s.empty?
    logger.warn "No documents URL stored for #{app['council_reference']}. Skipping."
    next
  end

  crawl_delay

  res = HTTP.headers(headers).cookies(cookies).get(documents_url)
  unless res.code == 200
    logger.error "Failed to get #{documents_url} - HTTP #{res.code}"
    next
  end

  # One row per document, keyed on the application reference plus document URL.
  docs = parse_documents(res.to_s)
  docs.each do |d|
    d['council_reference'] = app['council_reference']
    ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
  end

  app['documents_qty'] = docs.size
  app['date_documents_scraped'] = Date.today.to_s
  # Refresh updated_at on save, as the details and dates passes do.
  app['updated_at'] = Time.now.to_s
  ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
end

logger.info "Scraper finishes. We did it."
logger.close