Merton Council planning applications

require 'http'
require 'nokogiri'
require 'uri'
require 'scraperwiki'
require 'pp'
require_relative './parser'
require 'date'
require 'logger'
require 'securerandom'

# Northgate Planning Explorer
SITE_URL = 'http://planning.merton.gov.uk'
BASE_URL = SITE_URL + '/Northgate/PlanningExplorerAA/Generic/'
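# BASE_URL is the prefix for the relative application links found in the search results table.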

# Delay between requests, in seconds. Conservatively slow by default: scrapes approx 360 pages per hour.
# Note the parenthesised fallback: nil.to_f is 0, so ENV['SCRAPER_DELAY'].to_f || 10 would never fall back.
DELAY_S = (ENV['SCRAPER_DELAY'] || 10).to_f
USER_AGENT = ENV['SCRAPER_USER_AGENT']
DATE_REGEX = /\d{2}-\d{2}-\d{4}/

def crawl_delay
  sleep DELAY_S
end

$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
logger = Logger.new($stdout)
logger.level = (ENV['SCRAPER_LOG_LEVEL'] || Logger::INFO).to_i
logger.info "Scraper starts. Let's do this."
logger.info "Delay between requests is #{DELAY_S} seconds."
logger.info "User agent is: #{USER_AGENT}"
logger.info "Log level is: #{logger.level}"

# General search
URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'

form_vars = {
  # 'cboStatusCode' => '4', # REGISTERED
  'cboSelectDateValue' => 'DATE_RECEIVED',
  # 'cboMonths' => '12', # 1..12
  'cboDays' => 1,
  'rbGroup' => 'rbDay',
  'csbtnSearch' => 'Search' # required
}
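# rbGroup => 'rbDay' with cboDays => 1 appears to ask for applications received in the last day;
# the commented-out cboMonths and cboStatusCode fields look like the hooks for a wider or
# status-filtered search.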
logger.info "Form variables: #{form_vars}"

headers = {
  'Origin' => SITE_URL,
  'Referer' => URL,
  'User-Agent' => USER_AGENT
}
logger.debug "HTTP request headers:"
logger.debug(headers.to_s)

logger.debug "GET: " + URL
response = HTTP.headers(headers).get(URL)
logger.debug "Response code: HTTP " + response.code.to_s
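
# The search page is an ASP.NET WebForms page, so the hidden __VIEWSTATE, __VIEWSTATEGENERATOR
# and __EVENTVALIDATION fields have to be read from the form and posted back with our search.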
if response.code == 200
  doc = Nokogiri::HTML(response.to_s)
  asp_vars = {
    '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
    '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
    '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
  }
else
  logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
  exit 1
end
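
# Carry the session cookies set by the search page through the rest of the crawl, so the server
# keeps treating our requests as one ASP.NET session.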
cookies = {}
response.cookies.each { |c| cookies[c.name] = c.value }

form_vars.merge!(asp_vars)

logger.debug "POST: " + URL
response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
logger.debug "Response code: HTTP " + response2.code.to_s
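
# A successful search POST answers with a 302 redirect to the results page. The http gem doesn't
# follow redirects by default, which suits us: we want to rewrite the results URL before fetching it.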
if response2.code == 302
  # Follow the redirect manually.
  # Set the page size (PS) to max so we don't have to page through search results.
  results_url = URI::encode(SITE_URL + response2.headers['Location'].gsub('PS=10', 'PS=99999'))
  logger.debug "GET: " + results_url
  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
  logger.debug "Response code: HTTP " + response3.code.to_s
  doc = Nokogiri::HTML(response3.to_s)
else
  logger.fatal "Didn't get redirected from search. Exiting."
  exit 1
end

rows = doc.search("table.display_table tr")
logger.info "Found #{rows.size - 1} applications in search results."

app_defaults = {
  'la_name' => 'Merton Borough Council',
  'la_slug' => 'merton',
  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
  'date_details_scraped' => nil,
  'date_documents_scraped' => nil,
  'date_dates_scraped' => nil
}
logger.debug "Application defaults: "
logger.debug app_defaults.to_s
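
# Each result row's cells are, in order: reference, address, description, status, date received,
# decision. Rows are upserted into the applications table keyed on council_reference, so an
# application already seen on a previous run is updated rather than duplicated.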
# Iterate over search results
rows.each do |row|
  if row.at("td") # skip the header row, which only has th's
    cells = row.search("td")
    ref = cells[0].inner_text.strip

    app = app_defaults.merge(
      'created_at' => Time.now.to_s,
      'uuid' => SecureRandom.uuid
    )

    begin
      res = ScraperWiki.select("* from applications where council_reference=?", ref)
    rescue
      # The applications table doesn't exist yet on the first run; carry on with no existing record.
      res = nil
    end
    app = res[0] if res && res[0] # res will be nil if the table doesn't exist; [] if that record doesn't exist

    app['council_reference'] = ref
    app['info_url'] = URI::encode(BASE_URL + cells[0].at("a")['href'].strip)
    app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
    app['address'] = cells[1].inner_text.strip
    app['description'] = cells[2].inner_text.strip
    app['status'] = cells[3].inner_text.strip

    raw_date_received = cells[4].inner_text.strip
    if raw_date_received != '--'
      app['date_received'] = Date.parse(raw_date_received)
    else
      app['date_received'] = nil
    end

    app['decision'] = cells[5].inner_text.strip
    app['date_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
end
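
# Three follow-up passes fetch the details, key dates and documents pages for each application.
# Each pass is throttled by crawl_delay and stamps a date_*_scraped column, so applications
# already handled today are skipped and everything is revisited on a later day.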
# Scrape details for all apps that don't have them
apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping details for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping details for app: #{app['council_reference']}."
  crawl_delay

  # Scrape details page
  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])
  if res.code == 200
    # Parse details page
    parsed_details = parse_details(res.to_s)
    app.merge!(parsed_details)
    app['date_details_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['info_url']} - HTTP " + res.code.to_s # FIXME improve message
  end
end
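
# parse_details (from parser.rb) presumably merges dates_url and documents_url into each
# application record; the next two passes rely on those fields being present.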
# Scrape dates page for apps that don't have them
apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping dates for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping dates for #{app['council_reference']}."
  crawl_delay

  # Scrape dates page
  res = HTTP.headers(headers).cookies(cookies).get(app['dates_url'])
  if res.code == 200
    # Parse dates page
    parsed_dates = parse_dates(res.to_s)
    app.merge!(parsed_dates)
    app['date_dates_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['dates_url']} - HTTP " + res.code.to_s
  end
end
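
# Documents go into their own 'documents' table, keyed on council_reference + url, so re-running
# the scraper updates existing document rows instead of duplicating them.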
# Scrape documents for apps that don't have them
apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping documents for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping documents for #{app['council_reference']}."
  crawl_delay

  # Scrape documents page
  res = HTTP.headers(headers).cookies(cookies).get(app['documents_url'])
  if res.code == 200
    # Parse documents page
    docs = parse_documents(res.to_s)
    docs.each do |d|
      d['council_reference'] = app['council_reference']
      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
    end
    app['documents_qty'] = docs.size
    app['date_documents_scraped'] = Date.today.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['documents_url']} - HTTP " + res.code.to_s
  end
end

logger.info "Scraper finishes. We did it."
logger.close