First commit

6 years ago · 7f48783e71
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,9 @@
 /.bundle/
 /.yardoc
 /Gemfile.lock
 /_yardoc/
 /coverage/
 /doc/
 /pkg/
 /spec/reports/
 /tmp/
--- a/Gemfile
+++ b/Gemfile
@@ -0,0 +1,4 @@
 source 'https://rubygems.org'
 # Runtime and development dependencies are declared in uk_planning_scraper.gemspec
 gemspec
--- a/README.md
+++ b/README.md
@@ -0,0 +1,70 @@
 # UK Planning Scraper
 **PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.**
 This gem scrapes planning applications data from UK council/local planning authority websites, e.g. Westminster City Council. Data is returned as an array of hashes, one hash for each planning application.
 This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data.
 Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems.
 This project is not affiliated with any organisation.
 ## Installation
 Add this line to your application's Gemfile:
 ```ruby
 gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/'
 ```
 And then execute:
    $ bundle
 Or install it yourself as:
    $ gem install specific_install
    $ gem specific_install adrianshort/uk_planning_scraper
 ## Usage
 ```ruby
 require 'uk_planning_scraper'
 require 'date'
 require 'pp'
 # change this to the URL of the advanced search page for the council you want
 url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced'
 options = {
  delay: 10, # seconds between scrape requests; optional, defaults to 10
 }
 params = {
  validated_from: Date.today - 30, # Must be a Date object; optional
  validated_to: Date.today, # Must be a Date object; optional
  description: 'keywords to search for', # Optional
 }
 apps = UKPlanningScraper.search(url, params, options)
 pp apps
 ```
 Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database:
 ```ruby
 require 'scraperwiki' # Must be installed, of course
 ScraperWiki.save_sqlite([:council_reference], apps)
 ```
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
 ## Contributing
 Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper.
--- a/Rakefile
+++ b/Rakefile
@@ -0,0 +1,2 @@
 require 'bundler/gem_tasks'
 # Running `rake` with no task name runs :spec.
 # NOTE(review): no :spec task is defined in this Rakefile - confirm one is provided elsewhere.
 task default: :spec
--- a/bin/console
+++ b/bin/console
@@ -0,0 +1,14 @@
 #!/usr/bin/env ruby
 # Opens an IRB session with the gem already loaded, for interactive experiments.
 require 'bundler/setup'
 require 'uk_planning_scraper'
 # Fixtures and/or initialization code can go here to make experimenting
 # with the gem easier. A different console can be used instead, if preferred.
 # (To use Pry, add pry to the Gemfile first, then swap in the two lines below.)
 # require "pry"
 # Pry.start
 require 'irb'
 IRB.start(__FILE__)
--- a/bin/setup
+++ b/bin/setup
@@ -0,0 +1,8 @@
 #!/usr/bin/env bash
 # Development setup script: installs the gem's dependencies with Bundler.
 # Abort on any failing command (-e), on use of an unset variable (-u),
 # and when any command in a pipeline fails (pipefail).
 set -euo pipefail
 # Split words on newlines and tabs only, so values containing spaces stay intact.
 IFS=$'\n\t'
 # Echo each script line as it is read (-v) and each command as it runs (-x).
 set -vx
 bundle install
 # Do any other automated setup that you need to do here
--- a/lib/uk_planning_scraper.rb
+++ b/lib/uk_planning_scraper.rb
@@ -0,0 +1,79 @@
 require "uk_planning_scraper/version"
 require 'mechanize'
 require 'time'
 require 'logger'
 require 'pp'
 # Scrapes planning application search results from Idox-style council websites.
 module UKPlanningScraper
  # Day/month/year format submitted in the search form's date fields.
  FORM_DATE_FORMAT = "%d/%m/%Y"

  # Captures, in order: reference, received date, validated date and status
  # from the inner HTML of a result's p.metaInfo element.
  # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
  META_REGEX = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

  # Search-form date field name => criteria key that supplies its value.
  DATE_FIELDS = {
    "date(applicationReceivedStart)" => :received_from,
    "date(applicationReceivedEnd)" => :received_to,
    "date(applicationValidatedStart)" => :validated_from,
    "date(applicationValidatedEnd)" => :validated_to,
  }.freeze

  # Runs an advanced search and collects every result from every results page.
  #
  # Progress messages are printed to STDOUT; skipped results are reported on STDERR.
  #
  # @param search_url [String] URL of the council's advanced search page
  # @param criteria [Hash] search terms. Recognised keys: :received_from,
  #   :received_to, :validated_from, :validated_to (objects responding to
  #   #strftime, e.g. Date), :description, :applicant_name, :application_type
  # @param options [Hash] :delay - seconds to sleep before each further
  #   results page is requested (defaults to 10)
  # @return [Array<Hash>] one hash per application with the keys
  #   :council_reference, :scraped_at, :date_received, :date_validated,
  #   :info_url, :address, :description and :status
  # @raise [ArgumentError] if no scheme-and-host prefix can be taken from search_url
  # @raise [RuntimeError] if the page has no form named 'searchCriteriaForm'
  def self.search(search_url, criteria, options = {})
    # Locals rather than module-level instance variables, so that concurrent
    # or nested calls cannot overwrite each other's URL and options.
    options = { delay: 10 }.merge(options) # The user-supplied options override the defaults

    url_match = search_url.match(/(https?:\/\/.+?)\//)
    unless url_match
      raise ArgumentError, "search_url must be an http(s) URL with a path: #{search_url.inspect}"
    end
    base_url = url_match[1] # scheme + host; result and pager hrefs are appended to it

    agent = Mechanize.new
    puts "Getting: #{search_url}"
    page = agent.get(search_url) # load the search form page

    # Fill out and submit search form
    form = page.form('searchCriteriaForm')
    raise "No form named 'searchCriteriaForm' found at #{search_url}" unless form
    # form.action = form.action + '&searchCriteria.resultsPerPage=100'
    fill_search_form(form, criteria)
    page = form.submit

    apps = []
    loop do
      # Parse search results
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."
      items.each do |item|
        data = parse_result(item, base_url)
        apps << data if data
      end

      # Get the Next button from the pager, if there is one
      next_button = page.at('a.next')
      break unless next_button

      next_url = base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
      sleep options[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end
    apps
  end

  # Copies the search criteria into the fields of the search form.
  def self.fill_search_form(form, criteria)
    # Some councils don't have the received from/to dates on their form, eg Newham,
    # so a date field is only touched when the caller supplied a value for it.
    DATE_FIELDS.each do |field, key|
      form.send(field, criteria[key].strftime(FORM_DATE_FORMAT)) if criteria[key]
    end
    form.send(:"searchCriteria.description", criteria[:description])
    # Some councils don't have the applicant name on their form, eg Bexley
    form.send(:"searchCriteria.applicantName", criteria[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
    form.send(:"searchCriteria.caseType", criteria[:application_type])
  end

  # Builds the application hash for one li.searchresult node, or returns nil
  # (after a warning) when its metaInfo is missing or in an unrecognised layout.
  def self.parse_result(item, base_url)
    meta = item.at("p.metaInfo")
    matches = meta && meta.inner_html.match(META_REGEX)
    unless matches
      # Previously this aborted the whole scrape with a NoMethodError on nil.
      warn "Skipping a search result: its metaInfo did not match the expected format."
      return nil
    end

    link = item.at('a')
    {
      council_reference: matches[1].strip,
      scraped_at: Time.now,
      date_received: Date.parse(matches[2]),
      date_validated: Date.parse(matches[3]),
      info_url: base_url + link['href'],
      address: item.at('p.address').inner_text.strip,
      description: link.inner_text.strip,
      status: matches[4].strip
    }
  end

  private_class_method :fill_search_form, :parse_result
 end
--- a/lib/uk_planning_scraper/version.rb
+++ b/lib/uk_planning_scraper/version.rb
@@ -0,0 +1,3 @@
 # Namespace of the uk_planning_scraper gem.
 module UKPlanningScraper
  # Version string of the gem.
  VERSION = '0.1.0'
 end
--- a/uk_planning_scraper.gemspec
+++ b/uk_planning_scraper.gemspec
@@ -0,0 +1,35 @@
 # coding: utf-8
 lib = File.expand_path("../lib", __FILE__)
 $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require "uk_planning_scraper/version"
 # Gem metadata, packaged file list and dependencies for uk_planning_scraper.
 Gem::Specification.new do |gem|
  gem.name     = "uk_planning_scraper"
  gem.version  = UKPlanningScraper::VERSION
  gem.authors  = ["Adrian Short"]
  gem.summary  = %q{Get planning applications data from UK council websites.}
  # gem.description = %q{TODO: Write a longer description or delete this line.}
  gem.homepage = "https://github.com/adrianshort/uk_planning_scraper/"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  unless gem.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end
  gem.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  gem.bindir = "exe"
  # Every packaged file under exe/ is exposed as an executable, by base name.
  gem.executables = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.15"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_runtime_dependency "mechanize", "~> 2.7"
 end