commit 7f48783e71195a884ae6c2ea80f28decaa2d9530 Author: Adrian Short Date: Fri Sep 14 10:11:14 2018 +0100 First commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0cb6eeb --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +/.bundle/ +/.yardoc +/Gemfile.lock +/_yardoc/ +/coverage/ +/doc/ +/pkg/ +/spec/reports/ +/tmp/ diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..15791ba --- /dev/null +++ b/Gemfile @@ -0,0 +1,4 @@ +source "https://rubygems.org" + +# Specify your gem's dependencies in uk_planning_scraper.gemspec +gemspec diff --git a/README.md b/README.md new file mode 100644 index 0000000..da321ef --- /dev/null +++ b/README.md @@ -0,0 +1,70 @@ +# UK Planning Scraper + +**PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.** + +This gem scrapes planning applications data from UK council/local planning authority websites, eg Westminster City Council. Data is returned as an array of hashes, one hash for each planning application. + +This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data. + +Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems. + +This project is not affiliated with any organisation. 
+ +## Installation + +Add this line to your application's Gemfile: + +```ruby +gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/' +``` + +And then execute: + + $ bundle + +Or install it yourself as: + + $ gem install specific_install + $ gem specific_install adrianshort/uk_planning_scraper + +## Usage + +```ruby +require 'uk_planning_scraper' +require 'date' +require 'pp' + +# change this to the URL of the advanced search page for the council you want +url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced' + +options = { + delay: 10, # seconds between scrape requests; optional, defaults to 10 +} + +params = { + validated_from: Date.today - 30, # Must be a Date object; optional + validated_to: Date.today, # Must be a Date object; optional + description: 'keywords to search for', # Optional +} + +apps = UKPlanningScraper.search(url, params, options) +pp apps + +``` + +Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database: + +```ruby +require 'scraperwiki' # Must be installed, of course +ScraperWiki.save_sqlite([:council_reference], apps) +``` + +## Development + +After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. + +To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). + +## Contributing + +Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper. 
diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..43022f7 --- /dev/null +++ b/Rakefile @@ -0,0 +1,2 @@ +require "bundler/gem_tasks" +task :default => :spec diff --git a/bin/console b/bin/console new file mode 100755 index 0000000..8ec06dc --- /dev/null +++ b/bin/console @@ -0,0 +1,14 @@ +#!/usr/bin/env ruby + +require "bundler/setup" +require "uk_planning_scraper" + +# You can add fixtures and/or initialization code here to make experimenting +# with your gem easier. You can also use a different console, if you like. + +# (If you use this, don't forget to add pry to your Gemfile!) +# require "pry" +# Pry.start + +require "irb" +IRB.start(__FILE__) diff --git a/bin/setup b/bin/setup new file mode 100755 index 0000000..dce67d8 --- /dev/null +++ b/bin/setup @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' +set -vx + +bundle install + +# Do any other automated setup that you need to do here diff --git a/lib/uk_planning_scraper.rb b/lib/uk_planning_scraper.rb new file mode 100644 index 0000000..85b737c --- /dev/null +++ b/lib/uk_planning_scraper.rb @@ -0,0 +1,79 @@ +require "uk_planning_scraper/version" +require 'mechanize' +require 'time' +require 'logger' +require 'pp' + +module UKPlanningScraper + def self.search(search_url, criteria, options = {}) + default_options = { + delay: 10, + } + @options = default_options.merge(options) # The user-supplied options override the defaults + + @search_url = search_url + @base_url = search_url.match(/(https?:\/\/.+?)\//)[1] + + apps = [] + # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text + meta_regex = /Ref\. 
No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/ + + agent = Mechanize.new + puts "Getting: #{@search_url}" + page = agent.get(@search_url) # load the search form page + + + # Fill out and submit search form + form = page.form('searchCriteriaForm') + # form.action = form.action + '&searchCriteria.resultsPerPage=100' + + # Some councils don't have the received from/to dates on their form, eg Newham + form.send(:"date(applicationReceivedStart)", criteria[:received_from].strftime("%d/%m/%Y")) if criteria[:received_from] + form.send(:"date(applicationReceivedEnd)", criteria[:received_to].strftime("%d/%m/%Y")) if criteria[:received_to] + + form.send(:"date(applicationValidatedStart)", criteria[:validated_from].strftime("%d/%m/%Y")) if criteria[:validated_from] + form.send(:"date(applicationValidatedEnd)", criteria[:validated_to].strftime("%d/%m/%Y")) if criteria[:validated_to] + + form.send(:"searchCriteria\.description", criteria[:description]) + + # Some councils don't have the applicant name on their form, eg Bexley + form.send(:"searchCriteria\.applicantName", criteria[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' + form.send(:"searchCriteria\.caseType", criteria[:application_type]) + page = form.submit + + loop do + # Parse search results + items = page.search('li.searchresult') + + puts "Found #{items.size} apps on this page." 
+ + items.each do |app| + matches = app.at("p.metaInfo").inner_html.match(meta_regex) + + data = { + council_reference: matches[1].strip, + scraped_at: Time.now, + date_received: Date.parse(matches[2]), + date_validated: Date.parse(matches[3]), + info_url: @base_url + app.at('a')['href'], + address: app.at('p.address').inner_text.strip, + description: app.at('a').inner_text.strip, + status: matches[4].strip + } + + apps << data + end + + # Get the Next button from the pager, if there is one + if next_button = page.at('a.next') + next_url = @base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' + sleep @options[:delay] + puts "Getting: #{next_url}" + page = agent.get(next_url) + else + break + end + end + apps + end +end diff --git a/lib/uk_planning_scraper/version.rb b/lib/uk_planning_scraper/version.rb new file mode 100644 index 0000000..b3a61db --- /dev/null +++ b/lib/uk_planning_scraper/version.rb @@ -0,0 +1,3 @@ +module UKPlanningScraper + VERSION = "0.1.0" +end diff --git a/uk_planning_scraper.gemspec b/uk_planning_scraper.gemspec new file mode 100644 index 0000000..e8f1913 --- /dev/null +++ b/uk_planning_scraper.gemspec @@ -0,0 +1,35 @@ +# coding: utf-8 +lib = File.expand_path("../lib", __FILE__) +$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) +require "uk_planning_scraper/version" + +Gem::Specification.new do |spec| + spec.name = "uk_planning_scraper" + spec.version = UKPlanningScraper::VERSION + spec.authors = ["Adrian Short"] + + spec.summary = %q{Get planning applications data from UK council websites.} + # spec.description = %q{TODO: Write a longer description or delete this line.} + spec.homepage = "https://github.com/adrianshort/uk_planning_scraper/" + + # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host' + # to allow pushing to a single host or delete this section to allow pushing to any host. 
+ if spec.respond_to?(:metadata) + spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'" + else + raise "RubyGems 2.0 or newer is required to protect against " \ + "public gem pushes." + end + + spec.files = `git ls-files -z`.split("\x0").reject do |f| + f.match(%r{^(test|spec|features)/}) + end + spec.bindir = "exe" + spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } + spec.require_paths = ["lib"] + + spec.add_development_dependency "bundler", "~> 1.15" + spec.add_development_dependency "rake", "~> 10.0" + + spec.add_runtime_dependency "mechanize", "~> 2.7" +end