@@ -0,0 +1,9 @@ | |||||
/.bundle/ | |||||
/.yardoc | |||||
/Gemfile.lock | |||||
/_yardoc/ | |||||
/coverage/ | |||||
/doc/ | |||||
/pkg/ | |||||
/spec/reports/ | |||||
/tmp/ |
@@ -0,0 +1,4 @@ | |||||
source 'https://rubygems.org'

# Pull this gem's dependencies from its gemspec (the gem is named uk_planning_scraper).
gemspec
@@ -0,0 +1,70 @@ | |||||
# UK Planning Scraper | |||||
**PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.** | |||||
This gem scrapes planning applications data from UK council/local planning authority websites, eg Westminster City Council. Data is returned as an array of hashes, one hash for each planning application. | |||||
This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data. | |||||
Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems. | |||||
This project is not affiliated with any organisation. | |||||
## Installation | |||||
Add this line to your application's Gemfile: | |||||
```ruby | |||||
gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/' | |||||
``` | |||||
And then execute: | |||||
```sh
$ bundle
```
Or install it yourself as: | |||||
```sh
$ gem install specific_install
$ gem specific_install adrianshort/uk_planning_scraper
```
## Usage | |||||
```ruby | |||||
require 'uk_planning_scraper' | |||||
require 'date' | |||||
require 'pp' | |||||
# change this to the URL of the advanced search page for the council you want | |||||
url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced' | |||||
options = { | |||||
delay: 10, # seconds between scrape requests; optional, defaults to 10 | |||||
} | |||||
params = { | |||||
validated_from: Date.today - 30, # Must be a Date object; optional | |||||
validated_to: Date.today, # Must be a Date object; optional | |||||
description: 'keywords to search for', # Optional | |||||
} | |||||
apps = UKPlanningScraper.search(url, params, options) | |||||
pp apps | |||||
``` | |||||
Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database: | |||||
```ruby | |||||
require 'scraperwiki' # Must be installed, of course | |||||
ScraperWiki.save_sqlite([:council_reference], apps) | |||||
``` | |||||
## Development | |||||
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. | |||||
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). | |||||
## Contributing | |||||
Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper. |
@@ -0,0 +1,2 @@ | |||||
# Load the gem tasks that Bundler provides.
require 'bundler/gem_tasks'

# Running bare `rake` runs the :spec task.
# NOTE(review): no :spec task is defined in this file -- confirm one is provided
# elsewhere, otherwise bare `rake` will abort.
task default: :spec
@@ -0,0 +1,14 @@ | |||||
#!/usr/bin/env ruby

# Interactive console with the gem already loaded, for experimenting.
require 'bundler/setup'
require 'uk_planning_scraper'
require 'irb'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# A different console can be used instead of IRB. For Pry, add pry to the
# Gemfile and replace the IRB call below with:
#   require "pry"
#   Pry.start

IRB.start(__FILE__)
@@ -0,0 +1,8 @@ | |||||
#!/usr/bin/env bash

# Strict mode: exit on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# Split words on newlines and tabs only, not on spaces.
IFS=$'\n\t'

# Echo each line as it is read (verbose) and each command as it runs (xtrace).
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,79 @@ | |||||
require "uk_planning_scraper/version" | |||||
require 'mechanize' | |||||
require 'time' | |||||
require 'logger' | |||||
require 'pp' | |||||
module UKPlanningScraper
  # Options applied when the caller doesn't override them.
  DEFAULT_OPTIONS = { delay: 10 }.freeze

  # Pulls the reference, received date, validated date and status out of a
  # search result's "p.metaInfo" markup.
  # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
  META_REGEX = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

  # Maps date criteria keys to the names of the search form fields they fill.
  # Some councils don't have the received from/to dates on their form, eg Newham
  DATE_FIELDS = {
    received_from: "date(applicationReceivedStart)",
    received_to: "date(applicationReceivedEnd)",
    validated_from: "date(applicationValidatedStart)",
    validated_to: "date(applicationValidatedEnd)"
  }.freeze

  # Runs an advanced search and collects every result across all result pages.
  #
  # @param search_url [String] URL of the council's advanced search page
  # @param criteria [Hash] search criteria; recognised keys are :received_from,
  #   :received_to, :validated_from, :validated_to (objects responding to
  #   #strftime), :description, :applicant_name and :application_type
  # @param options [Hash] :delay is the number of seconds to sleep between
  #   result page requests (defaults to 10)
  # @return [Array<Hash>] one hash per planning application with the keys
  #   :council_reference, :scraped_at, :date_received, :date_validated,
  #   :info_url, :address, :description and :status
  # @raise [ArgumentError] if the site root can't be extracted from search_url
  # @raise [RuntimeError] if the page has no form named "searchCriteriaForm"
  def self.search(search_url, criteria, options = {})
    @options = DEFAULT_OPTIONS.merge(options) # The user-supplied options override the defaults
    @search_url = search_url

    # Result links and pager links are site-relative, so keep the scheme + host.
    root = search_url.to_s.match(%r{(https?://.+?)/})
    raise ArgumentError, "Can't work out the site root from search URL: #{search_url.inspect}" unless root
    @base_url = root[1]

    agent = Mechanize.new
    puts "Getting: #{@search_url}"
    page = agent.get(@search_url) # load the search form page

    form = page.form('searchCriteriaForm')
    raise "No 'searchCriteriaForm' form found at #{@search_url}" unless form

    # TODO: consider appending '&searchCriteria.resultsPerPage=100' to the form
    # action and pager URLs to reduce the number of page requests.
    fill_search_form(form, criteria)
    page = form.submit

    apps = []
    loop do
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."

      items.each do |item|
        app = parse_search_result(item)
        apps << app if app
      end

      # Follow the Next button from the pager until there isn't one
      next_button = page.at('a.next')
      break unless next_button

      next_url = @base_url + next_button[:href]
      sleep @options[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end

    apps
  end

  # Copies the supplied criteria into the search form, leaving alone any field
  # the caller didn't ask to filter on.
  def self.fill_search_form(form, criteria)
    DATE_FIELDS.each do |key, field_name|
      form.send(field_name.to_sym, criteria[key].strftime("%d/%m/%Y")) if criteria[key]
    end

    form.send(:"searchCriteria.description", criteria[:description]) if criteria[:description]

    # Some councils don't have the applicant name on their form, eg Bexley
    if criteria[:applicant_name]
      if form.has_field?('searchCriteria.applicantName')
        form.send(:"searchCriteria.applicantName", criteria[:applicant_name])
      else
        warn "Search form has no applicant name field; ignoring :applicant_name"
      end
    end

    form.send(:"searchCriteria.caseType", criteria[:application_type]) if criteria[:application_type]
  end

  # Builds the data hash for one "li.searchresult" node. Returns nil (after
  # warning on STDERR) when the metaInfo text is missing or doesn't fit
  # META_REGEX, so that one odd result doesn't abort the whole scrape.
  def self.parse_search_result(item)
    meta = item.at("p.metaInfo")
    matches = meta && meta.inner_html.match(META_REGEX)
    unless matches
      warn "Skipping a search result: metaInfo doesn't match the expected " \
           "Ref. No/Received/Validated/Status layout"
      return nil
    end

    link = item.at('a')
    {
      council_reference: matches[1].strip,
      scraped_at: Time.now,
      date_received: Date.parse(matches[2]),
      date_validated: Date.parse(matches[3]),
      info_url: @base_url + link['href'],
      address: item.at('p.address').inner_text.strip,
      description: link.inner_text.strip,
      status: matches[4].strip
    }
  end

  private_class_method :fill_search_form, :parse_search_result
end
@@ -0,0 +1,3 @@ | |||||
module UKPlanningScraper
  # Version string of the gem; the gemspec reads it as the gem version.
  VERSION = '0.1.0'
end
@@ -0,0 +1,35 @@ | |||||
# coding: utf-8

# Put lib/ on the load path so the version constant can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "uk_planning_scraper/version"

Gem::Specification.new do |gem|
  gem.name     = 'uk_planning_scraper'
  gem.version  = UKPlanningScraper::VERSION
  gem.authors  = ['Adrian Short']

  gem.summary  = 'Get planning applications data from UK council websites.'
  # gem.description = %q{TODO: Write a longer description or delete this line.}
  gem.homepage = 'https://github.com/adrianshort/uk_planning_scraper/'

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # NOTE(review): the value below is still the template placeholder -- confirm
  # the intended push host before releasing.
  unless gem.respond_to?(:metadata)
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end
  gem.metadata['allowed_push_host'] = "TODO: Set to 'http://mygemserver.com'"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are whatever is tracked under exe/.
  gem.bindir        = 'exe'
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  gem.add_development_dependency 'bundler', '~> 1.15'
  gem.add_development_dependency 'rake', '~> 10.0'

  gem.add_runtime_dependency 'mechanize', '~> 2.7'
end