@@ -0,0 +1,9 @@ | |||
/.bundle/ | |||
/.yardoc | |||
/Gemfile.lock | |||
/_yardoc/ | |||
/coverage/ | |||
/doc/ | |||
/pkg/ | |||
/spec/reports/ | |||
/tmp/ |
@@ -0,0 +1,4 @@ | |||
# Gems are resolved against the public RubyGems index.
source 'https://rubygems.org'

# Dependencies are declared in the project's .gemspec file rather than here.
gemspec
@@ -0,0 +1,70 @@ | |||
# UK Planning Scraper | |||
**PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.** | |||
This gem scrapes planning applications data from UK council/local planning authority websites, e.g. Westminster City Council. Data is returned as an array of hashes, one hash for each planning application.
This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data. | |||
Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems. | |||
This project is not affiliated with any organisation. | |||
## Installation | |||
Add this line to your application's Gemfile: | |||
```ruby | |||
gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/' | |||
``` | |||
And then execute: | |||
$ bundle | |||
Or install it yourself as: | |||
$ gem install specific_install | |||
$ gem specific_install adrianshort/uk_planning_scraper | |||
## Usage | |||
```ruby | |||
require 'uk_planning_scraper' | |||
require 'date' | |||
require 'pp' | |||
# change this to the URL of the advanced search page for the council you want | |||
url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced' | |||
options = { | |||
delay: 10, # seconds between scrape requests; optional, defaults to 10 | |||
} | |||
params = { | |||
validated_from: Date.today - 30, # Must be a Date object; optional | |||
validated_to: Date.today, # Must be a Date object; optional | |||
description: 'keywords to search for', # Optional | |||
} | |||
apps = UKPlanningScraper.search(url, params, options) | |||
pp apps | |||
``` | |||
Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database: | |||
```ruby | |||
require 'scraperwiki' # Must be installed, of course | |||
ScraperWiki.save_sqlite([:council_reference], apps) | |||
``` | |||
## Development | |||
After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. | |||
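To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `lib/uk_planning_scraper/version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). Note that the gemspec currently sets `allowed_push_host` to a placeholder value in order to prevent pushes to RubyGems.org, so change or remove that setting before releasing.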
## Contributing | |||
Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper. |
@@ -0,0 +1,2 @@ | |||
# Pull in the gem packaging tasks that Bundler provides.
require 'bundler/gem_tasks'

# Running plain `rake` runs the :spec task.
# NOTE(review): no :spec task is defined in this file -- confirm one is
# provided elsewhere, otherwise the default task cannot be built.
task default: :spec
@@ -0,0 +1,14 @@ | |||
#!/usr/bin/env ruby
# Opens an interactive IRB session with the gem already loaded.

require 'bundler/setup'
require 'uk_planning_scraper'
require 'irb'

# Fixtures and/or initialization code can be added here to make experimenting
# with the gem easier.
#
# A different console can be used instead of IRB, for example Pry
# (remember to add pry to the Gemfile first):
#   require "pry"
#   Pry.start

IRB.start(__FILE__)
@@ -0,0 +1,8 @@ | |||
#!/usr/bin/env bash
# Installs the project's dependencies.

# Abort on any error, on use of an unset variable, and on a failure anywhere in a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, not on spaces.
IFS=$'\n\t'
# Echo each input line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,79 @@ | |||
require "uk_planning_scraper/version" | |||
require 'mechanize' | |||
require 'time' | |||
require 'logger' | |||
require 'pp' | |||
# Scrapes planning application search results from council planning websites.
module UKPlanningScraper
  # Options used by .search unless the caller overrides them.
  DEFAULT_OPTIONS = { delay: 10 }.freeze

  # Format in which dates are typed into the search form.
  FORM_DATE_FORMAT = "%d/%m/%Y"

  # Search form date fields, keyed by the criteria key that fills each one.
  DATE_FIELDS = {
    received_from: "date(applicationReceivedStart)",
    received_to: "date(applicationReceivedEnd)",
    validated_from: "date(applicationValidatedStart)",
    validated_to: "date(applicationValidatedEnd)"
  }.freeze

  # Matches a complete metaInfo paragraph: reference, received date,
  # validated date and status, in that order.
  # Doesn't match for Newham, Greenwich, Tower Hamlets which don't have the
  # Received date in the text; parse_meta_info falls back to per-field parsing.
  META_REGEX = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

  # Runs a search on a council's advanced search page and collects every
  # result from every page of the results.
  #
  # @param search_url [String] URL of the advanced search page; must contain
  #   a scheme, a host and at least one "/" after the host
  # @param criteria [Hash] search criteria. Recognised keys: :received_from,
  #   :received_to, :validated_from, :validated_to (objects responding to
  #   #strftime), :description, :applicant_name, :application_type
  # @param options [Hash] :delay is the number of seconds to sleep between
  #   requests for successive result pages (default 10)
  # @return [Array<Hash>] one hash per application with the keys
  #   :council_reference, :scraped_at, :date_received, :date_validated,
  #   :info_url, :address, :description and :status
  # @raise [ArgumentError] if no base URL can be extracted from search_url
  def self.search(search_url, criteria, options = {})
    # Locals rather than module-level state, so calls don't affect each other.
    options = DEFAULT_OPTIONS.merge(options) # The user-supplied options override the defaults

    url_match = search_url.match(%r{(https?://.+?)/})
    raise ArgumentError, "Can't find a base URL in #{search_url.inspect}" unless url_match
    base_url = url_match[1]

    agent = Mechanize.new
    puts "Getting: #{search_url}"
    form = agent.get(search_url).form('searchCriteriaForm') # load the search form page
    fill_search_form(form, criteria)
    page = form.submit

    apps = []
    loop do
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."
      items.each { |item| apps << parse_result(item, base_url) }

      # Follow the Next button from the pager, if there is one
      next_button = page.at('a.next')
      break unless next_button

      next_url = base_url + next_button[:href]
      sleep options[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end
    apps
  end

  # Fills the search form's fields in from the criteria.
  #
  # Each field is set by sending its name to the form with the new value;
  # this relies on the form object treating such a call as an assignment to
  # the field of that name.
  def self.fill_search_form(form, criteria)
    # Some councils don't have the received from/to dates on their form, eg Newham
    DATE_FIELDS.each do |key, field_name|
      date = criteria[key]
      form.send(field_name, date.strftime(FORM_DATE_FORMAT)) if date
    end

    form.send("searchCriteria.description", criteria[:description])

    # Some councils don't have the applicant name on their form, eg Bexley
    if form.has_field?('searchCriteria.applicantName')
      form.send("searchCriteria.applicantName", criteria[:applicant_name])
    end

    form.send("searchCriteria.caseType", criteria[:application_type])
  end

  # Builds the hash for one search result list item.
  def self.parse_result(item, base_url)
    meta = parse_meta_info(item.at("p.metaInfo"))
    link = item.at('a')

    {
      council_reference: meta[:council_reference],
      scraped_at: Time.now,
      date_received: meta[:date_received],
      date_validated: meta[:date_validated],
      info_url: base_url + link['href'],
      address: item.at('p.address').inner_text.strip,
      description: link.inner_text.strip,
      status: meta[:status]
    }
  end

  # Extracts the reference, dates and status from a metaInfo paragraph.
  #
  # When the paragraph has all four fields META_REGEX is used. Otherwise the
  # paragraph's text is split on "|" into "Label: value" parts and whichever
  # fields are present are used; absent fields come back as nil.
  def self.parse_meta_info(node)
    matches = node.inner_html.match(META_REGEX)
    if matches
      return {
        council_reference: matches[1].strip,
        date_received: Date.parse(matches[2]),
        date_validated: Date.parse(matches[3]),
        status: matches[4].strip
      }
    end

    fields = {}
    node.inner_text.split('|').each do |part|
      label, value = part.split(':', 2) # limit 2 keeps any colons inside the value
      next unless value

      value = value.strip
      fields[label.strip] = value unless value.empty?
    end

    received = fields['Received']
    validated = fields['Validated']
    {
      council_reference: fields['Ref. No'],
      date_received: received && Date.parse(received),
      date_validated: validated && Date.parse(validated),
      status: fields['Status']
    }
  end

  private_class_method :fill_search_form, :parse_result, :parse_meta_info
end
@@ -0,0 +1,3 @@ | |||
module UKPlanningScraper
  # Version number of the gem.
  VERSION = '0.1.0'
end
@@ -0,0 +1,35 @@ | |||
# coding: utf-8
# Gem specification for uk_planning_scraper.

# Put lib/ on the load path so the version file can be required below.
lib_dir = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require "uk_planning_scraper/version"

Gem::Specification.new do |spec|
  spec.name     = "uk_planning_scraper"
  spec.version  = UKPlanningScraper::VERSION
  spec.authors  = ["Adrian Short"]
  spec.summary  = "Get planning applications data from UK council websites."
  # spec.description = %q{TODO: Write a longer description or delete this line.}
  spec.homepage = "https://github.com/adrianshort/uk_planning_scraper/"

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  unless spec.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  end
  spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # Package every file tracked by git except tests, specs and features.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  # Executables are whatever is packaged under exe/.
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.15"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_runtime_dependency "mechanize", "~> 2.7"
end