| @@ -0,0 +1,9 @@ | |||||
| /.bundle/ | |||||
| /.yardoc | |||||
| /Gemfile.lock | |||||
| /_yardoc/ | |||||
| /coverage/ | |||||
| /doc/ | |||||
| /pkg/ | |||||
| /spec/reports/ | |||||
| /tmp/ | |||||
| @@ -0,0 +1,4 @@ | |||||
| source "https://rubygems.org" | |||||
| # Specify your gem's dependencies in uk_planning_scraper.gemspec | |||||
| gemspec | |||||
| @@ -0,0 +1,70 @@ | |||||
| # UK Planning Scraper | |||||
| **PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.** | |||||
| This gem scrapes planning applications data from UK council/local planning authority websites, e.g. Westminster City Council. Data is returned as an array of hashes, one hash for each planning application. | |||||
| This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data. | |||||
| Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems. | |||||
| This project is not affiliated with any organisation. | |||||
| ## Installation | |||||
| Add this line to your application's Gemfile: | |||||
| ```ruby | |||||
| gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/' | |||||
| ``` | |||||
| And then execute: | |||||
| $ bundle | |||||
| Or install it yourself as: | |||||
| $ gem install specific_install | |||||
| $ gem specific_install adrianshort/uk_planning_scraper | |||||
| ## Usage | |||||
| ```ruby | |||||
| require 'uk_planning_scraper' | |||||
| require 'date' | |||||
| require 'pp' | |||||
| # change this to the URL of the advanced search page for the council you want | |||||
| url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced' | |||||
| options = { | |||||
| delay: 10, # seconds between scrape requests; optional, defaults to 10 | |||||
| } | |||||
| params = { | |||||
| validated_from: Date.today - 30, # Must be a Date object; optional | |||||
| validated_to: Date.today, # Must be a Date object; optional | |||||
| description: 'keywords to search for', # Optional | |||||
| } | |||||
| apps = UKPlanningScraper.search(url, params, options) | |||||
| pp apps | |||||
| ``` | |||||
| Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database: | |||||
| ```ruby | |||||
| require 'scraperwiki' # Must be installed, of course | |||||
| ScraperWiki.save_sqlite([:council_reference], apps) | |||||
| ``` | |||||
| ## Development | |||||
| After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. | |||||
| To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). | |||||
| ## Contributing | |||||
| Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper. | |||||
| @@ -0,0 +1,2 @@ | |||||
| require "bundler/gem_tasks" | |||||
# No :spec task is defined in this Rakefile, so a bare `rake` would abort with
# "Don't know how to build task 'spec'". Default to the :build task, which is
# expected to come from bundler/gem_tasks (required above).
task :default => :build
| @@ -0,0 +1,14 @@ | |||||
| #!/usr/bin/env ruby | |||||
| require "bundler/setup" | |||||
| require "uk_planning_scraper" | |||||
| # You can add fixtures and/or initialization code here to make experimenting | |||||
| # with your gem easier. You can also use a different console, if you like. | |||||
| # (If you use this, don't forget to add pry to your Gemfile!) | |||||
| # require "pry" | |||||
| # Pry.start | |||||
| require "irb" | |||||
| IRB.start(__FILE__) | |||||
| @@ -0,0 +1,8 @@ | |||||
| #!/usr/bin/env bash | |||||
| set -euo pipefail | |||||
| IFS=$'\n\t' | |||||
| set -vx | |||||
| bundle install | |||||
| # Do any other automated setup that you need to do here | |||||
| @@ -0,0 +1,79 @@ | |||||
| require "uk_planning_scraper/version" | |||||
| require 'mechanize' | |||||
| require 'time' | |||||
| require 'logger' | |||||
| require 'pp' | |||||
module UKPlanningScraper
  # Runs a search against an Idox-style advanced search page and collects the
  # applications listed on every page of results.
  #
  # Progress messages are written to STDOUT; results that cannot be parsed
  # are reported on STDERR and skipped.
  #
  # @param search_url [String] URL of the council's advanced search page
  # @param criteria [Hash] search criteria. Recognised keys:
  #   :received_from, :received_to, :validated_from, :validated_to (objects
  #   responding to #strftime), :description, :applicant_name and
  #   :application_type
  # @param options [Hash] :delay is the number of seconds to sleep before
  #   requesting each further page of results (defaults to 10)
  # @return [Array<Hash>] one hash per application with the keys
  #   :council_reference, :scraped_at, :date_received, :date_validated,
  #   :info_url, :address, :description and :status
  def self.search(search_url, criteria, options = {})
    default_options = {
      delay: 10,
    }
    @options = default_options.merge(options) # The user-supplied options override the defaults
    @search_url = search_url
    # Scheme and host only; result links are appended to this.
    @base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
    apps = []
    # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
    meta_regex = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

    agent = Mechanize.new
    puts "Getting: #{@search_url}"
    page = agent.get(@search_url) # load the search form page

    # Fill out and submit search form
    form = page.form('searchCriteriaForm')
    # form.action = form.action + '&searchCriteria.resultsPerPage=100'

    # Form fields are set by sending the field name as a message to the form.
    # Some councils don't have the received from/to dates on their form, e.g. Newham
    date_fields = {
      received_from:  "date(applicationReceivedStart)",
      received_to:    "date(applicationReceivedEnd)",
      validated_from: "date(applicationValidatedStart)",
      validated_to:   "date(applicationValidatedEnd)",
    }
    date_fields.each do |key, field|
      form.send(field, criteria[key].strftime("%d/%m/%Y")) if criteria[key]
    end

    form.send(:"searchCriteria.description", criteria[:description])
    # Some councils don't have the applicant name on their form, e.g. Bexley
    form.send(:"searchCriteria.applicantName", criteria[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
    form.send(:"searchCriteria.caseType", criteria[:application_type])
    page = form.submit

    loop do
      # Parse search results
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."

      items.each do |app|
        meta = app.at("p.metaInfo")
        matches = meta && meta.inner_html.match(meta_regex)
        # A result whose metadata doesn't fit the expected layout used to
        # raise NoMethodError on nil and lose everything scraped so far.
        # Skip just that result instead.
        unless matches
          warn "Skipping a search result whose metadata could not be parsed."
          next
        end

        link = app.at('a')
        apps << {
          council_reference: matches[1].strip,
          scraped_at: Time.now,
          date_received: Date.parse(matches[2]),
          date_validated: Date.parse(matches[3]),
          info_url: @base_url + link['href'],
          address: app.at('p.address').inner_text.strip,
          description: link.inner_text.strip,
          status: matches[4].strip
        }
      end

      # Get the Next button from the pager, if there is one
      next_button = page.at('a.next')
      break unless next_button

      next_url = @base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
      sleep @options[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end
    apps
  end
end
| @@ -0,0 +1,3 @@ | |||||
module UKPlanningScraper
  # Version string of the gem. Frozen so the constant cannot be mutated in
  # place by code that reads it.
  VERSION = "0.1.0".freeze
end
| @@ -0,0 +1,35 @@ | |||||
| # coding: utf-8 | |||||
| lib = File.expand_path("../lib", __FILE__) | |||||
| $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) | |||||
| require "uk_planning_scraper/version" | |||||
Gem::Specification.new do |gem|
  # The push-host guard below relies on spec metadata, so refuse to go any
  # further when the running RubyGems does not provide it.
  unless gem.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against " \
      "public gem pushes."
  end

  gem.name     = "uk_planning_scraper"
  gem.version  = UKPlanningScraper::VERSION
  gem.authors  = ["Adrian Short"]

  gem.summary  = %q{Get planning applications data from UK council websites.}
  # gem.description = %q{TODO: Write a longer description or delete this line.}
  gem.homepage = "https://github.com/adrianshort/uk_planning_scraper/"

  # Guard against pushing this gem to RubyGems.org. Set 'allowed_push_host'
  # to permit a single host, or remove this line to permit any host.
  gem.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # Package every file tracked by git except test, spec and feature files.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.15"
  gem.add_development_dependency "rake", "~> 10.0"

  gem.add_runtime_dependency "mechanize", "~> 2.7"
end