| @@ -0,0 +1,9 @@ | |||
| /.bundle/ | |||
| /.yardoc | |||
| /Gemfile.lock | |||
| /_yardoc/ | |||
| /coverage/ | |||
| /doc/ | |||
| /pkg/ | |||
| /spec/reports/ | |||
| /tmp/ | |||
| @@ -0,0 +1,4 @@ | |||
| source "https://rubygems.org" | |||
| # Specify your gem's dependencies in uk_planning_scraper.gemspec | |||
| gemspec | |||
| @@ -0,0 +1,70 @@ | |||
| # UK Planning Scraper | |||
| **PRE-ALPHA: Only works with some Idox sites and spews a lot of stuff to STDOUT. Not for production use.** | |||
| This gem scrapes planning applications data from UK council/local planning authority websites, eg Westminster City Council. Data is returned as an array of hashes, one hash for each planning application. | |||
| This scraper gem doesn't use a database. Storing the output is up to you. It's just a convenient way to get the data. | |||
| Currently this only works for some Idox sites. The ultimate aim is to provide a consistent interface in a single gem for all variants of all planning systems: Idox Public Access, Northgate Planning Explorer, OcellaWeb, and all the one-off systems. | |||
| This project is not affiliated with any organisation. | |||
| ## Installation | |||
| Add this line to your application's Gemfile: | |||
| ```ruby | |||
| gem 'uk_planning_scraper', :git => 'https://github.com/adrianshort/uk_planning_scraper/' | |||
| ``` | |||
| And then execute: | |||
| $ bundle | |||
| Or install it yourself as: | |||
| $ gem install specific_install | |||
| $ gem specific_install adrianshort/uk_planning_scraper | |||
| ## Usage | |||
| ```ruby | |||
| require 'uk_planning_scraper' | |||
| require 'date' | |||
| require 'pp' | |||
| # change this to the URL of the advanced search page for the council you want | |||
| url = 'https://planning.anytown.gov.uk/online-applications/search.do?action=advanced' | |||
| options = { | |||
| delay: 10, # seconds between scrape requests; optional, defaults to 10 | |||
| } | |||
| params = { | |||
| validated_from: Date.today - 30, # Must be a Date object; optional | |||
| validated_to: Date.today, # Must be a Date object; optional | |||
| description: 'keywords to search for', # Optional | |||
| } | |||
| apps = UKPlanningScraper.search(url, params, options) | |||
| pp apps | |||
| ``` | |||
| Try [ScraperWiki](https://github.com/openaustralia/scraperwiki-ruby) if you want a quick and easy way to throw the results into an SQLite database: | |||
| ```ruby | |||
| require 'scraperwiki' # Must be installed, of course | |||
| ScraperWiki.save_sqlite([:council_reference], apps) | |||
| ``` | |||
| ## Development | |||
| After checking out the repo, run `bin/setup` to install dependencies. You can also run `bin/console` for an interactive prompt that will allow you to experiment. | |||
| To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). | |||
| ## Contributing | |||
| Bug reports and pull requests are welcome on GitHub at https://github.com/adrianshort/uk_planning_scraper. | |||
| @@ -0,0 +1,2 @@ | |||
# Loads Bundler's gem tasks (build, install, release).
require "bundler/gem_tasks"

# The previous default depended on a :spec task that nothing in this Rakefile
# defines, so a bare `rake` aborted. Point the default at the `build` task
# that bundler/gem_tasks provides instead.
task default: :build
| @@ -0,0 +1,14 @@ | |||
#!/usr/bin/env ruby

# Opens an IRB session with the gem already loaded, for experimenting.
#
# Add fixtures or other initialisation code below if it helps. To use Pry
# instead of IRB, add pry to the Gemfile and swap the last two statements for:
#   require "pry"
#   Pry.start

require "bundler/setup"
require "uk_planning_scraper"
require "irb"

IRB.start(__FILE__)
| @@ -0,0 +1,8 @@ | |||
#!/usr/bin/env bash
# Prepares a fresh checkout for development by installing the gem's dependencies.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -o errexit -o nounset -o pipefail
# Split words on newlines and tabs only, not on spaces.
IFS=$'\n\t'
# Echo each line as it is read and each command as it is executed.
set -o verbose -o xtrace

bundle install

# Add any further automated setup steps here.
| @@ -0,0 +1,79 @@ | |||
| require "uk_planning_scraper/version" | |||
| require 'mechanize' | |||
| require 'time' | |||
| require 'logger' | |||
| require 'pp' | |||
# Scrapes planning application search results from council planning websites.
#
# The only entry point is UKPlanningScraper.search, which drives the
# 'searchCriteriaForm' advanced-search form and walks the paginated results.
module UKPlanningScraper
  # Options applied by .search unless the caller overrides them.
  DEFAULT_OPTIONS = { delay: 10 }.freeze

  # Date format submitted in the search form's date fields.
  FORM_DATE_FORMAT = "%d/%m/%Y".freeze

  # Date criteria (expected to respond to #strftime) => form field name.
  FORM_DATE_FIELDS = {
    received_from: 'date(applicationReceivedStart)',
    received_to: 'date(applicationReceivedEnd)',
    validated_from: 'date(applicationValidatedStart)',
    validated_to: 'date(applicationValidatedEnd)'
  }.freeze

  # Plain-value criteria => form field name.
  FORM_TEXT_FIELDS = {
    description: 'searchCriteria.description',
    applicant_name: 'searchCriteria.applicantName',
    application_type: 'searchCriteria.caseType'
  }.freeze

  # Extracts reference, received date, validated date and status from a
  # result's meta block. It does not match sites that omit the Received date
  # from the text (Newham, Greenwich and Tower Hamlets are known examples).
  META_REGEX = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

  # Runs a search and returns every result across all result pages.
  #
  # @param search_url [String] URL of the council's advanced search page
  # @param criteria [Hash] optional keys: :received_from, :received_to,
  #   :validated_from, :validated_to (objects responding to #strftime, e.g.
  #   Date), :description, :applicant_name, :application_type
  # @param options [Hash] :delay - seconds to sleep before each request for a
  #   further results page (defaults to 10)
  # @return [Array<Hash>] one hash per application with the keys
  #   :council_reference, :scraped_at, :date_received, :date_validated,
  #   :info_url, :address, :description and :status
  # @raise [ArgumentError] if search_url has no scheme and host to build
  #   absolute links from
  # @raise [RuntimeError] if the page has no 'searchCriteriaForm' form
  def self.search(search_url, criteria, options = {})
    settings = DEFAULT_OPTIONS.merge(options) # caller-supplied options win
    base_url = base_url_for(search_url)
    agent = Mechanize.new

    puts "Getting: #{search_url}"
    form = agent.get(search_url).form('searchCriteriaForm')
    raise "No 'searchCriteriaForm' form found at #{search_url}" unless form

    fill_search_form(form, criteria)
    page = form.submit

    apps = []
    loop do
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."
      items.each { |item| apps << parse_result(item, base_url) }

      # Follow the pager's Next link until there isn't one.
      next_button = page.at('a.next')
      break unless next_button

      next_url = base_url + next_button[:href]
      sleep settings[:delay]
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end

    apps
  end

  # Returns the scheme-and-host prefix of +search_url+ (no trailing slash).
  def self.base_url_for(search_url)
    match = search_url.to_s.match(%r{(https?://.+?)/})
    unless match
      raise ArgumentError, "search_url must be an http(s) URL with a path: #{search_url.inspect}"
    end

    match[1]
  end

  # Copies the supplied criteria into the search form's fields.
  def self.fill_search_form(form, criteria)
    FORM_DATE_FIELDS.each do |key, field_name|
      date = criteria[key]
      set_form_field(form, field_name, date.strftime(FORM_DATE_FORMAT)) if date
    end

    FORM_TEXT_FIELDS.each do |key, field_name|
      set_form_field(form, field_name, criteria[key])
    end
  end

  # Sets one form field. Criteria that were not supplied are left alone, and
  # fields this council's form lacks are skipped with a warning rather than
  # raising (e.g. Newham has no received dates, Bexley no applicant name).
  def self.set_form_field(form, field_name, value)
    return if value.nil?

    unless form.has_field?(field_name)
      warn "Search form has no '#{field_name}' field; ignoring that criterion."
      return
    end

    form[field_name] = value
  end

  # Builds the hash for a single search result node. When the meta text is
  # missing or does not match META_REGEX, the reference, dates and status are
  # nil but the link, address and description are still returned.
  def self.parse_result(item, base_url)
    link = item.at('a')
    meta_node = item.at('p.metaInfo')
    meta = meta_node && meta_node.inner_html.match(META_REGEX)
    warn "Could not parse reference/dates/status for: #{link['href']}" unless meta

    {
      council_reference: meta && meta[1].strip,
      scraped_at: Time.now,
      date_received: meta && Date.parse(meta[2]),
      date_validated: meta && Date.parse(meta[3]),
      info_url: base_url + link['href'],
      address: item.at('p.address').inner_text.strip,
      description: link.inner_text.strip,
      status: meta && meta[4].strip
    }
  end

  private_class_method :base_url_for, :fill_search_form, :set_form_field, :parse_result
end
| @@ -0,0 +1,3 @@ | |||
module UKPlanningScraper
  # Gem version string; read by the gemspec. Frozen so the constant cannot be
  # mutated in place at runtime.
  VERSION = "0.1.0".freeze
end
| @@ -0,0 +1,35 @@ | |||
| # coding: utf-8 | |||
| lib = File.expand_path("../lib", __FILE__) | |||
| $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) | |||
| require "uk_planning_scraper/version" | |||
# Gem specification for uk_planning_scraper.
Gem::Specification.new do |gem|
  gem.name     = "uk_planning_scraper"
  gem.version  = UKPlanningScraper::VERSION
  gem.authors  = ["Adrian Short"]
  gem.summary  = %q{Get planning applications data from UK council websites.}
  # gem.description = %q{TODO: Write a longer description or delete this line.}
  gem.homepage = "https://github.com/adrianshort/uk_planning_scraper/"

  # Restrict where this gem may be pushed. Set 'allowed_push_host' to a single
  # permitted host, or delete this section to allow pushing to any host.
  # NOTE(review): the value below is still the generator's placeholder text.
  unless gem.respond_to?(:metadata)
    raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
  end
  gem.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"

  # Package every git-tracked file except those under test/, spec/ or features/.
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  gem.bindir        = "exe"
  gem.executables   = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.require_paths = ["lib"]

  gem.add_development_dependency "bundler", "~> 1.15"
  gem.add_development_dependency "rake", "~> 10.0"
  gem.add_runtime_dependency "mechanize", "~> 2.7"
end