From bdd17f0cf45bdb5a077ff8f9ca718b19e00aeb48 Mon Sep 17 00:00:00 2001
From: Adrian Short
Date: Thu, 20 Sep 2018 16:53:00 +0100
Subject: [PATCH] First commit

---
 Gemfile      |  4 ++--
 Gemfile.lock | 59 +++++++++++++++++++++++++++++++++++++++-------------
 README.md    |  4 +++-
 scraper.rb   | 28 ++++---------------------
 4 files changed, 53 insertions(+), 42 deletions(-)

diff --git a/Gemfile b/Gemfile
index 6ab45dc..6f67b5a 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,7 +4,7 @@
 
 source "https://rubygems.org"
 
-ruby "2.0.0"
+ruby "2.3.1"
 
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
-gem "mechanize"
+gem "uk_planning_scraper", git: "https://github.com/adrianshort/uk_planning_scraper.git", ref: "6d72d25"
diff --git a/Gemfile.lock b/Gemfile.lock
index 30fb5f3..dd0c31f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,3 +1,12 @@
+GIT
+  remote: https://github.com/adrianshort/uk_planning_scraper.git
+  revision: 6d72d251665941b56daaefa33d463b4f590b4ace
+  ref: 6d72d25
+  specs:
+    uk_planning_scraper (0.2.0)
+      http
+      mechanize (~> 2.7)
+
 GIT
   remote: https://github.com/openaustralia/scraperwiki-ruby.git
   revision: fc50176812505e463077d5c673d504a6a234aa78
@@ -10,38 +19,58 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    addressable (2.5.2)
+      public_suffix (>= 2.0.2, < 4.0)
+    connection_pool (2.2.2)
+    domain_name (0.5.20180417)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http (3.3.0)
+      addressable (~> 2.3)
+      http-cookie (~> 1.0)
+      http-form_data (~> 2.0)
+      http_parser.rb (~> 0.6.0)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
+    http-form_data (2.1.1)
+    http_parser.rb (0.6.0)
     httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    mechanize (2.7.6)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
-      net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
-    net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    mime-types (3.2.2)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2018.0812)
+    mini_portile2 (2.3.0)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (3.0.0)
+      connection_pool (~> 2.2)
+    nokogiri (1.8.4)
+      mini_portile2 (~> 2.3.0)
     ntlm-http (0.1.1)
+    public_suffix (3.0.3)
     sqlite3 (1.3.10)
     sqlite_magic (0.0.3)
       sqlite3
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.5)
+    webrobots (0.1.2)
 
 PLATFORMS
   ruby
 
 DEPENDENCIES
-  mechanize
   scraperwiki!
+  uk_planning_scraper!
+
+RUBY VERSION
+   ruby 2.3.1p112
+
+BUNDLED WITH
+   1.15.4
diff --git a/README.md b/README.md
index e541894..d87d340 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,3 @@
-This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation)
\ No newline at end of file
+This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation).
+
+This exists to enable [uk_planning_scraper](https://github.com/adrianshort/uk_planning_scraper) to be tested in the Morph environment, giving us some degree of [dev/prod parity](https://12factor.net/dev-prod-parity).
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
index 5799e98..b35f20d 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,25 +1,5 @@
-# This is a template for a Ruby scraper on morph.io (https://morph.io)
-# including some code snippets below that you should find helpful
+require 'uk_planning_scraper'
+require 'scraperwiki'
 
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
-
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+apps = UKPlanningScraper::Authority.named(ENV['MORPH_AUTHORITY_NAME']).scrape({ validated_days: ENV['MORPH_DAYS'].to_i })
+ScraperWiki.save_sqlite([:authority_name, :council_reference], apps)
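The committed scraper.rb is driven entirely by two Morph environment variables, MORPH_AUTHORITY_NAME and MORPH_DAYS. A minimal sketch of running the same scrape locally, outside Morph, follows; the authority name "Westminster" and the 14-day window are assumed example values, not part of this commit, and the file name local_run.rb is hypothetical.

# local_run.rb -- hypothetical local-run sketch, not part of this commit.
# Supplies the two MORPH_* variables that scraper.rb expects from the Morph
# environment, then performs the same scrape and writes data.sqlite to the
# current working directory.
require 'uk_planning_scraper'
require 'scraperwiki'

ENV['MORPH_AUTHORITY_NAME'] ||= 'Westminster' # assumed example authority
ENV['MORPH_DAYS']           ||= '14'          # assumed example look-back window in days

apps = UKPlanningScraper::Authority
         .named(ENV['MORPH_AUTHORITY_NAME'])
         .scrape({ validated_days: ENV['MORPH_DAYS'].to_i })

# Same composite key as scraper.rb: one row per application per authority.
ScraperWiki.save_sqlite([:authority_name, :council_reference], apps)

puts "Saved #{apps.length} applications to data.sqlite"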