diff --git a/.gitignore b/.gitignore
index 7b46ec7..ba0ddd7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 data.sqlite # Ignore output of scraper
 .ruby-gemset
+.ruby-version
+*.html
diff --git a/Gemfile b/Gemfile
index 2616a77..282d3ed 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,7 +4,7 @@
 
 source "https://rubygems.org"
 
-ruby "2.3.1"
+# ruby "2.3.1"
 
 gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
 gem "mechanize"
diff --git a/Gemfile.lock b/Gemfile.lock
index 3f475d9..05a919e 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -10,34 +10,36 @@ GIT
 GEM
   remote: https://rubygems.org/
   specs:
-    domain_name (0.5.24)
+    domain_name (0.5.20170404)
       unf (>= 0.0.5, < 1.0.0)
-    http-cookie (1.0.2)
+    http-cookie (1.0.3)
       domain_name (~> 0.5)
-    httpclient (2.6.0.1)
-    mechanize (2.7.3)
+    httpclient (2.8.3)
+    mechanize (2.7.5)
       domain_name (~> 0.5, >= 0.5.1)
       http-cookie (~> 1.0)
-      mime-types (~> 2.0)
+      mime-types (>= 1.17.2)
       net-http-digest_auth (~> 1.1, >= 1.1.1)
       net-http-persistent (~> 2.5, >= 2.5.2)
-      nokogiri (~> 1.4)
+      nokogiri (~> 1.6)
       ntlm-http (~> 0.1, >= 0.1.1)
       webrobots (>= 0.0.9, < 0.2)
-    mime-types (2.5)
-    mini_portile (0.6.2)
-    net-http-digest_auth (1.4)
+    mime-types (3.1)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2016.0521)
+    mini_portile2 (2.2.0)
+    net-http-digest_auth (1.4.1)
     net-http-persistent (2.9.4)
-    nokogiri (1.6.6.2)
-      mini_portile (~> 0.6.0)
+    nokogiri (1.8.0)
+      mini_portile2 (~> 2.2.0)
     ntlm-http (0.1.1)
-    sqlite3 (1.3.10)
-    sqlite_magic (0.0.3)
+    sqlite3 (1.3.13)
+    sqlite_magic (0.0.6)
       sqlite3
     unf (0.1.4)
       unf_ext
-    unf_ext (0.0.7.1)
-    webrobots (0.1.1)
+    unf_ext (0.0.7.4)
+    webrobots (0.1.2)
 
 PLATFORMS
   ruby
@@ -46,8 +48,5 @@ DEPENDENCIES
   mechanize
   scraperwiki!
 
-RUBY VERSION
-   ruby 2.3.1p112
-
 BUNDLED WITH
   1.15.1
diff --git a/scraper.rb b/scraper.rb
index 5799e98..1d02618 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,25 +1,55 @@
 # This is a template for a Ruby scraper on morph.io (https://morph.io)
 # including some code snippets below that you should find helpful
 
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
+require 'bundler'
+Bundler.setup
+require 'scraperwiki'
+require 'mechanize'
+require 'pp'
+
+BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/"
+
+agent = Mechanize.new
+agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
 #
 # # Read in a page
-# page = agent.get("http://foo.com")
+page = agent.get("https://maps.kingston.gov.uk/propertyServices/planning/Summary?weekListType=SRCH&recFrom=01/Jan/2017&recTo=01/Feb/2017&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=50")
 #
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
+# page = Nokogiri::HTML(open("page.html"))
+
+apps = page.search("#planningApplication")
 
-# You don't have to do things with the Mechanize or ScraperWiki libraries.
-# You can use whatever gems you want: https://morph.io/documentation/ruby
-# All that matters is that your final data is written to an SQLite database
-# called "data.sqlite" in the current working directory which has at least a table
-# called "data".
+apps.each do |app|
+  @title = app.at("h4").inner_text
+  @id = @title.match(/\d+\/\d+\/\w+/)[0]
+  puts @id
+  app.search("a").each do |link|
+    @url = BASEURL + link['href'].strip if link['href'].match(/Details\.aspx/)
+    puts @url
+    @map_url = link['href'].strip if link['href'].match(/\?map=/)
+  end
+  spans = app.search("span")
+  @description = spans[0].inner_text
+  @address = spans[1].inner_text
+  @ward = spans[2].inner_text
+
+  begin
+    @date_valid = Date.parse(spans[3].inner_text)
+    @date_valid_text = nil
+  rescue ArgumentError
+    @date_valid = nil
+    @date_valid_text = spans[3].inner_text
+  end
+
+  ScraperWiki.save_sqlite(["id"],
+    { 'id' => @id,
+      'url' => @url,
+      'title' => @title,
+      'description' => @description,
+      'address' => @address,
+      'ward' => @ward,
+      'date_valid' => @date_valid,
+      'date_valid_text' => @date_valid_text,
+      'map_url' => @map_url
+    })
+end