Browse Source

Create basic scraper

master
Adrian Short 6 years ago
parent
commit
f5eed6d97b
4 changed files with 68 additions and 37 deletions
  1. +2
    -0
      .gitignore
  2. +1
    -1
      Gemfile
  3. +17
    -18
      Gemfile.lock
  4. +48
    -18
      scraper.rb

+ 2
- 0
.gitignore View File

@@ -1,2 +1,4 @@
data.sqlite # Ignore output of scraper
.ruby-gemset
.ruby-version
*.html

+ 1
- 1
Gemfile View File

@@ -4,7 +4,7 @@

source "https://rubygems.org"

ruby "2.3.1"
# ruby "2.3.1"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"

+ 17
- 18
Gemfile.lock View File

@@ -10,34 +10,36 @@ GIT
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
domain_name (0.5.20170404)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.6.0.1)
mechanize (2.7.3)
httpclient (2.8.3)
mechanize (2.7.5)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.5)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.2.0)
net-http-digest_auth (1.4.1)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
nokogiri (1.8.0)
mini_portile2 (~> 2.2.0)
ntlm-http (0.1.1)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)
unf_ext (0.0.7.4)
webrobots (0.1.2)

PLATFORMS
ruby
@@ -46,8 +48,5 @@ DEPENDENCIES
mechanize
scraperwiki!

RUBY VERSION
ruby 2.3.1p112

BUNDLED WITH
1.15.1

+ 48
- 18
scraper.rb View File

@@ -1,25 +1,55 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
require 'bundler'
Bundler.setup
require 'scraperwiki'
require 'mechanize'
require 'pp'

# Base URL for Kingston's planning pages; the Details links in the search
# results are relative, so they are prefixed with this when saved.
BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/"

agent = Mechanize.new
# NOTE(review): TLS certificate verification is disabled here — presumably the
# council server has a broken certificate chain; confirm that is still the
# case before keeping this, as it allows man-in-the-middle interception.
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
#
# # Read in a page
# page = agent.get("http://foo.com")
# Fetch one search-results page. NOTE: the date range (01/Jan/2017 to
# 01/Feb/2017) and the 50-row limit are hard-coded into this URL, so the
# scraper only ever covers that fixed window.
page = agent.get("https://maps.kingston.gov.uk/propertyServices/planning/Summary?weekListType=SRCH&recFrom=01/Jan/2017&recTo=01/Feb/2017&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=50")
#
# # Find somehing on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")
# page = Nokogiri::HTML(open("page.html"))

# Each planning application on the results page sits in a #planningApplication
# node. NOTE(review): #... is an id selector, so duplicate ids in the markup
# are relied on here — verify Nokogiri returns all of them, not just the first.
apps = page.search("#planningApplication")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
# Date.parse is used below; mechanize/scraperwiki happen to load 'date'
# indirectly, but require it explicitly so the loop doesn't depend on that.
require 'date'

# Extract one record per planning application and upsert it into the
# "data" table keyed on the planning reference ("id").
apps.each do |app|
  title = app.at("h4").inner_text
  # Planning reference, e.g. 17/12345/FUL, embedded in the heading text.
  id = title.match(/\d+\/\d+\/\w+/)[0]
  puts id

  # Reset per-application so a missing Details/map link cannot silently
  # inherit the URL from the previous application (the original used
  # instance variables that carried over between iterations).
  url = nil
  map_url = nil
  app.search("a").each do |link|
    href = link['href']
    next unless href # anchors without an href would raise NoMethodError
    url = BASEURL + href.strip if href.match(/Details\.aspx/)
    map_url = href.strip if href.match(/\?map=/)
  end
  puts url

  # The four spans are, in page order: description, address, ward, date.
  spans = app.search("span")
  description = spans[0].inner_text
  address = spans[1].inner_text
  ward = spans[2].inner_text

  # Keep the raw text when the date cell isn't parseable, so no data is lost.
  begin
    date_valid = Date.parse(spans[3].inner_text)
    date_valid_text = nil
  rescue ArgumentError
    date_valid = nil
    date_valid_text = spans[3].inner_text
  end

  ScraperWiki.save_sqlite(["id"],
    { 'id' => id,
      'url' => url,
      'title' => title,
      'description' => description,
      'address' => address,
      'ward' => ward,
      'date_valid' => date_valid,
      'date_valid_text' => date_valid_text,
      'map_url' => map_url
    })
end

Loading…
Cancel
Save