Sfoglia il codice sorgente

Create basic scraper

master
Adrian Short 7 anni fa
parent
commit
f5eed6d97b
4 ha cambiato i file con 68 aggiunte e 37 eliminazioni
  1. +2
    -0
      .gitignore
  2. +1
    -1
      Gemfile
  3. +17
    -18
      Gemfile.lock
  4. +48
    -18
      scraper.rb

+ 2
- 0
.gitignore Vedi File

@@ -1,2 +1,4 @@
data.sqlite # Ignore output of scraper
.ruby-gemset
.ruby-version
*.html

+ 1
- 1
Gemfile Vedi File

@@ -4,7 +4,7 @@

source "https://rubygems.org"

ruby "2.3.1"
# ruby "2.3.1"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"

+ 17
- 18
Gemfile.lock Vedi File

@@ -10,34 +10,36 @@ GIT
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
domain_name (0.5.20170404)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.6.0.1)
mechanize (2.7.3)
httpclient (2.8.3)
mechanize (2.7.5)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.5)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.2.0)
net-http-digest_auth (1.4.1)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
nokogiri (1.8.0)
mini_portile2 (~> 2.2.0)
ntlm-http (0.1.1)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)
unf_ext (0.0.7.4)
webrobots (0.1.2)

PLATFORMS
ruby
@@ -46,8 +48,5 @@ DEPENDENCIES
mechanize
scraperwiki!

RUBY VERSION
ruby 2.3.1p112

BUNDLED WITH
1.15.1

+ 48
- 18
scraper.rb Vedi File

@@ -1,25 +1,55 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
require 'bundler'
Bundler.setup
require 'scraperwiki'
require 'mechanize'
require 'pp'

# Root of the Kingston planning service. Relative "Details.aspx" links found
# on the results page are appended to this when building each record's URL.
BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/"

agent = Mechanize.new
# NOTE(review): VERIFY_NONE switches off TLS certificate verification for every
# request this agent makes. Presumably a workaround for the council's
# certificate chain -- confirm it is still required before relying on it.
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
#
# # Read in a page
# page = agent.get("http://foo.com")
# Fetch the search results page. The query string hard-codes recFrom=01/Jan/2017,
# recTo=01/Feb/2017, all wards, all application types and limit=50.
page = agent.get("https://maps.kingston.gov.uk/propertyServices/planning/Summary?weekListType=SRCH&recFrom=01/Jan/2017&recTo=01/Feb/2017&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=50")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")
# page = Nokogiri::HTML(open("page.html"))

# Every element matching the "#planningApplication" selector is handled as one
# application by the loop further down.
apps = page.search("#planningApplication")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
# Walk every application element found on the results page and upsert one row
# per application, keyed on "id", via ScraperWiki.save_sqlite.
#
# All per-application state is held in block-local variables. The earlier
# top-level instance variables survived between iterations, so an application
# lacking a Details or map link was saved with the previous application's URL.
apps.each do |app|
  title = app.at("h4").inner_text

  # The reference is the first "digits/digits/word" run inside the heading.
  reference = title[%r{\d+/\d+/\w+}]
  unless reference
    # Without a reference there is no primary key; skip rather than crash.
    warn "Skipping application without a recognisable reference: #{title.inspect}"
    next
  end
  puts reference

  url = nil
  map_url = nil
  app.search("a").each do |link|
    href = link['href']
    next unless href # anchors without an href attribute carry no URL

    href = href.strip
    url = BASEURL + href if href =~ /Details\.aspx/
    map_url = href if href =~ /\?map=/
  end
  # Log the details URL once per application, and only when one was found.
  puts url if url

  # The first four spans are read positionally: description, address, ward
  # and the text of the "valid" date.
  spans = app.search("span")
  description = spans[0].inner_text
  address = spans[1].inner_text
  ward = spans[2].inner_text
  date_text = spans[3].inner_text

  # Keep the raw text only when it cannot be parsed as a date, so unparseable
  # values are preserved instead of being lost.
  begin
    date_valid = Date.parse(date_text)
    date_valid_text = nil
  rescue ArgumentError
    date_valid = nil
    date_valid_text = date_text
  end

  ScraperWiki.save_sqlite(["id"],
                          { 'id' => reference,
                            'url' => url,
                            'title' => title,
                            'description' => description,
                            'address' => address,
                            'ward' => ward,
                            'date_valid' => date_valid,
                            'date_valid_text' => date_valid_text,
                            'map_url' => map_url })
end

Caricamento…
Annulla
Salva