From 19958d3d1556a0a7998115eb2587725eaabc83c6 Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Mon, 26 Jun 2017 11:53:47 +0100 Subject: [PATCH] Add template for morph.io scraper --- .gitignore | 2 ++ Gemfile | 10 ++++++++++ Gemfile.lock | 47 +++++++++++++++++++++++++++++++++++++++++++++++ README.md | 1 + scraper.rb | 25 +++++++++++++++++++++++++ 5 files changed, 85 insertions(+) create mode 100644 .gitignore create mode 100644 Gemfile create mode 100644 Gemfile.lock create mode 100644 README.md create mode 100644 scraper.rb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..66d464d --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# Ignore output of scraper +data.sqlite diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..6ab45dc --- /dev/null +++ b/Gemfile @@ -0,0 +1,10 @@ +# It's easy to add more libraries or choose different versions. Any libraries +# specified here will be installed and made available to your morph.io scraper. +# Find out more: https://morph.io/documentation/ruby + +source "https://rubygems.org" + +ruby "2.0.0" + +gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults" +gem "mechanize" diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..30fb5f3 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,47 @@ +GIT + remote: https://github.com/openaustralia/scraperwiki-ruby.git + revision: fc50176812505e463077d5c673d504a6a234aa78 + branch: morph_defaults + specs: + scraperwiki (3.0.1) + httpclient + sqlite_magic + +GEM + remote: https://rubygems.org/ + specs: + domain_name (0.5.24) + unf (>= 0.0.5, < 1.0.0) + http-cookie (1.0.2) + domain_name (~> 0.5) + httpclient (2.6.0.1) + mechanize (2.7.3) + domain_name (~> 0.5, >= 0.5.1) + http-cookie (~> 1.0) + mime-types (~> 2.0) + net-http-digest_auth (~> 1.1, >= 1.1.1) + net-http-persistent (~> 2.5, >= 2.5.2) + nokogiri (~> 1.4) + ntlm-http (~> 0.1, >= 0.1.1) + webrobots (>= 0.0.9, < 0.2) + mime-types (2.5) + 
mini_portile (0.6.2) + net-http-digest_auth (1.4) + net-http-persistent (2.9.4) + nokogiri (1.6.6.2) + mini_portile (~> 0.6.0) + ntlm-http (0.1.1) + sqlite3 (1.3.10) + sqlite_magic (0.0.3) + sqlite3 + unf (0.1.4) + unf_ext + unf_ext (0.0.7.1) + webrobots (0.1.1) + +PLATFORMS + ruby + +DEPENDENCIES + mechanize + scraperwiki! diff --git a/README.md b/README.md new file mode 100644 index 0000000..e541894 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +This is a scraper that runs on [Morph](https://morph.io). To get started [see the documentation](https://morph.io/documentation) \ No newline at end of file diff --git a/scraper.rb b/scraper.rb new file mode 100644 index 0000000..5799e98 --- /dev/null +++ b/scraper.rb @@ -0,0 +1,25 @@ +# This is a template for a Ruby scraper on morph.io (https://morph.io) +# including some code snippets below that you should find helpful + +# require 'scraperwiki' +# require 'mechanize' +# +# agent = Mechanize.new +# +# # Read in a page +# page = agent.get("http://foo.com") +# +# # Find something on the page using css selectors +# p page.at('div.content') +# +# # Write out to the sqlite database using scraperwiki library +# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"}) +# +# # An arbitrary query against the database +# ScraperWiki.select("* from data where name='peter'") + +# You don't have to do things with the Mechanize or ScraperWiki libraries. +# You can use whatever gems you want: https://morph.io/documentation/ruby +# All that matters is that your final data is written to an SQLite database +# called "data.sqlite" in the current working directory which has at least a table +# called "data".