From 9b86779fe7c9c5e3d720b14ee6d272e8433f9740 Mon Sep 17 00:00:00 2001 From: Graeme Porteous Date: Mon, 15 Apr 2019 11:58:27 +0100 Subject: [PATCH] Extract common Mechanize error handling --- lib/uk_planning_scraper/idox.rb | 39 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/lib/uk_planning_scraper/idox.rb b/lib/uk_planning_scraper/idox.rb index 230799e..0e10c08 100644 --- a/lib/uk_planning_scraper/idox.rb +++ b/lib/uk_planning_scraper/idox.rb @@ -13,13 +13,23 @@ module UKPlanningScraper @agent ||= Mechanize.new end + def get(url, &block) + puts "Getting: #{url}" + res = agent.get(url) + + if res.code == '200' # That's a String not an Integer, ffs + block_given? ? block.call(res) : res + else + puts "Error: HTTP #{res.code}" + end + end + def scrape_idox(params, options) puts "Using Idox scraper." apps = [] - puts "Getting: #{@url}" - page = agent.get(@url) # load the search form page + page = get(@url) # load the search form page # Check that the search form is actually present. # When Idox has an internal error it returns an error page with HTTP 200. @@ -107,8 +117,7 @@ module UKPlanningScraper if next_button = page.at('a.next') next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' sleep options[:delay] - puts "Getting: #{next_url}" - page = agent.get(next_url) + page = get(next_url) else break end @@ -117,7 +126,7 @@ module UKPlanningScraper # Scrape the summary tab for each app apps.each_with_index do |app, i| sleep options[:delay] - puts "#{i + 1} of #{apps.size}: #{app.info_url}" + puts "#{i + 1} of #{apps.size}" parse_info_url(app) if app.info_url parse_property_url(app) if app.property_url @@ -127,9 +136,7 @@ module UKPlanningScraper end # scrape_idox def parse_info_url(app) - res = agent.get(app.info_url) - - if res.code == '200' # That's a String not an Integer, ffs + get(app.info_url) do |res| # Parse the summary tab for this app app.scraped_at = Time.now @@ -196,23 +203,17 @@ module UKPlanningScraper app.property_url = base_url + property_association_link[:href] app.property_count = property_association_link.inner_text.to_i end - else - puts "Error: HTTP #{res.code}" - end # if + end # get end def parse_property_url(app) # get URLs of property pages app.property_detail_urls = [] - res = agent.get(app.property_url) - - if res.code == '200' + get(app.property_url) do |res| res.search('#Property li a').each do |property_link| app.property_detail_urls << base_url + property_link[:href] end - else - puts "Error: HTTP #{res.code}" end end @@ -221,9 +222,7 @@ module UKPlanningScraper app.properties = [] app.property_detail_urls.each do |property_url| - res = agent.get(property_url) - - if res.code == '200' + get(property_url) do |res| property = Property.new res.search('#propertyAddress tr').each do |row| @@ -251,8 +250,6 @@ module UKPlanningScraper end app.properties << property - else - puts "Error: HTTP #{res.code}" end end end