Browse Source

Extract common Mechanize error handling

pull/38/head
Graeme Porteous 5 years ago
parent
commit
9b86779fe7
1 changed files with 18 additions and 21 deletions
  1. +18
    -21
      lib/uk_planning_scraper/idox.rb

+ 18
- 21
lib/uk_planning_scraper/idox.rb View File

@@ -13,13 +13,23 @@ module UKPlanningScraper
@agent ||= Mechanize.new @agent ||= Mechanize.new
end end


# Fetch +url+ through the shared agent, logging the request to stdout.
#
# On an HTTP 200 response the page is handed to the given block and the
# block's result is returned; when no block is supplied the response itself
# is returned. For any other status an error line is printed and nil is
# returned, so callers using the block form simply skip their parsing.
#
# @param url [String] address to request
# @yieldparam response the object returned by +agent.get+
# @return [Object, nil] block result or response on success, nil otherwise
def get(url, &block)
  puts "Getting: #{url}"
  response = agent.get(url)

  # The status code is compared as a String ('200'), not an Integer.
  unless response.code == '200'
    puts "Error: HTTP #{response.code}"
    return nil
  end

  return response unless block

  block.call(response)
end

def scrape_idox(params, options) def scrape_idox(params, options)
puts "Using Idox scraper." puts "Using Idox scraper."


apps = [] apps = []


puts "Getting: #{@url}"
page = agent.get(@url) # load the search form page
page = get(@url) # load the search form page


# Check that the search form is actually present. # Check that the search form is actually present.
# When Idox has an internal error it returns an error page with HTTP 200. # When Idox has an internal error it returns an error page with HTTP 200.
@@ -107,8 +117,7 @@ module UKPlanningScraper
if next_button = page.at('a.next') if next_button = page.at('a.next')
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
sleep options[:delay] sleep options[:delay]
puts "Getting: #{next_url}"
page = agent.get(next_url)
page = get(next_url)
else else
break break
end end
@@ -117,7 +126,7 @@ module UKPlanningScraper
# Scrape the summary tab for each app # Scrape the summary tab for each app
apps.each_with_index do |app, i| apps.each_with_index do |app, i|
sleep options[:delay] sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
puts "#{i + 1} of #{apps.size}"


parse_info_url(app) if app.info_url parse_info_url(app) if app.info_url
parse_property_url(app) if app.property_url parse_property_url(app) if app.property_url
@@ -127,9 +136,7 @@ module UKPlanningScraper
end # scrape_idox end # scrape_idox


def parse_info_url(app) def parse_info_url(app)
res = agent.get(app.info_url)

if res.code == '200' # That's a String not an Integer, ffs
get(app.info_url) do |res|
# Parse the summary tab for this app # Parse the summary tab for this app


app.scraped_at = Time.now app.scraped_at = Time.now
@@ -196,23 +203,17 @@ module UKPlanningScraper
app.property_url = base_url + property_association_link[:href] app.property_url = base_url + property_association_link[:href]
app.property_count = property_association_link.inner_text.to_i app.property_count = property_association_link.inner_text.to_i
end end
else
puts "Error: HTTP #{res.code}"
end # if
end # get
end end


def parse_property_url(app) def parse_property_url(app)
# get URLs of property pages # get URLs of property pages
app.property_detail_urls = [] app.property_detail_urls = []


res = agent.get(app.property_url)

if res.code == '200'
get(app.property_url) do |res|
res.search('#Property li a').each do |property_link| res.search('#Property li a').each do |property_link|
app.property_detail_urls << base_url + property_link[:href] app.property_detail_urls << base_url + property_link[:href]
end end
else
puts "Error: HTTP #{res.code}"
end end
end end


@@ -221,9 +222,7 @@ module UKPlanningScraper
app.properties = [] app.properties = []


app.property_detail_urls.each do |property_url| app.property_detail_urls.each do |property_url|
res = agent.get(property_url)

if res.code == '200'
get(property_url) do |res|
property = Property.new property = Property.new


res.search('#propertyAddress tr').each do |row| res.search('#propertyAddress tr').each do |row|
@@ -251,8 +250,6 @@ module UKPlanningScraper
end end


app.properties << property app.properties << property
else
puts "Error: HTTP #{res.code}"
end end
end end
end end


Loading…
Cancel
Save