Explorar el Código

Extract common Mechanize error handling

pull/38/head
Graeme Porteous hace 5 años
padre
commit
9b86779fe7
Se ha modificado 1 fichero con 18 adiciones y 21 borrados
  1. +18
    -21
      lib/uk_planning_scraper/idox.rb

+ 18
- 21
lib/uk_planning_scraper/idox.rb Ver fichero

@@ -13,13 +13,23 @@ module UKPlanningScraper
@agent ||= Mechanize.new
end

# Fetches +url+ via `agent`, logging the request to stdout first.
#
# When the response code is '200', the response is passed to the block (if
# one was given) and the block's result is returned; with no block, the
# response itself is returned. For any other response code an error line is
# printed and nil is returned, and the block is not called.
def get(url, &block)
  puts "Getting: #{url}"
  response = agent.get(url)

  # The response code is compared as a String ('200'), not an Integer.
  unless response.code == '200'
    puts "Error: HTTP #{response.code}"
    return
  end

  return response unless block_given?

  block.call(response)
end

def scrape_idox(params, options)
puts "Using Idox scraper."

apps = []

puts "Getting: #{@url}"
page = agent.get(@url) # load the search form page
page = get(@url) # load the search form page

# Check that the search form is actually present.
# When Idox has an internal error it returns an error page with HTTP 200.
@@ -107,8 +117,7 @@ module UKPlanningScraper
if next_button = page.at('a.next')
next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
sleep options[:delay]
puts "Getting: #{next_url}"
page = agent.get(next_url)
page = get(next_url)
else
break
end
@@ -117,7 +126,7 @@ module UKPlanningScraper
# Scrape the summary tab for each app
apps.each_with_index do |app, i|
sleep options[:delay]
puts "#{i + 1} of #{apps.size}: #{app.info_url}"
puts "#{i + 1} of #{apps.size}"

parse_info_url(app) if app.info_url
parse_property_url(app) if app.property_url
@@ -127,9 +136,7 @@ module UKPlanningScraper
end # scrape_idox

def parse_info_url(app)
res = agent.get(app.info_url)

if res.code == '200' # That's a String not an Integer, ffs
get(app.info_url) do |res|
# Parse the summary tab for this app

app.scraped_at = Time.now
@@ -196,23 +203,17 @@ module UKPlanningScraper
app.property_url = base_url + property_association_link[:href]
app.property_count = property_association_link.inner_text.to_i
end
else
puts "Error: HTTP #{res.code}"
end # if
end # get
end

def parse_property_url(app)
# get URLs of property pages
app.property_detail_urls = []

res = agent.get(app.property_url)

if res.code == '200'
get(app.property_url) do |res|
res.search('#Property li a').each do |property_link|
app.property_detail_urls << base_url + property_link[:href]
end
else
puts "Error: HTTP #{res.code}"
end
end

@@ -221,9 +222,7 @@ module UKPlanningScraper
app.properties = []

app.property_detail_urls.each do |property_url|
res = agent.get(property_url)

if res.code == '200'
get(property_url) do |res|
property = Property.new

res.search('#propertyAddress tr').each do |row|
@@ -251,8 +250,6 @@ module UKPlanningScraper
end

app.properties << property
else
puts "Error: HTTP #{res.code}"
end
end
end


Cargando…
Cancelar
Guardar