| @@ -13,13 +13,23 @@ module UKPlanningScraper | |||
| @agent ||= Mechanize.new | |||
| end | |||
| def get(url, &block) | |||
| puts "Getting: #{url}" | |||
| res = agent.get(url) | |||
| if res.code == '200' # That's a String not an Integer, ffs | |||
| block_given? ? block.call(res) : res | |||
| else | |||
| puts "Error: HTTP #{res.code}" | |||
| end | |||
| end | |||
| def scrape_idox(params, options) | |||
| puts "Using Idox scraper." | |||
| apps = [] | |||
| puts "Getting: #{@url}" | |||
| page = agent.get(@url) # load the search form page | |||
| page = get(@url) # load the search form page | |||
| # Check that the search form is actually present. | |||
| # When Idox has an internal error it returns an error page with HTTP 200. | |||
| @@ -107,8 +117,7 @@ module UKPlanningScraper | |||
| if next_button = page.at('a.next') | |||
| next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' | |||
| sleep options[:delay] | |||
| puts "Getting: #{next_url}" | |||
| page = agent.get(next_url) | |||
| page = get(next_url) | |||
| else | |||
| break | |||
| end | |||
| @@ -117,7 +126,7 @@ module UKPlanningScraper | |||
| # Scrape the summary tab for each app | |||
| apps.each_with_index do |app, i| | |||
| sleep options[:delay] | |||
| puts "#{i + 1} of #{apps.size}: #{app.info_url}" | |||
| puts "#{i + 1} of #{apps.size}" | |||
| parse_info_url(app) if app.info_url | |||
| parse_property_url(app) if app.property_url | |||
| @@ -127,9 +136,7 @@ module UKPlanningScraper | |||
| end # scrape_idox | |||
| def parse_info_url(app) | |||
| res = agent.get(app.info_url) | |||
| if res.code == '200' # That's a String not an Integer, ffs | |||
| get(app.info_url) do |res| | |||
| # Parse the summary tab for this app | |||
| app.scraped_at = Time.now | |||
| @@ -196,23 +203,17 @@ module UKPlanningScraper | |||
| app.property_url = base_url + property_association_link[:href] | |||
| app.property_count = property_association_link.inner_text.to_i | |||
| end | |||
| else | |||
| puts "Error: HTTP #{res.code}" | |||
| end # if | |||
| end # get | |||
| end | |||
| def parse_property_url(app) | |||
| # get URLs of property pages | |||
| app.property_detail_urls = [] | |||
| res = agent.get(app.property_url) | |||
| if res.code == '200' | |||
| get(app.property_url) do |res| | |||
| res.search('#Property li a').each do |property_link| | |||
| app.property_detail_urls << base_url + property_link[:href] | |||
| end | |||
| else | |||
| puts "Error: HTTP #{res.code}" | |||
| end | |||
| end | |||
| @@ -221,9 +222,7 @@ module UKPlanningScraper | |||
| app.properties = [] | |||
| app.property_detail_urls.each do |property_url| | |||
| res = agent.get(property_url) | |||
| if res.code == '200' | |||
| get(property_url) do |res| | |||
| property = Property.new | |||
| res.search('#propertyAddress tr').each do |row| | |||
| @@ -251,8 +250,6 @@ module UKPlanningScraper | |||
| end | |||
| app.properties << property | |||
| else | |||
| puts "Error: HTTP #{res.code}" | |||
| end | |||
| end | |||
| end | |||