| @@ -1,6 +1,7 @@ | |||||
| require 'http' | require 'http' | ||||
| require 'nokogiri' | require 'nokogiri' | ||||
| require 'logger' | require 'logger' | ||||
| require 'uri' | |||||
| module UKPlanningScraper | module UKPlanningScraper | ||||
| class Authority | class Authority | ||||
| @@ -103,9 +104,13 @@ module UKPlanningScraper | |||||
| if response2.code == 302 | if response2.code == 302 | ||||
| # Follow the redirect manually | # Follow the redirect manually | ||||
| # Set the page size (PS) to max so we don't have to page through search results | # Set the page size (PS) to max so we don't have to page through search results | ||||
| logger.debug "Base URL: #{base_url}" | |||||
| logger.debug "Location: #{response2.headers['Location']}" | logger.debug "Location: #{response2.headers['Location']}" | ||||
| results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999')) | |||||
| logger.debug "GET: " + results_url | |||||
| location = response2.headers['Location'].gsub!('PS=10', 'PS=99999') | |||||
| results_url = URI(base_url + location) | |||||
| logger.debug "GET: " + results_url.to_s | |||||
| response3 = HTTP.headers(headers).cookies(cookies).get(results_url) | response3 = HTTP.headers(headers).cookies(cookies).get(results_url) | ||||
| logger.debug "Response code: HTTP " + response3.code.to_s | logger.debug "Response code: HTTP " + response3.code.to_s | ||||
| doc = Nokogiri::HTML(response3.to_s) | doc = Nokogiri::HTML(response3.to_s) | ||||
| @@ -125,8 +130,9 @@ module UKPlanningScraper | |||||
| app = Application.new | app = Application.new | ||||
| app.scraped_at = Time.now | app.scraped_at = Time.now | ||||
| app.council_reference = cells[0].inner_text.strip | app.council_reference = cells[0].inner_text.strip | ||||
| app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip) | |||||
| app.info_url = generic_url + cells[0].at('a')[:href].strip | |||||
| app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this? | app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this? | ||||
| app.info_url = URI(app.info_url).to_s | |||||
| app.address = cells[1].inner_text.strip | app.address = cells[1].inner_text.strip | ||||
| app.description = cells[2].inner_text.strip | app.description = cells[2].inner_text.strip | ||||
| app.status = cells[3].inner_text.strip | app.status = cells[3].inner_text.strip | ||||