A Ruby gem to get planning applications data from UK council websites.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

128 lines
4.8 KiB

  require 'date'
  require 'logger'
  require 'uri'

  require 'http'
  require 'nokogiri'
  module UKPlanningScraper
    class Authority
      private

      # Scrapes planning applications from a Northgate planning site.
      #
      # Flow: GET the search page at @url to pick up the ASP.NET hidden form
      # fields and cookies, POST the search form, follow the 302 redirect by
      # hand (with the page size raised so all results arrive on one page),
      # then parse the results table.
      #
      # @param params [Hash] search criteria. Keys read here: :keywords,
      #   :received_from, :received_to, :validated_from, :validated_to,
      #   :decided_from, :decided_to. Date values are sent via #to_s
      #   (YYYY-MM-DD for Date objects).
      # @param options [Hash] accepted for interface compatibility; not used
      #   by this scraper.
      # @return [Array<Application>] one entry per data row in the results table.
      # @raise [RuntimeError] if the search page does not return HTTP 200, if
      #   it lacks the ASP.NET hidden fields, or if the search POST does not
      #   answer with a 302 redirect.
      def scrape_northgate(params, options)
        puts "Using Northgate scraper."
        base_url = @url.match(%r{(https?://.+?)/})[1]

        # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
        generic_url = @url.match(%r{.+/})[0] + 'Generic/'

        $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
        logger = Logger.new($stdout)
        logger.level = Logger::DEBUG

        form_vars = {
          'csbtnSearch' => 'Search' # required
        }
        form_vars['txtProposal'] = params[:keywords]

        # The form carries a single date range, so the filters are applied in
        # this order and a later one overrides an earlier one
        # (received < validated < decided).
        date_filters = {
          'DATE_RECEIVED' => [params[:received_from], params[:received_to]],
          'DATE_VALID' => [params[:validated_from], params[:validated_to]],
          'DATE_DECISION' => [params[:decided_from], params[:decided_to]]
        }
        date_filters.each do |date_type, (from, to)|
          next unless from || to

          form_vars['cboSelectDateValue'] = date_type
          form_vars['rbGroup'] = 'rbRange'
          form_vars['dateStart'] = from.to_s if from # YYYY-MM-DD
          form_vars['dateEnd'] = to.to_s if to # YYYY-MM-DD
        end

        logger.info "Form variables: #{form_vars}"

        headers = {
          'Origin' => base_url,
          'Referer' => @url
        }
        logger.debug "HTTP request headers:"
        logger.debug(headers.to_s)

        logger.debug "GET: #{@url}"
        response = HTTP.headers(headers).get(@url)
        logger.debug "Response code: HTTP #{response.code}"

        unless response.code == 200
          logger.fatal "Bad response from search page. Response code: #{response.code}."
          raise "Northgate: Bad response from search page. Response code: #{response.code}."
        end

        # The ASP.NET hidden fields must be echoed back with the search POST.
        doc = Nokogiri::HTML(response.to_s)
        asp_vars = %w[__VIEWSTATE __EVENTVALIDATION].each_with_object({}) do |field, vars|
          input = doc.at("##{field}")
          raise "Northgate: search page is missing the #{field} hidden field." unless input

          vars[field] = input['value']
        end

        cookies = response.cookies.each_with_object({}) do |cookie, jar|
          jar[cookie.name] = cookie.value
        end

        form_vars.merge!(asp_vars)

        logger.debug "POST: #{@url}"
        response2 = HTTP.headers(headers).cookies(cookies).post(@url, form: form_vars)
        logger.debug "Response code: HTTP #{response2.code}"

        unless response2.code == 302
          logger.error "Didn't get redirected from search."
          raise "Northgate: didn't get redirected from search."
        end

        # Follow the redirect manually
        # Set the page size (PS) to max so we don't have to page through search results
        location = response2.headers['Location']
        logger.debug "Location: #{location}"
        # Non-bang gsub on purpose: gsub! returns nil when 'PS=10' is absent,
        # which used to blow up the string concatenation.
        # URI::DEFAULT_PARSER.escape is what the removed URI.encode delegated to.
        results_url = URI::DEFAULT_PARSER.escape(base_url + location.gsub('PS=10', 'PS=99999'))

        logger.debug "GET: #{results_url}"
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP #{response3.code}"
        doc = Nokogiri::HTML(response3.to_s)

        rows = doc.search("table.display_table tr")
        # Skip the header row, which only has th's
        data_rows = rows.select { |row| row.at("td") }
        logger.info "Found #{data_rows.size} applications in search results."

        data_rows.map do |row|
          cells = row.search("td")

          app = Application.new
          app.scraped_at = Time.now
          app.council_reference = cells[0].inner_text.strip

          info_url = URI::DEFAULT_PARSER.escape(generic_url + cells[0].at('a')[:href].strip)
          # FIXME. Strip junk chars from URL - how can we prevent this?
          app.info_url = info_url.gsub(/%0./, '')

          app.address = cells[1].inner_text.strip
          app.description = cells[2].inner_text.strip
          app.status = cells[3].inner_text.strip

          raw_date_received = cells[4].inner_text.strip
          app.date_received = Date.parse(raw_date_received) unless raw_date_received == '--'

          app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
          app
        end
      end
    end
  end