A Ruby gem to get planning applications data from UK council websites.
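A quick usage sketch before the code: this file is one of the gem's site-specific scrapers, used for councils whose planning sites run Northgate software. The example below is a minimal sketch only; it assumes the gem's public entry points are Authority.named and #scrape (treat those names, the require path and the authority name as assumptions). The params keys match the ones read by scrape_northgate in the file that follows.

require 'date'
require 'uk_planning_scraper' # assumed require path for this gem

# Illustrative only: search a hypothetical Northgate-backed authority for
# applications matching a keyword and decided in the last 30 days.
authority = UKPlanningScraper::Authority.named('Example Borough') # assumed public lookup
apps = authority.scrape(
  keywords:     'extension',
  decided_from: Date.today - 30,
  decided_to:   Date.today
)
apps.each { |app| puts "#{app[:council_reference]} #{app[:address]}" }
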
require 'http'
require 'nokogiri'
require 'logger'
require 'date' # Date.parse is used to parse the 'date received' column

module UKPlanningScraper
  class Authority
    private

    def scrape_northgate(params, options)
      puts "Using Northgate scraper."

      # Scheme and host only, eg 'https://planning.example.gov.uk'
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
      generic_url = @url.match(/.+\//)[0] + 'Generic/'

      apps = []

      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
      logger = Logger.new($stdout)
      logger.level = Logger::DEBUG

      date_regex = /\d{2}-\d{2}-\d{4}/

      form_vars = {
        'csbtnSearch' => 'Search' # required
      }
      form_vars['txtProposal'] = params[:keywords]

      # Date received from and to
      if params[:received_from] || params[:received_to]
        form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
      end

      # Date validated from and to
      if params[:validated_from] || params[:validated_to]
        form_vars['cboSelectDateValue'] = 'DATE_VALID'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
      end

      # Date decided from and to
      if params[:decided_from] || params[:decided_to]
        form_vars['cboSelectDateValue'] = 'DATE_DECISION'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
      end

      # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']
      logger.info "Form variables: #{form_vars}"

      headers = {
        'Origin' => base_url,
        'Referer' => @url,
      }

      logger.debug "HTTP request headers:"
      logger.debug(headers.to_s)

      logger.debug "GET: " + @url
      response = HTTP.headers(headers).get(@url)

      logger.debug "Response code: HTTP " + response.code.to_s

      if response.code == 200
        doc = Nokogiri::HTML(response.to_s)

        # ASP.NET hidden form fields that must be posted back with the search form
        asp_vars = {
          '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
          '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
        }
      else
        logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
        exit 1
      end
      # Carry the session cookies from the GET over to the search POST
      cookies = {}
      response.cookies.each { |c| cookies[c.name] = c.value }

      form_vars.merge!(asp_vars)

      logger.debug "POST: " + @url
      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)

      logger.debug "Response code: HTTP " + response2.code.to_s

      if response2.code == 302
        # Follow the redirect manually
        # Set the page size (PS) to max so we don't have to page through search results
        logger.debug "Location: #{response2.headers['Location']}"

        # Use gsub rather than gsub! so we still get a string back if 'PS=10' isn't present
        results_url = URI::encode(base_url + response2.headers['Location'].gsub('PS=10', 'PS=99999'))

        logger.debug "GET: " + results_url
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)

        logger.debug "Response code: HTTP " + response3.code.to_s
        doc = Nokogiri::HTML(response3.to_s)
      else
        logger.fatal "Didn't get redirected from search. Exiting."
        exit 1
      end
      rows = doc.search("table.display_table tr")
      logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

      # Iterate over search results
      rows.each do |row|
        if row.at("td") # skip header row which only has th's
          cells = row.search("td")
          ref = cells[0].inner_text.strip

          app = {
            scraped_at: Time.now,
            # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
          }

          app[:council_reference] = ref
          app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
          app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
          app[:address] = cells[1].inner_text.strip
          app[:description] = cells[2].inner_text.strip
          app[:status] = cells[3].inner_text.strip

          raw_date_received = cells[4].inner_text.strip
          if raw_date_received != '--'
            app[:date_received] = Date.parse(raw_date_received)
          else
            app[:date_received] = nil
          end

          app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

          apps << app
        end
      end

      apps
    end
  end
end
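
For reference, each element of the apps array returned by scrape_northgate is a flat hash. A sketch of its shape follows; the field names come from the code above, but every value is made up for illustration.

require 'date'

# Illustrative result shape only - all values here are invented.
app = {
  scraped_at:        Time.now,
  council_reference: '18/01234/FUL',
  info_url:          'https://planning.example.gov.uk/Generic/ApplicationDetails.aspx?id=12345', # hypothetical /Generic/ URL
  address:           '1 Example Street, Exampletown EX1 2AB',
  description:       'Single storey rear extension',
  status:            'Decided',
  date_received:     Date.parse('2018-01-02'),
  decision:          'Approve' # missing for councils without a decision column, eg Hackney
}
puts app[:council_reference]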