A Ruby gem to get planning applications data from UK council websites.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

135 lines
5.0 KiB

  1. require 'http'
  2. require 'nokogiri'
  3. require 'logger'
  module UKPlanningScraper
    class Authority
      private

      # Searches a Northgate Planning Explorer site and returns the matching
      # planning applications.
      #
      # The search is a three-step exchange: GET the search page to collect the
      # ASP.NET hidden fields and cookies, POST the search form, then follow the
      # resulting 302 redirect by hand with the page size raised so that all
      # results arrive on a single page. Reads @url and leaves it unmodified.
      #
      # @param params [Hash] search criteria. Keys read here: :keywords,
      #   :received_from, :received_to, :validated_from, :validated_to,
      #   :decided_from, :decided_to and :case_officer_code.
      # @param options [Hash] accepted for interface compatibility; not read by
      #   this scraper.
      # @return [Array<Application>] one entry per result row.
      # @raise [RuntimeError] if the search page does not return HTTP 200, lacks
      #   the ASP.NET hidden fields, or the search POST is not answered with a
      #   302 redirect carrying a Location header.
      def scrape_northgate(params, options)
        puts "Using Northgate scraper."
        base_url = @url.match(%r{(https?://.+?)/})[1]
        # Drop the page name from the end of the URL and add 'Generic/' (case sensitive?)
        generic_url = @url.match(%r{.+/})[0] + 'Generic/'

        $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
        logger = Logger.new($stdout)
        logger.level = Logger::DEBUG

        # Case officer searches live on a different page. Work on a local copy so
        # the authority's own @url is not rewritten for later searches.
        search_url = @url
        if params[:case_officer_code]
          search_url = @url.sub('GeneralSearch.aspx', 'CaseOfficerWorkloadSearch.aspx')
        end

        form_vars = northgate_form_vars(params)
        logger.info "Form variables: #{form_vars}"

        headers = {
          'Origin' => base_url,
          'Referer' => search_url
        }
        logger.debug "HTTP request headers:"
        logger.debug(headers.to_s)

        logger.debug "GET: #{search_url}"
        response = HTTP.headers(headers).get(search_url)
        logger.debug "Response code: HTTP #{response.code}"
        unless response.code == 200
          logger.fatal "Bad response from search page. Response code: #{response.code}."
          raise "Northgate: Bad response from search page. Response code: #{response.code}."
        end

        # The ASP.NET form is only accepted when its hidden state fields are echoed back.
        doc = Nokogiri::HTML(response.to_s)
        asp_vars = {
          '__VIEWSTATE' => northgate_hidden_field(doc, '__VIEWSTATE'),
          '__EVENTVALIDATION' => northgate_hidden_field(doc, '__EVENTVALIDATION')
        }

        cookies = {}
        response.cookies.each { |cookie| cookies[cookie.name] = cookie.value }
        form_vars.merge!(asp_vars)

        logger.debug "POST: #{search_url}"
        response2 = HTTP.headers(headers).cookies(cookies).post(search_url, form: form_vars)
        logger.debug "Response code: HTTP #{response2.code}"
        unless response2.code == 302
          logger.error "Didn't get redirected from search."
          raise "Northgate: didn't get redirected from search."
        end

        # Follow the redirect manually.
        location = response2.headers['Location']
        logger.debug "Location: #{location}"
        raise "Northgate: redirect from search had no Location header." unless location

        # Set the page size (PS) to max so we don't have to page through search
        # results. Non-destructive gsub: gsub! returns nil when 'PS=10' is absent.
        results_url = northgate_escape_url(base_url + location.gsub('PS=10', 'PS=99999'))
        logger.debug "GET: #{results_url}"
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP #{response3.code}"

        rows = Nokogiri::HTML(response3.to_s).search("table.display_table tr")
        logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

        # Skip the header row, which only has th's.
        data_rows = rows.select { |row| row.at('td') }
        data_rows.map { |row| northgate_application(row.search('td'), generic_url) }
      end

      # Builds the search form fields (without the ASP.NET state fields) from params.
      def northgate_form_vars(params)
        form_vars = {
          'csbtnSearch' => 'Search', # required
          'txtProposal' => params[:keywords]
        }

        # The form carries a single date range; when several are supplied the
        # later entries in this list overwrite the earlier ones.
        [
          [:received, 'DATE_RECEIVED'],
          [:validated, 'DATE_VALID'],
          [:decided, 'DATE_DECISION']
        ].each do |prefix, date_type|
          from = params[:"#{prefix}_from"]
          to = params[:"#{prefix}_to"]
          next unless from || to

          form_vars['cboSelectDateValue'] = date_type
          form_vars['rbGroup'] = 'rbRange'
          form_vars['dateStart'] = from.to_s if from # YYYY-MM-DD
          form_vars['dateEnd'] = to.to_s if to # YYYY-MM-DD
        end

        form_vars['cboCaseOfficerCode'] = params[:case_officer_code] if params[:case_officer_code]
        form_vars
      end

      # Returns the value of the hidden input with the given id, raising if it is absent.
      def northgate_hidden_field(doc, id)
        field = doc.at("##{id}")
        raise "Northgate: search page is missing the #{id} field." unless field

        field['value']
      end

      # Percent-encodes unsafe characters the way URI.encode did before its removal in Ruby 3.0.
      def northgate_escape_url(url)
        parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
        parser.escape(url)
      end

      # Converts the td cells of one result row into an Application.
      def northgate_application(cells, generic_url)
        app = Application.new
        app.scraped_at = Time.now
        app.council_reference = cells[0].inner_text.strip
        # FIXME. Strip junk chars (encoded control characters) from URL - how can we prevent this?
        app.info_url = northgate_escape_url(generic_url + cells[0].at('a')[:href].strip).gsub(/%0./, '')
        app.address = cells[1].inner_text.strip
        app.description = cells[2].inner_text.strip
        app.status = cells[3].inner_text.strip
        raw_date_validated = cells[4].inner_text.strip
        # '--' (or an empty cell) means there is no validated date to parse.
        app.date_validated = Date.parse(raw_date_validated) unless ['', '--'].include?(raw_date_validated)
        app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
        app
      end
    end
  end