A Ruby gem to get planning applications data from UK council websites.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

140 lines
5.1 KiB

  1. require 'http'
  2. require 'nokogiri'
  3. require 'logger'
  module UKPlanningScraper
    class Authority
      private

      # Scrapes planning applications from a Northgate planning portal.
      #
      # Loads the search form at @url to collect the ASP.NET hidden state fields
      # and the session cookies, posts the search, follows the resulting redirect
      # manually with the page size raised so that all results arrive on a single
      # page, and turns each row of the results table into an Application.
      #
      # Side effects: prints a banner with puts, switches $stdout to sync mode and
      # logs progress to $stdout. @url itself is never modified.
      #
      # @param params [Hash] search criteria. Keys read here: :keywords,
      #   :received_from, :received_to, :validated_from, :validated_to,
      #   :decided_from, :decided_to, :status and :case_officer_code.
      # @param options [Hash] accepted for interface compatibility; not used here.
      # @return [Array<Application>] one entry per data row of the results table.
      # @raise [RuntimeError] if the search page does not answer with HTTP 200, if
      #   the search POST is not answered with HTTP 302, or if that redirect
      #   carries no Location header.
      def scrape_northgate(params, options)
        puts "Using Northgate scraper."
        base_url = @url.match(/(https?:\/\/.+?)\//)[1]

        # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
        generic_url = @url.match(/.+\//)[0] + 'Generic/'

        $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
        logger = Logger.new($stdout)
        logger.level = Logger::DEBUG

        form_vars = northgate_form_vars(params)
        search_url = northgate_search_url(params)
        logger.info "Form variables: #{form_vars}"

        doc = northgate_fetch_results(search_url, base_url, form_vars, logger)

        # Only rows containing td cells are applications; header rows only have th's.
        # Counting these (rather than "all rows minus one") keeps the log correct
        # when the results table is missing or empty.
        rows = doc.search("table.display_table tr").select { |row| row.at("td") }
        logger.info "Found #{rows.size} applications in search results."

        rows.map { |row| northgate_application(row.search("td"), generic_url) }
      end

      # Builds the search form fields (without the ASP.NET state fields) from params.
      def northgate_form_vars(params)
        form_vars = {
          'csbtnSearch' => 'Search', # required
          'txtProposal' => params[:keywords]
        }

        # The form has a single date range. The kinds are applied in this order,
        # so when several are supplied a later kind overrides the fields set by
        # an earlier one.
        date_kinds = { received: 'DATE_RECEIVED', validated: 'DATE_VALID', decided: 'DATE_DECISION' }
        date_kinds.each do |kind, field_code|
          from = params[:"#{kind}_from"]
          to = params[:"#{kind}_to"]
          next unless from || to

          form_vars['cboSelectDateValue'] = field_code
          form_vars['rbGroup'] = 'rbRange'
          form_vars['dateStart'] = from.to_s if from # YYYY-MM-DD
          form_vars['dateEnd'] = to.to_s if to # YYYY-MM-DD
        end

        form_vars['cboStatusCode'] = params[:status] if params[:status]
        form_vars['cboCaseOfficerCode'] = params[:case_officer_code] if params[:case_officer_code]
        form_vars
      end

      # Returns the URL of the search page to use, without modifying @url.
      def northgate_search_url(params)
        return @url unless params[:case_officer_code]

        # Case officer searches use a different page. A copy is made (sub, not sub!)
        # so that later searches on this authority still start from the original URL.
        @url.sub('GeneralSearch.aspx', 'CaseOfficerWorkloadSearch.aspx')
      end

      # Performs the GET / POST / redirected GET sequence and returns the parsed results page.
      def northgate_fetch_results(search_url, base_url, form_vars, logger)
        headers = {
          'Origin' => base_url,
          'Referer' => search_url,
        }
        logger.debug "HTTP request headers:"
        logger.debug(headers.to_s)

        logger.debug "GET: #{search_url}"
        response = HTTP.headers(headers).get(search_url)
        logger.debug "Response code: HTTP #{response.code}"
        unless response.code == 200
          logger.fatal "Bad response from search page. Response code: #{response.code}."
          raise RuntimeError, "Northgate: Bad response from search page. Response code: #{response.code}."
        end

        # The hidden ASP.NET fields from the search form are sent back with the search.
        form_doc = Nokogiri::HTML(response.to_s)
        asp_vars = {
          '__VIEWSTATE' => form_doc.at('#__VIEWSTATE')['value'],
          '__EVENTVALIDATION' => form_doc.at('#__EVENTVALIDATION')['value']
        }

        cookies = {}
        response.cookies.each { |c| cookies[c.name] = c.value }

        logger.debug "POST: #{search_url}"
        response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars.merge(asp_vars))
        logger.debug "Response code: HTTP #{response2.code}"
        unless response2.code == 302
          logger.error "Didn't get redirected from search."
          raise RuntimeError, "Northgate: didn't get redirected from search."
        end

        # Follow the redirect manually
        location = response2.headers['Location']
        logger.debug "Location: #{location}"
        unless location
          logger.error "Redirect from search had no Location header."
          raise RuntimeError, "Northgate: redirect from search had no Location header."
        end

        # Set the page size (PS) to max so we don't have to page through search results.
        # gsub (not gsub!) is required: gsub! returns nil when 'PS=10' is absent.
        # URI::DEFAULT_PARSER.escape replaces URI::encode, which was removed in Ruby 3.0.
        results_url = URI::DEFAULT_PARSER.escape(base_url + location.gsub('PS=10', 'PS=99999'))
        logger.debug "GET: #{results_url}"
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP #{response3.code}"
        Nokogiri::HTML(response3.to_s)
      end

      # Builds an Application from the td cells of one results row.
      def northgate_application(cells, generic_url)
        app = Application.new
        app.scraped_at = Time.now
        app.council_reference = cells[0].inner_text.strip

        # FIXME. Strip junk chars from URL - how can we prevent this?
        escaped_url = URI::DEFAULT_PARSER.escape(generic_url + cells[0].at('a')[:href].strip)
        app.info_url = escaped_url.gsub(/%0./, '')

        app.address = cells[1].inner_text.strip
        app.description = cells[2].inner_text.strip
        app.status = cells[3].inner_text.strip

        # '--' is the placeholder shown when there is no validation date.
        raw_date_validated = cells[4].inner_text.strip
        app.date_validated = Date.parse(raw_date_validated) if raw_date_validated != '--'

        app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
        app
      end
    end
  end