A Ruby gem to get planning applications data from UK council websites.
Nelze vybrat více než 25 témat. Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 

140 řádky
5.1 KiB

  require 'date'
  require 'logger'
  require 'uri'

  require 'http'
  require 'nokogiri'
  module UKPlanningScraper
    class Authority
      private

      # Searches a Northgate planning site and returns the applications listed
      # in its results table.
      #
      # The method fetches the search form to pick up the hidden ASP.NET fields
      # and any cookies, POSTs the search, follows the resulting redirect by
      # hand, and then parses the results table.
      #
      # @param params [Hash] search criteria. Keys read here: :keywords,
      #   :received_from, :received_to, :validated_from, :validated_to,
      #   :decided_from, :decided_to, :status and :case_officer_code.
      # @param options [Hash] currently not read by this scraper.
      # @return [Array<Application>] one entry per data row of the results table
      #   (empty if no data rows are found).
      # @raise [RuntimeError] if the search page does not answer with HTTP 200,
      #   lacks a hidden ASP.NET field, or the search POST is not answered with
      #   an HTTP 302 carrying a Location header.
      def scrape_northgate(params, options)
        puts "Using Northgate scraper."

        $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
        logger = Logger.new($stdout)
        logger.level = Logger::DEBUG

        # URI.encode was removed in Ruby 3.0; this parser offers the same escaping.
        uri_parser = URI::RFC2396_Parser.new

        # Work on a copy so @url itself is never rewritten: mutating it would
        # make later searches on this object hit the case officer page.
        search_url =
          if params[:case_officer_code]
            @url.sub('GeneralSearch.aspx', 'CaseOfficerWorkloadSearch.aspx')
          else
            @url
          end

        base_url = @url.match(%r{(https?://.+?)/})[1]
        # Drop the page name from the end of the URL and add 'Generic/' (case sensitive?)
        generic_url = @url.match(%r{.+/})[0] + 'Generic/'

        form_vars = northgate_form_vars(params)
        logger.info "Form variables: #{form_vars}"

        headers = {
          'Origin' => base_url,
          'Referer' => search_url
        }
        logger.debug "HTTP request headers:"
        logger.debug(headers.to_s)

        logger.debug "GET: #{search_url}"
        response = HTTP.headers(headers).get(search_url)
        logger.debug "Response code: HTTP #{response.code}"

        unless response.code == 200
          logger.fatal "Bad response from search page. Response code: #{response.code}."
          raise "Northgate: Bad response from search page. Response code: #{response.code}."
        end

        form_vars.merge!(northgate_asp_vars(Nokogiri::HTML(response.to_s)))
        cookies = response.cookies.each_with_object({}) do |cookie, jar|
          jar[cookie.name] = cookie.value
        end

        logger.debug "POST: #{search_url}"
        response2 = HTTP.headers(headers).cookies(cookies).post(search_url, form: form_vars)
        logger.debug "Response code: HTTP #{response2.code}"

        unless response2.code == 302
          logger.error "Didn't get redirected from search."
          raise "Northgate: didn't get redirected from search."
        end

        # Follow the redirect manually
        location = response2.headers['Location']
        logger.debug "Location: #{location}"
        raise "Northgate: redirect from search has no Location header." unless location

        # Set the page size (PS) to max so we don't have to page through search
        # results. Non-destructive gsub is required here: gsub! returns nil when
        # the Location does not contain 'PS=10'.
        results_url = uri_parser.escape(base_url + location.to_s.gsub('PS=10', 'PS=99999'))
        logger.debug "GET: #{results_url}"
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP #{response3.code}"
        doc = Nokogiri::HTML(response3.to_s)

        # The header row only has th cells, so keep only rows that have a td.
        data_rows = doc.search("table.display_table tr").select { |row| row.at("td") }
        logger.info "Found #{data_rows.size} applications in search results."

        data_rows.map do |row|
          northgate_application(row.search("td"), generic_url, uri_parser)
        end
      end

      # Builds the search form fields for the given search criteria.
      #
      # When more than one date range is supplied, the later entry in the list
      # below overwrites the fields set by an earlier one.
      #
      # @param params [Hash] search criteria (see #scrape_northgate).
      # @return [Hash{String => Object}] form field names mapped to values.
      def northgate_form_vars(params)
        form_vars = {
          'csbtnSearch' => 'Search', # required
          'txtProposal' => params[:keywords]
        }

        date_ranges = [
          [:received_from, :received_to, 'DATE_RECEIVED'],
          [:validated_from, :validated_to, 'DATE_VALID'],
          [:decided_from, :decided_to, 'DATE_DECISION']
        ]
        date_ranges.each do |from_key, to_key, date_field|
          from = params[from_key]
          to = params[to_key]
          next unless from || to

          form_vars['cboSelectDateValue'] = date_field
          form_vars['rbGroup'] = 'rbRange'
          # Presumably Date objects, whose to_s is YYYY-MM-DD - verify against callers.
          form_vars['dateStart'] = from.to_s if from
          form_vars['dateEnd'] = to.to_s if to
        end

        form_vars['cboStatusCode'] = params[:status] if params[:status]
        form_vars['cboCaseOfficerCode'] = params[:case_officer_code] if params[:case_officer_code]
        form_vars
      end

      # Reads the hidden ASP.NET fields that are posted back with the search.
      #
      # @param doc [#at] parsed search page.
      # @return [Hash{String => String}] '__VIEWSTATE' and '__EVENTVALIDATION' values.
      # @raise [RuntimeError] if either field is missing from the page.
      def northgate_asp_vars(doc)
        %w[__VIEWSTATE __EVENTVALIDATION].each_with_object({}) do |name, vars|
          field = doc.at('#' + name)
          raise "Northgate: search page has no #{name} field." unless field

          vars[name] = field['value']
        end
      end

      # Builds an Application from the td cells of one results-table row.
      #
      # @param cells [#[]] the row's td nodes, in column order.
      # @param generic_url [String] URL prefix that the row's link is relative to.
      # @param uri_parser [URI::RFC2396_Parser] used to escape the details URL.
      # @return [Application]
      def northgate_application(cells, generic_url, uri_parser)
        app = Application.new
        app.scraped_at = Time.now
        app.council_reference = cells[0].inner_text.strip

        href = cells[0].at('a')[:href].strip
        # FIXME. /%0./ strips escaped bytes 0x00-0x0F (e.g. tabs and line breaks
        # inside the href) from the URL - how can we prevent this?
        app.info_url = uri_parser.escape(generic_url + href).gsub(/%0./, '')

        app.address = cells[1].inner_text.strip
        app.description = cells[2].inner_text.strip
        app.status = cells[3].inner_text.strip

        raw_date_validated = cells[4].inner_text.strip
        # '--' marks a missing date; an empty cell would make Date.parse raise.
        unless raw_date_validated.empty? || raw_date_validated == '--'
          app.date_validated = Date.parse(raw_date_validated)
        end

        app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
        app
      end
    end
  end