A Ruby gem to get planning applications data from UK council websites.
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

uk_planning_scraper.rb 5.3 KiB

há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
há 6 anos
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  # Scrapes planning application data from a UK council planning search site.
  module UKPlanningScraper
    # Options used by .search unless the caller overrides them.
    SEARCH_DEFAULT_OPTIONS = { delay: 10 }.freeze

    # Search-form date fields, keyed by the params entry that fills each one.
    SEARCH_DATE_FIELDS = {
      received_from: 'date(applicationReceivedStart)',
      received_to: 'date(applicationReceivedEnd)',
      validated_from: 'date(applicationValidatedStart)',
      validated_to: 'date(applicationValidatedEnd)',
    }.freeze

    # Summary-table labels whose cell text is stored unchanged.
    SUMMARY_TEXT_FIELDS = {
      'Reference' => :council_reference,
      'Alternative Reference' => :alternative_reference,
      'Planning Portal Reference' => :alternative_reference,
      'Address' => :address,
      'Proposal' => :description,
      'Status' => :status,
      'Decision' => :decision,
      'Appeal Status' => :appeal_status,
      'Appeal Decision' => :appeal_decision,
    }.freeze

    # Summary-table labels whose cell text is parsed into a Date.
    SUMMARY_DATE_FIELDS = {
      'Application Received' => :date_received,
      'Application Registered' => :date_received,
      'Application Validated' => :date_validated,
      'Decision Issued Date' => :date_decision,
    }.freeze

    # Runs a search on a council planning site and scrapes every result.
    #
    # Submits the search form found at search_url, walks all result pages,
    # then loads each application's summary page to fill in further details.
    #
    # search_url - String URL of the page holding the search form. It must
    #              contain a "/" after the host so the base URL can be derived.
    # params     - Hash of search criteria. Recognised keys: :received_from,
    #              :received_to, :validated_from, :validated_to (objects
    #              responding to #strftime), :description, :applicant_name
    #              and :application_type.
    # options    - Hash of options. :delay is the number of seconds to sleep
    #              before each follow-up request (default 10).
    #
    # Returns an Array of Hashes, one per application found.
    # Raises ArgumentError if the base URL can't be derived from search_url.
    # Raises RuntimeError if the page has no 'searchCriteriaForm' form.
    def self.search(search_url, params, options = {})
      @options = SEARCH_DEFAULT_OPTIONS.merge(options) # The user-supplied options override the defaults
      @search_url = search_url

      base_match = search_url.match(%r{(https?://.+?)/})
      raise ArgumentError, "Can't work out the base URL from '#{search_url}'" unless base_match

      @base_url = base_match[1]

      agent = Mechanize.new
      puts "Getting: #{@search_url}"
      form_page = agent.get(@search_url) # load the search form page

      apps = collect_search_results(agent, submit_search_form(form_page, params))

      # Scrape the summary tab for each app
      apps.each_with_index do |app, i|
        sleep @options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
        scrape_summary(agent, app)
      end

      apps
    end

    # Fills out the search form on page from params and submits it.
    # Returns the first page of search results.
    def self.submit_search_form(page, params)
      form = page.form('searchCriteriaForm')
      raise "Search form 'searchCriteriaForm' not found at #{@search_url}" unless form

      # Some councils don't have the received from/to dates on their form, eg Newham
      SEARCH_DATE_FIELDS.each do |param, field_name|
        date = params[param]
        set_form_field(form, field_name, date.strftime('%d/%m/%Y')) if date
      end

      set_form_field(form, 'searchCriteria.description', params[:description])
      # Some councils don't have the applicant name on their form, eg Bexley
      set_form_field(form, 'searchCriteria.applicantName', params[:applicant_name])
      set_form_field(form, 'searchCriteria.caseType', params[:application_type])

      form.submit
    end

    # Sets field_name on form when the form has that field. A missing field is
    # skipped rather than raising; if a real value had to be dropped, a warning
    # is printed so the caller knows the search is broader than requested.
    def self.set_form_field(form, field_name, value)
      if form.has_field?(field_name)
        form[field_name] = value
      elsif !value.nil?
        puts "Warning: search form has no '#{field_name}' field; ignoring '#{value}'"
      end
    end

    # Parses every page of search results, following the pager's Next link
    # until there isn't one. Returns an Array with one Hash per result.
    def self.collect_search_results(agent, page)
      apps = []

      loop do
        items = page.search('li.searchresult')
        puts "Found #{items.size} apps on this page."
        items.each { |item| apps << parse_search_result(item) }

        # Get the Next button from the pager, if there is one
        next_button = page.at('a.next')
        break unless next_button

        next_url = @base_url + next_button[:href]
        sleep @options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      end

      apps
    end

    # Builds the data Hash for a single search result list item.
    def self.parse_search_result(item)
      data = {}

      # The info line is a "|"-separated list of "Label: value" pairs
      info_line = item.at('p.metaInfo').inner_text.strip
      info_line.split('|').each do |raw_bit|
        bit = raw_bit.strip.delete("\r\n")

        reference = bit[/Ref\. No:\s+(.+)/, 1]
        data[:council_reference] = reference if reference

        received = bit[/(Received|Registered):\s+(.+)/, 2]
        data[:date_received] = Date.parse(received) if received

        validated = bit[/Validated:\s+(.+)/, 1]
        data[:date_validated] = Date.parse(validated) if validated

        status = bit[/Status:\s+(.+)/, 1]
        data[:status] = status if status
      end

      link = item.at('a')
      data.merge!(
        scraped_at: Time.now,
        info_url: @base_url + link['href'],
        address: item.at('p.address').inner_text.strip,
        description: link.inner_text.strip
      )
    end

    # Loads the summary page for app and merges its details into the Hash.
    # A failed request is reported and leaves app with its search-result data,
    # so one bad page doesn't throw away everything scraped so far.
    def self.scrape_summary(agent, app)
      begin
        res = agent.get(app[:info_url])
      rescue Mechanize::ResponseCodeError => e
        # Mechanize raises for error statuses instead of returning the page
        puts "Error: HTTP #{e.response_code}"
        return
      end

      # Mechanize reports the status code as a String, not an Integer
      unless res.code == '200'
        puts "Error: HTTP #{res.code}"
        return
      end

      app[:scraped_at] = Time.now
      parse_documents_tab(res, app)

      # We need to find values in the table by using the th labels.
      # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
      res.search('#simpleDetailsTable tr').each { |row| parse_summary_row(row, app) }
    end

    # Records the documents count and URL from the Documents tab link.
    # Keys are left unset when the tab, its number or its href is absent,
    # because it isn't known whether the tab shows when there are no documents.
    def self.parse_documents_tab(res, app)
      tab = res.at('#tab_documents')
      return unless tab

      count = tab.inner_text[/\d+/]
      app[:documents_count] = count.to_i if count
      app[:documents_url] = @base_url + tab[:href] if tab[:href]
    end

    # Stores the value of one summary-table row in app under the key that
    # matches the row's th label. Rows lacking a th or td are ignored.
    def self.parse_summary_row(row, app)
      heading = row.at('th')
      cell = row.at('td')
      return unless heading && cell

      key = heading.inner_text.strip
      value = cell.inner_text.strip

      if (field = SUMMARY_TEXT_FIELDS[key])
        app[field] = value
      elsif (field = SUMMARY_DATE_FIELDS[key])
        app[field] = Date.parse(value) unless value.empty?
      else
        puts "Error: key '#{key}' not found"
      end
    end

    private_class_method :submit_search_form, :set_form_field,
                         :collect_search_results, :parse_search_result,
                         :scrape_summary, :parse_documents_tab,
                         :parse_summary_row
  end