A Ruby gem to get planning applications data from UK council websites.
Non puoi selezionare più di 25 argomenti Gli argomenti devono iniziare con una lettera o un numero, possono includere trattini ('-') e possono essere lunghi fino a 35 caratteri.

uk_planning_scraper.rb 5.3 KiB

6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
6 anni fa
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  # Scrapes planning application data from UK council planning websites.
  #
  # The only public entry point is UKPlanningScraper.search. It drives a
  # Mechanize agent through a council's advanced search form, walks every page
  # of results and then visits the summary tab of each application found.
  module UKPlanningScraper
    # Summary-table row labels whose text is stored as-is, mapped to the key
    # used in the application hash. Several labels may feed the same key
    # because different sites label the same thing differently.
    SUMMARY_TEXT_FIELDS = {
      'Reference' => :council_reference,
      'Alternative Reference' => :alternative_reference,
      'Planning Portal Reference' => :alternative_reference,
      'Address' => :address,
      'Proposal' => :description,
      'Status' => :status,
      'Decision' => :decision,
      'Appeal Status' => :appeal_status,
      'Appeal Decision' => :appeal_decision,
    }.freeze

    # Summary-table row labels whose text is parsed into a Date. Blank values
    # are skipped so that a date found earlier is not overwritten.
    SUMMARY_DATE_FIELDS = {
      'Application Received' => :date_received,
      'Application Registered' => :date_received,
      'Application Validated' => :date_validated,
      'Decision Issued Date' => :date_decision,
    }.freeze

    private_constant :SUMMARY_TEXT_FIELDS, :SUMMARY_DATE_FIELDS

    # Runs a search and returns the applications found.
    #
    # @param search_url [String] absolute http(s) URL of the search form page
    # @param params [Hash] search criteria. Recognised keys:
    #   :received_from, :received_to, :validated_from, :validated_to (objects
    #   responding to #strftime), :description, :applicant_name and
    #   :application_type
    # @param options [Hash] :delay is the number of seconds to sleep before
    #   each follow-up request (default 10)
    # @return [Array<Hash>] one hash per application, keyed by symbols such as
    #   :council_reference, :info_url, :address, :description, :status,
    #   :date_received, :date_validated, :documents_count and :documents_url.
    #   Keys are only present when the site supplied a value for them.
    # @raise [ArgumentError] if no scheme and host can be found in search_url
    def self.search(search_url, params, options = {})
      default_options = {
        delay: 10,
      }
      @options = default_options.merge(options) # The user-supplied options override the defaults
      @search_url = search_url

      # Scheme plus host only. Links scraped from the pages are site-relative
      # paths that get appended to this.
      base_url = search_url[%r{https?://[^/?#]+}]
      raise ArgumentError, "Cannot find an http(s) host in #{search_url.inspect}" unless base_url
      @base_url = base_url

      agent = Mechanize.new
      puts "Getting: #{@search_url}"
      page = agent.get(@search_url) # load the search form page
      page = submit_search_form(page, params)
      apps = collect_search_results(agent, page)

      # Scrape the summary tab for each app
      apps.each_with_index do |app, i|
        sleep @options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
        res = agent.get(app[:info_url])

        if res.code == '200' # Mechanize reports the status code as a String
          parse_summary_tab(app, res)
        else
          puts "Error: HTTP #{res.code}"
        end
      end

      apps
    end

    # Fills in the search form found on +page+ and returns the page that
    # submitting it produces.
    def self.submit_search_form(page, params)
      form = page.form('searchCriteriaForm')

      # Some councils don't have the received from/to dates on their form,
      # eg Newham, so a date is only sent when the caller asked for it.
      {
        'date(applicationReceivedStart)' => :received_from,
        'date(applicationReceivedEnd)' => :received_to,
        'date(applicationValidatedStart)' => :validated_from,
        'date(applicationValidatedEnd)' => :validated_to,
      }.each do |field, key|
        form.send(field, params[key].strftime("%d/%m/%Y")) if params[key]
      end

      form.send(:"searchCriteria.description", params[:description])

      # Some councils don't have the applicant name on their form, eg Bexley
      if form.has_field? 'searchCriteria.applicantName'
        form.send(:"searchCriteria.applicantName", params[:applicant_name])
      end

      form.send(:"searchCriteria.caseType", params[:application_type])
      form.submit
    end

    # Walks the results pages starting at +page+, following the pager's Next
    # link until there isn't one, and returns one hash per result.
    def self.collect_search_results(agent, page)
      apps = []

      loop do
        items = page.search('li.searchresult')
        puts "Found #{items.size} apps on this page."
        items.each { |item| apps << parse_search_result(item) }

        # Get the Next button from the pager, if there is one
        next_button = page.at('a.next')
        break unless next_button

        next_url = @base_url + next_button[:href]
        sleep @options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      end

      apps
    end

    # Builds the application hash for one search result list item.
    def self.parse_search_result(item)
      data = {}

      # The info line is a '|'-separated list of "Label: value" pairs. A
      # result without one simply contributes no fields here.
      info_line = stripped_text(item.at('p.metaInfo')) || ''
      info_line.split('|').each do |raw|
        bit = raw.strip.delete("\r\n")

        if (matches = bit.match(/Ref\. No:\s+(.+)/))
          data[:council_reference] = matches[1]
        end

        if (matches = bit.match(/(Received|Registered):\s+(.+)/))
          data[:date_received] = Date.parse(matches[2])
        end

        if (matches = bit.match(/Validated:\s+(.+)/))
          data[:date_validated] = Date.parse(matches[1])
        end

        if (matches = bit.match(/Status:\s+(.+)/))
          data[:status] = matches[1]
        end
      end

      link = item.at('a')
      data.merge!({
        scraped_at: Time.now,
        info_url: @base_url + link['href'],
        address: stripped_text(item.at('p.address')),
        description: link.inner_text.strip,
      })
    end

    # Copies the details shown on an application's summary page (+res+) into
    # +app+, overwriting values taken from the search results.
    def self.parse_summary_tab(app, res)
      app[:scraped_at] = Time.now

      # The Documents tab may be missing or carry no count; the document keys
      # are then left unset instead of aborting the whole scrape.
      documents_tab = res.at('#tab_documents')
      if documents_tab
        count = documents_tab.inner_text[/\d+/]
        app[:documents_count] = count.to_i if count
        href = documents_tab[:href]
        app[:documents_url] = @base_url + href if href
      end

      # We need to find values in the table by using the th labels.
      # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
      res.search('#simpleDetailsTable tr').each do |row|
        heading = row.at('th')
        cell = row.at('td')
        next unless heading && cell # not a label/value row

        key = heading.inner_text.strip
        value = cell.inner_text.strip

        if SUMMARY_TEXT_FIELDS.key?(key)
          app[SUMMARY_TEXT_FIELDS[key]] = value
        elsif SUMMARY_DATE_FIELDS.key?(key)
          app[SUMMARY_DATE_FIELDS[key]] = Date.parse(value) unless value.empty?
        else
          puts "Error: key '#{key}' not found"
        end
      end
    end

    # Returns the whitespace-stripped text of +node+, or nil when +node+ is nil.
    def self.stripped_text(node)
      node ? node.inner_text.strip : nil
    end

    private_class_method :submit_search_form, :collect_search_results,
                         :parse_search_result, :parse_summary_tab,
                         :stripped_text
  end # module