A Ruby gem to get planning applications data from UK council websites.
Vous ne pouvez pas sélectionner plus de 25 sujets. Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

200 lignes
7.6 KiB

  1. require 'mechanize'
  2. require 'pp'
  module UKPlanningScraper
    class Authority
      private

      # Scrape planning applications from an Idox site.
      #
      # Loads the search form at @url, fills it in from +params+, submits it,
      # walks every page of results and then fetches the summary tab of each
      # application found.
      #
      # params  - Hash of search terms. Keys read here: :received_from,
      #           :received_to, :validated_from, :validated_to, :decided_from,
      #           :decided_to (objects responding to #strftime),
      #           :council_reference, :keywords, :applicant_name,
      #           :application_type, :development_type.
      # options - Hash; :delay is the pause in seconds before each follow-up
      #           request. A missing/nil delay is treated as 0 rather than
      #           being handed to sleep (which raises on older Rubies and
      #           blocks forever on Ruby 3.3+).
      #
      # Returns an Array of Application objects. It is empty when the search
      # form is missing or nothing matched.
      # Raises TooManySearchResults when the site reports too many results.
      def scrape_idox(params, options)
        puts "Using Idox scraper."
        base_url = idox_base_url
        delay = options[:delay] || 0
        agent = Mechanize.new

        puts "Getting: #{@url}"
        page = agent.get(@url) # load the search form page

        # Check that the search form is actually present.
        # When Idox has an internal error it returns an error page with HTTP 200.
        form = page.form('searchCriteriaForm')
        unless form
          puts "Error: Search form page failed to load due to Idox internal error."
          return []
        end

        # NOTE: appending '&searchCriteria.resultsPerPage=100' to the form action
        # (and to each "next" URL) was left disabled in the original code.
        idox_fill_search_form(form, params)
        page = form.submit

        if page.search('.errors').inner_text =~ /Too many results found/i
          raise TooManySearchResults, "Scrape in smaller chunks. Use shorter date ranges and/or more search parameters."
        end

        apps = []
        loop do
          # Parse search results
          items = page.search('li.searchresult')
          puts "Found #{items.size} apps on this page."
          items.each { |item| apps << idox_parse_search_result(item, base_url) }

          # Follow the Next button from the pager, if there is one
          next_button = page.at('a.next')
          break unless next_button

          next_url = base_url + next_button[:href]
          sleep delay
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        end

        # Scrape the summary tab for each app
        apps.each_with_index do |app, i|
          sleep delay
          puts "#{i + 1} of #{apps.size}: #{app.info_url}"
          res = agent.get(app.info_url)

          if res.code == '200' # That's a String not an Integer
            parse_summary(app, res)
          else
            puts "Error: HTTP #{res.code}"
          end
        end

        # Direct hit: no result list was parsed, but the page we ended up on
        # carries the summary table that parse_summary reads, so treat it as a
        # single application. The table must be checked with #at: a node set
        # returned by #search is truthy even when empty, which used to turn
        # every empty result page into a bogus application.
        if apps.empty? && page.at('#simpleDetailsTable')
          app = Application.new
          app.council_reference = params[:council_reference] if params[:council_reference]
          parse_summary(app, page)
          apps << app
        end

        apps
      end

      # Fill in an application's details from its summary page.
      #
      # app - Application to update in place.
      # res - The loaded summary page; must respond to #link_with, #at and #search.
      #
      # Sets scraped_at, info_url (only when not already set and the summary
      # tab link is present), the documents count/URL and every field found in
      # the '#simpleDetailsTable' rows.
      def parse_summary(app, res)
        base_url = idox_base_url
        app.scraped_at = Time.now

        unless app.info_url
          summary_href = res.link_with(id: 'tab_summary')&.href
          # Without the tab link there is no URL to record; leave info_url unset
          # instead of storing the bare base URL.
          app.info_url = "#{base_url}#{summary_href}" if summary_href
        end

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
        # Bradford has #tab_documents but without the document count on it
        app.documents_count = 0
        documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
        if documents_link && (count = documents_link.inner_text[/\d+/])
          app.documents_count = count.to_i
          app.documents_url = base_url + documents_link[:href]
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
        res.search('#simpleDetailsTable tr').each do |row|
          header = row.at('th')
          cell = row.at('td')
          # Rows without both a label and a value cell carry nothing to parse.
          next unless header && cell

          key = header.inner_text.strip
          value = cell.inner_text.strip

          case key
          when 'Reference'
            app.council_reference = value
          when 'Alternative Reference', 'Planning Portal Reference'
            app.alternative_reference = value unless value.empty?
          when 'Application Received', 'Application Registered'
            app.date_received = Date.parse(value) if value =~ /\d/
          when 'Application Validated'
            app.date_validated = Date.parse(value) if value =~ /\d/
          when 'Address'
            app.address = value unless value.empty?
          when 'Proposal'
            app.description = value unless value.empty?
          when 'Status'
            app.status = value unless value.empty?
          when 'Decision'
            app.decision = value unless value.empty?
          when 'Decision Issued Date'
            app.date_decision = Date.parse(value) if value =~ /\d/
          when 'Appeal Status'
            app.appeal_status = value unless value.empty?
          when 'Appeal Decision'
            app.appeal_decision = value unless value.empty?
          else
            puts "Error: key '#{key}' not found"
          end
        end
      end

      # Scheme and host part of @url (everything before the first path slash).
      def idox_base_url
        @url.match(%r{(https?://.+?)/})[1]
      end

      # Copy the search terms in +params+ into the Idox search +form+.
      def idox_fill_search_form(form, params)
        # Add expected fields to form if they're not already present so that
        # searches using these terms work
        %w[date(applicationReceivedStart) date(applicationReceivedEnd)].each do |name|
          form.add_field!(name) unless form.has_field?(name)
        end

        date_fields = {
          received_from: 'date(applicationReceivedStart)',
          received_to: 'date(applicationReceivedEnd)',
          validated_from: 'date(applicationValidatedStart)',
          validated_to: 'date(applicationValidatedEnd)',
          decided_from: 'date(applicationDecisionStart)',
          decided_to: 'date(applicationDecisionEnd)'
        }
        date_fields.each do |param, field|
          date = params[param]
          form.send(field, date.strftime('%d/%m/%Y')) if date
        end

        form.send('searchCriteria.reference', params[:council_reference])
        form.send('searchCriteria.description', params[:keywords])

        # Not every council's form has these fields: eg Bexley has no applicant
        # name, and only some sites (eg Bolton) have a development type.
        optional_fields = {
          applicant_name: 'searchCriteria.applicantName',
          application_type: 'searchCriteria.caseType',
          development_type: 'searchCriteria.developmentType'
        }
        optional_fields.each do |param, field|
          form.send(field, params[param]) if form.has_field?(field)
        end
      end

      # Build an Application from one 'li.searchresult' node of a results page.
      def idox_parse_search_result(item, base_url)
        data = Application.new

        # The info line is a '|'-separated list of "Label: value" pieces.
        info_line = item.at('p.metaInfo').inner_text.strip
        info_line.split('|').each do |raw|
          bit = raw.strip.delete("\r\n")

          if (found = bit.match(/Ref\. No:\s+(.+)/))
            data.council_reference = found[1]
          end

          # In Ruby's regex engine \d{2}? means "optional two digits", so the
          # year part below accepts either 2 or 4 digits.
          if (found = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
            data.date_received = Date.parse(found[2])
          end

          if (found = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
            data.date_validated = Date.parse(found[1])
          end

          if (found = bit.match(/Status:\s+(.+)/))
            data.status = found[1]
          end
        end

        link = item.at('a')
        data.scraped_at = Time.now
        data.info_url = base_url + link['href']
        data.address = item.at('p.address').inner_text.strip
        data.description = link.inner_text.strip
        data
      end
    end
  end