A Ruby gem to get planning applications data from UK council websites.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

184 lignes
7.3 KiB

  1. require 'mechanize'
  2. require 'pp'
  module UKPlanningScraper
    class Authority
      # Date format used when filling in the Idox search form (day/month/year).
      IDOX_DATE_FORMAT = "%d/%m/%Y"

      # Search parameter => Idox form field name, for the date-range inputs.
      IDOX_DATE_FIELDS = {
        received_from: 'date(applicationReceivedStart)',
        received_to: 'date(applicationReceivedEnd)',
        validated_from: 'date(applicationValidatedStart)',
        validated_to: 'date(applicationValidatedEnd)',
        decided_from: 'date(applicationDecisionStart)',
        decided_to: 'date(applicationDecisionEnd)'
      }.freeze

      # Search parameter => Idox form field name, for inputs that are only
      # filled in when the council's form actually has them. Some councils
      # don't have the applicant name on their form (eg Bexley), and only some
      # Idox sites (eg Bolton) have a development type.
      IDOX_OPTIONAL_FIELDS = {
        applicant_name: 'searchCriteria.applicantName',
        application_type: 'searchCriteria.caseType',
        development_type: 'searchCriteria.developmentType'
      }.freeze

      private_constant :IDOX_DATE_FORMAT, :IDOX_DATE_FIELDS, :IDOX_OPTIONAL_FIELDS

      private

      # Scrapes planning applications from the Idox site at @url.
      #
      # Submits the search form, walks every page of results, then fetches the
      # summary page of each application found to fill in further details.
      #
      # @param params [Hash] search terms. Keys read: :received_from,
      #   :received_to, :validated_from, :validated_to, :decided_from,
      #   :decided_to (each must respond to #strftime), :keywords, :status,
      #   :applicant_name, :application_type, :development_type.
      # @param options [Hash] :delay is passed to Kernel#sleep before each
      #   follow-up request; no pause is made when it is unset.
      # @return [Array<Application>] the applications found, or [] when the
      #   search form could not be found on the page.
      # @raise [TooManySearchResults] when the results page reports
      #   "Too many results found".
      def scrape_idox(params, options)
        puts "Using Idox scraper."
        base_url = @url.match(/(https?:\/\/.+?)\//)[1]
        apps = []

        agent = Mechanize.new
        puts "Getting: #{@url}"
        page = agent.get(@url) # load the search form page

        # Check that the search form is actually present.
        # When Idox has an internal error it returns an error page with HTTP 200.
        form = page.form('searchCriteriaForm')
        unless form
          puts "Error: Search form page failed to load due to Idox internal error."
          return []
        end

        # Disabled experiment kept for reference: appending
        # '&searchCriteria.resultsPerPage=100' to form.action and to next_url.
        idox_fill_search_form(form, params)
        page = form.submit

        if page.search('.errors').inner_text =~ /Too many results found/i
          raise TooManySearchResults, "Scrape in smaller chunks. Use shorter date ranges and/or more search parameters."
        end

        loop do
          # Parse search results
          items = page.search('li.searchresult')
          puts "Found #{items.size} apps on this page."
          items.each { |item| apps << idox_parse_search_result(item, base_url) }

          # Get the Next button from the pager, if there is one
          next_button = page.at('a.next')
          break unless next_button

          next_url = base_url + next_button[:href]
          idox_pause(options)
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        end

        # Scrape the summary tab for each app
        apps.each_with_index do |app, i|
          idox_pause(options)
          puts "#{i + 1} of #{apps.size}: #{app.info_url}"
          res = agent.get(app.info_url)

          if res.code == '200' # The response code is a String, not an Integer
            idox_parse_summary(app, res, base_url)
          else
            puts "Error: HTTP #{res.code}"
          end
        end

        apps
      end

      # Sleeps for options[:delay]; does nothing when no delay is set, so a
      # missing option neither raises nor blocks.
      def idox_pause(options)
        delay = options[:delay]
        sleep(delay) if delay
      end

      # Fills the search form in from params, in the same order as the fields
      # appear in the field maps above.
      def idox_fill_search_form(form, params)
        # Add expected fields to form if they're not already present so that
        # searches using these terms work.
        # NOTE(review): only the received-date fields are added; the validated
        # and decided date fields still raise NoMethodError when the form lacks
        # them -- confirm whether they should be added as well.
        %w[
          date(applicationReceivedStart)
          date(applicationReceivedEnd)
        ].each { |name| form.add_field!(name) unless form.has_field?(name) }

        # Form fields are set by sending the field name as a message with the
        # new value as its argument.
        IDOX_DATE_FIELDS.each do |param, field|
          date = params[param]
          form.send(field, date.strftime(IDOX_DATE_FORMAT)) if date
        end

        # These two are always set, even when the parameter is nil.
        form.send('searchCriteria.description', params[:keywords])
        form.send('searchCriteria.caseStatus', params[:status])

        IDOX_OPTIONAL_FIELDS.each do |param, field|
          form.send(field, params[param]) if form.has_field?(field)
        end
      end

      # Builds an Application from one 'li.searchresult' element.
      def idox_parse_search_result(item, base_url)
        data = Application.new

        # Parse info line: '|'-separated "Label: value" fragments
        info_line = item.at("p.metaInfo").inner_text.strip
        info_line.split('|').each do |fragment|
          idox_apply_info_bit(data, fragment.strip.delete("\r\n"))
        end

        link = item.at('a')
        data.scraped_at = Time.now
        data.info_url = base_url + link['href']
        data.address = item.at('p.address').inner_text.strip
        data.description = link.inner_text.strip
        data
      end

      # Copies whatever a single info-line fragment describes onto data.
      def idox_apply_info_bit(data, bit)
        if (matches = bit.match(/Ref\. No:\s+(.+)/))
          data.council_reference = matches[1]
        end

        # The trailing \d{2}\d{2}? presumably allows two- or four-digit years.
        if (matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
          data.date_received = Date.parse(matches[2])
        end

        if (matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
          data.date_validated = Date.parse(matches[1])
        end

        if (matches = bit.match(/Status:\s+(.+)/))
          data.status = matches[1]
        end
      end

      # Parses the summary tab in res and updates app with what it finds.
      def idox_parse_summary(app, res, base_url)
        app.scraped_at = Time.now

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
        # Bradford has #tab_documents but without the document count on it
        app.documents_count = 0
        documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
        if documents_link
          count = documents_link.inner_text[/\d+/]
          if count
            app.documents_count = count.to_i
            app.documents_url = base_url + documents_link[:href]
          end
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
        res.search('#simpleDetailsTable tr').each do |row|
          label = row.at('th')
          cell = row.at('td')
          # Skip rows that lack a label or a value cell instead of failing on nil.
          next unless label && cell

          idox_apply_detail(app, label.inner_text.strip, cell.inner_text.strip)
        end
      end

      # Stores one label/value pair from the details table on app. Empty
      # values and date values without any digit leave the attribute as it was.
      def idox_apply_detail(app, key, value)
        case key
        when 'Reference'
          app.council_reference = value
        when 'Alternative Reference', 'Planning Portal Reference'
          app.alternative_reference = value unless value.empty?
        when 'Application Received', 'Application Registered'
          app.date_received = Date.parse(value) if value =~ /\d/
        when 'Application Validated'
          app.date_validated = Date.parse(value) if value =~ /\d/
        when 'Address'
          app.address = value unless value.empty?
        when 'Proposal'
          app.description = value unless value.empty?
        when 'Status'
          app.status = value unless value.empty?
        when 'Decision'
          app.decision = value unless value.empty?
        when 'Decision Issued Date'
          app.date_decision = Date.parse(value) if value =~ /\d/
        when 'Appeal Status'
          app.appeal_status = value unless value.empty?
        when 'Appeal Decision'
          app.appeal_decision = value unless value.empty?
        else
          puts "Error: key '#{key}' not found"
        end
      end
    end
  end