A Ruby gem to get planning applications data from UK council websites.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

184 lines
7.3 KiB

  1. require 'mechanize'
  2. require 'pp'
  module UKPlanningScraper
    class Authority
      private

      # Scrapes planning applications from an Idox site.
      #
      # Loads the search form at @url, fills it in from +params+, submits it,
      # walks every page of search results, then fetches the summary page of
      # each application found to fill in the remaining details.
      #
      # @param params [Hash] search criteria. Keys read here: :received_from,
      #   :received_to, :validated_from, :validated_to, :decided_from,
      #   :decided_to (each must respond to #strftime), :keywords, :status,
      #   :applicant_name, :application_type, :development_type.
      # @param options [Hash] :delay is passed to Kernel#sleep between
      #   requests; no pause is made when it is nil.
      # @return [Array] the Application objects that were scraped, or an empty
      #   array when the search form could not be found on the page.
      # @raise [TooManySearchResults] when the results page reports
      #   "Too many results found".
      def scrape_idox(params, options)
        puts "Using Idox scraper."
        base_url = @url.match(/(https?:\/\/.+?)\//)[1]
        agent = Mechanize.new

        puts "Getting: #{@url}"
        page = agent.get(@url) # load the search form page

        # Check that the search form is actually present.
        # When Idox has an internal error it returns an error page with HTTP 200.
        form = page.form('searchCriteriaForm')
        unless form
          puts "Error: Search form page failed to load due to Idox internal error."
          return []
        end

        # Fill out and submit search form
        idox_fill_search_form(form, params)
        page = form.submit

        if page.search('.errors').inner_text.match(/Too many results found/i)
          raise TooManySearchResults, "Scrape in smaller chunks. Use shorter date ranges and/or more search parameters."
        end

        apps = []
        loop do
          # Parse search results
          items = page.search('li.searchresult')
          puts "Found #{items.size} apps on this page."
          items.each { |item| apps << idox_parse_search_result(item, base_url) }

          # Get the Next button from the pager, if there is one
          next_button = page.at('a.next')
          break unless next_button

          next_url = base_url + next_button[:href]
          idox_pause(options)
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        end

        # Scrape the summary tab for each app
        apps.each_with_index do |app, i|
          idox_pause(options)
          puts "#{i + 1} of #{apps.size}: #{app.info_url}"
          res = agent.get(app.info_url)
          if res.code == '200' # The response code is a String, not an Integer
            idox_parse_summary(res, app, base_url)
          else
            puts "Error: HTTP #{res.code}"
          end
        end

        apps
      end

      # Sleeps for options[:delay]; does nothing when no delay is configured,
      # so a missing :delay can neither raise nor block.
      def idox_pause(options)
        delay = options[:delay]
        sleep delay if delay
      end

      # Copies the search criteria in +params+ onto the Idox search form.
      def idox_fill_search_form(form, params)
        # Add expected fields to form if they're not already present so that
        # searches using these terms work
        %w[
          date(applicationReceivedStart)
          date(applicationReceivedEnd)
        ].each { |f| form.add_field!(f) unless form.has_field?(f) }

        date_format = "%d/%m/%Y"
        [
          ['date(applicationReceivedStart)', :received_from],
          ['date(applicationReceivedEnd)', :received_to],
          ['date(applicationValidatedStart)', :validated_from],
          ['date(applicationValidatedEnd)', :validated_to],
          ['date(applicationDecisionStart)', :decided_from],
          ['date(applicationDecisionEnd)', :decided_to]
        ].each do |field, key|
          form.send(field, params[key].strftime(date_format)) if params[key]
        end

        # These two are always set, even when the param is nil.
        form.send('searchCriteria.description', params[:keywords])
        form.send('searchCriteria.caseStatus', params[:status])

        # Some councils don't have the applicant name on their form, eg Bexley.
        # Only some Idox sites (eg Bolton) have 'searchCriteria.developmentType'.
        [
          ['searchCriteria.applicantName', :applicant_name],
          ['searchCriteria.caseType', :application_type],
          ['searchCriteria.developmentType', :development_type]
        ].each do |field, key|
          form.send(field, params[key]) if form.has_field?(field)
        end
      end

      # Builds an Application from one 'li.searchresult' node.
      # A missing 'p.metaInfo' or 'p.address' element leaves the corresponding
      # attributes unset instead of raising.
      def idox_parse_search_result(item, base_url)
        data = Application.new

        # Parse info line: pipe-separated "Label: value" fragments
        meta = item.at('p.metaInfo')
        info_line = meta ? meta.inner_text.strip : ''
        info_line.split('|').each do |fragment|
          bit = fragment.strip.delete("\r\n")

          if (matches = bit.match(/Ref\. No:\s+(.+)/))
            data.council_reference = matches[1]
          end

          if (matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
            data.date_received = Date.parse(matches[2])
          end

          if (matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
            data.date_validated = Date.parse(matches[1])
          end

          if (matches = bit.match(/Status:\s+(.+)/))
            data.status = matches[1]
          end
        end

        data.scraped_at = Time.now
        link = item.at('a')
        data.info_url = base_url + link['href']
        address = item.at('p.address')
        data.address = address.inner_text.strip if address
        data.description = link.inner_text.strip
        data
      end

      # Updates +app+ from its summary page +res+: document count and URL, plus
      # every recognised row of '#simpleDetailsTable'.
      def idox_parse_summary(res, app, base_url)
        app.scraped_at = Time.now

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
        # Bradford has #tab_documents but without the document count on it
        app.documents_count = 0
        documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
        if documents_link && (count = documents_link.inner_text[/\d+/])
          app.documents_count = count.to_i
          app.documents_url = base_url + documents_link[:href]
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
        res.search('#simpleDetailsTable tr').each do |row|
          heading = row.at('th')
          cell = row.at('td')
          next unless heading && cell # skip rows without a label/value pair

          key = heading.inner_text.strip
          value = cell.inner_text.strip

          case key
          when 'Reference'
            app.council_reference = value
          when 'Alternative Reference', 'Planning Portal Reference'
            app.alternative_reference = value unless value.empty?
          when 'Application Received', 'Application Registered'
            app.date_received = Date.parse(value) if value.match(/\d/)
          when 'Application Validated'
            app.date_validated = Date.parse(value) if value.match(/\d/)
          when 'Address'
            app.address = value unless value.empty?
          when 'Proposal'
            app.description = value unless value.empty?
          when 'Status'
            app.status = value unless value.empty?
          when 'Decision'
            app.decision = value unless value.empty?
          when 'Decision Issued Date'
            app.date_decision = Date.parse(value) if value.match(/\d/)
          when 'Appeal Status'
            app.appeal_status = value unless value.empty?
          when 'Appeal Decision'
            app.appeal_decision = value unless value.empty?
          else
            puts "Error: key '#{key}' not found"
          end
        end
      end
    end # class
  end