A Ruby gem to get planning applications data from UK council websites.
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

261 linhas
9.2 KiB

  1. require 'mechanize'
  2. require 'pp'
  3. module UKPlanningScraper
  4. class Authority
  5. private
  # Scheme-and-host prefix of @url (for example "https://example.gov.uk"),
  # computed once and memoized in @base_url.
  #
  # The value is everything from "http(s)://" up to, but not including, the
  # first following "/". A URL that has no path at all is accepted too, so
  # the result never depends on a trailing slash being present.
  #
  # @return [String] the base URL without a trailing slash
  # @raise [ArgumentError] if @url does not contain an http(s) URL
  def base_url
    @base_url ||= @url.to_s[%r{https?://[^/]+}] ||
                  raise(ArgumentError, "Cannot determine base URL from #{@url.inspect}")
  end
  # Returns the Mechanize instance used for every request made by this
  # object. It is built on the first call and reused afterwards.
  def agent
    @agent = Mechanize.new unless @agent
    @agent
  end
  # Searches an Idox planning site and scrapes every application it returns.
  #
  # Loads the search form at @url, fills it in from params, submits it, walks
  # the result pages by following the "next" pager link, and finally visits
  # each application's detail pages through parse_info_url,
  # parse_property_url and parse_property_detail_urls.
  #
  # @param params [Hash] search criteria. The date keys (:received_from,
  #   :received_to, :validated_from, :validated_to, :decided_from,
  #   :decided_to) must respond to #strftime when present; :keywords,
  #   :applicant_name, :application_type and :development_type are handed
  #   to the form as given.
  # @param options [Hash] :delay is passed to sleep before each follow-up
  #   request; when it is nil no pause is made.
  # @return [Array] the scraped Application objects, or [] when the search
  #   form is missing from the page
  # @raise [TooManySearchResults] when the results page reports
  #   "Too many results found"
  def scrape_idox(params, options)
    puts "Using Idox scraper."
    puts "Getting: #{@url}"
    page = agent.get(@url) # load the search form page

    # When Idox has an internal error it returns an error page with HTTP 200,
    # so the only reliable check is whether the search form is present.
    form = page.form('searchCriteriaForm')
    unless form
      puts "Error: Search form page failed to load due to Idox internal error."
      return []
    end

    fill_idox_search_form(form, params)
    page = form.submit

    if page.search('.errors').inner_text.match(/Too many results found/i)
      raise TooManySearchResults, "Scrape in smaller chunks. Use shorter date ranges and/or more search parameters."
    end

    # Skip the pause entirely when no delay is configured instead of
    # handing nil to sleep.
    delay = options[:delay]
    apps = []

    loop do
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."
      items.each { |item| apps << parse_idox_search_result(item) }

      # Follow the Next button from the pager, if there is one.
      next_button = page.at('a.next')
      break unless next_button

      next_url = base_url + next_button[:href]
      sleep delay if delay
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end

    # Scrape the summary tab (and any property pages) for each app.
    apps.each_with_index do |app, i|
      sleep delay if delay
      puts "#{i + 1} of #{apps.size}: #{app.info_url}"
      parse_info_url(app) if app.info_url
      parse_property_url(app) if app.property_url
      parse_property_detail_urls(app) if app.property_detail_urls
    end

    apps
  end

  # Copies the search criteria in params into the Idox search form.
  def fill_idox_search_form(form, params)
    # Add expected fields to the form if they're not already present so that
    # searches using these terms work.
    %w[date(applicationReceivedStart) date(applicationReceivedEnd)].each do |name|
      form.add_field!(name) unless form.has_field?(name)
    end

    date_format = "%d/%m/%Y"
    {
      'date(applicationReceivedStart)' => :received_from,
      'date(applicationReceivedEnd)' => :received_to,
      'date(applicationValidatedStart)' => :validated_from,
      'date(applicationValidatedEnd)' => :validated_to,
      'date(applicationDecisionStart)' => :decided_from,
      'date(applicationDecisionEnd)' => :decided_to
    }.each do |field, key|
      form.send(field.to_sym, params[key].strftime(date_format)) if params[key]
    end

    form.send(:"searchCriteria.description", params[:keywords])

    # These fields are not on every council's form: Bexley has no applicant
    # name, and only some Idox sites (eg Bolton) have a development type.
    {
      'searchCriteria.applicantName' => :applicant_name,
      'searchCriteria.caseType' => :application_type,
      'searchCriteria.developmentType' => :development_type
    }.each do |field, key|
      form.send(field.to_sym, params[key]) if form.has_field?(field)
    end
  end

  # Builds an Application from one li.searchresult node of a results page.
  def parse_idox_search_result(item)
    data = Application.new

    # The info line is a "|"-separated list of "Label: value" fragments.
    info_line = item.at("p.metaInfo").inner_text.strip
    info_line.split('|').each do |fragment|
      bit = fragment.strip.delete("\r\n")

      if (found = bit.match(/Ref\. No:\s+(.+)/))
        data.council_reference = found[1]
      end

      if (found = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
        data.date_received = Date.parse(found[2])
      end

      if (found = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/))
        data.date_validated = Date.parse(found[1])
      end

      if (found = bit.match(/Status:\s+(.+)/))
        data.status = found[1]
      end
    end

    link = item.at('a')
    data.scraped_at = Time.now
    data.info_url = base_url + link['href']
    data.address = item.at('p.address').inner_text.strip
    data.description = link.inner_text.strip
    data
  end
  # Fetches an application's summary page (app.info_url) and copies what it
  # finds onto app: document count/URL, the labelled values of
  # #simpleDetailsTable, and the associated-property link.
  #
  # Nothing on app is changed when the response status is not '200'; the
  # status is printed instead.
  #
  # @param app [Application] the application to enrich in place
  def parse_info_url(app)
    res = agent.get(app.info_url)
    unless res.code == '200' # the status code is a String, not an Integer
      puts "Error: HTTP #{res.code}"
      return
    end

    app.scraped_at = Time.now

    # The Documents tab doesn't show if there are no documents (we get
    # li.nodocuments instead). Bradford has #tab_documents but without the
    # document count on it, so the count stays 0 unless the text has digits.
    app.documents_count = 0
    documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
    count = documents_link && documents_link.inner_text[/\d+/]
    if count
      app.documents_count = count.to_i
      app.documents_url = base_url + documents_link[:href]
    end

    # Values are located by their th labels: row positions change from site
    # to site (or even app to app), so they can't be relied on.
    res.search('#simpleDetailsTable tr').each do |row|
      label = row.at('th')
      cell = row.at('td')
      next unless label && cell # ignore rows that are not a label/value pair

      key = label.inner_text.strip
      value = cell.inner_text.strip

      case key
      when 'Reference'
        app.council_reference = value
      when 'Alternative Reference', 'Planning Portal Reference'
        app.alternative_reference = value unless value.empty?
      when 'Application Received', 'Application Registered'
        app.date_received = Date.parse(value) if value.match(/\d/)
      when 'Application Validated'
        app.date_validated = Date.parse(value) if value.match(/\d/)
      when 'Address'
        app.address = value unless value.empty?
      when 'Proposal'
        app.description = value unless value.empty?
      when 'Status'
        app.status = value unless value.empty?
      when 'Decision'
        app.decision = value unless value.empty?
      when 'Decision Issued Date'
        app.date_decision = Date.parse(value) if value.match(/\d/)
      when 'Appeal Status'
        app.appeal_status = value unless value.empty?
      when 'Appeal Decision'
        app.appeal_decision = value unless value.empty?
      else
        puts "Error: key '#{key}' not found"
      end
    end

    # Associated property link: its text is read as the property count.
    property_association_link = res.at('p.associatedproperty a')
    if property_association_link
      app.property_url = base_url + property_association_link[:href]
      app.property_count = property_association_link.inner_text.to_i
    end
  end
  # Loads the associated-properties page at app.property_url and records the
  # absolute URL of every link found under "#Property li a" in
  # app.property_detail_urls. The list is reset to empty first, and stays
  # empty (with the status printed) when the response code is not '200'.
  def parse_property_url(app)
    app.property_detail_urls = []
    listing = agent.get(app.property_url)

    unless listing.code == '200'
      puts "Error: HTTP #{listing.code}"
      return
    end

    listing.search('#Property li a').each do |anchor|
      detail_url = base_url + anchor[:href]
      app.property_detail_urls.push(detail_url)
    end
  end
  # Visits every URL in app.property_detail_urls and builds one Property per
  # page that answers with status '200', reading the labelled rows of
  # "#propertyAddress". The results replace app.properties; pages with any
  # other status are reported with puts and skipped.
  #
  # @param app [Application] must already have property_detail_urls set
  def parse_property_detail_urls(app)
    # Row label => Property attribute, assigned only when the cell has text.
    # The UPRN is handled separately because it is stored even when blank.
    optional_fields = {
      'Full Address:' => :address,
      'Property Number:' => :number,
      'Street:' => :street,
      'Town:' => :town,
      'Postcode:' => :postcode,
      'Ward:' => :ward,
      'Parish:' => :parish
    }

    app.properties = []
    app.property_detail_urls.each do |property_url|
      res = agent.get(property_url)
      unless res.code == '200'
        puts "Error: HTTP #{res.code}"
        next
      end

      property = Property.new
      res.search('#propertyAddress tr').each do |row|
        label = row.at('th')
        cell = row.at('td')
        next unless label && cell # ignore rows that are not a label/value pair

        key = label.inner_text.strip
        value = cell.inner_text.strip

        if key == 'UPRN:'
          property.uprn = value
        elsif (attribute = optional_fields[key]) && !value.empty?
          property.public_send("#{attribute}=", value)
        end
      end
      app.properties << property
    end
  end
  208. end # class
  209. end