A Ruby gem to get planning applications data from UK council websites.
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

175 linhas
6.7 KiB

  require 'date'
  require 'mechanize'
  require 'pp'
  module UKPlanningScraper
    class Authority
      # strftime pattern used for dates entered into the Idox search form.
      IDOX_DATE_FORMAT = '%d/%m/%Y'

      # Search param => name of the Idox form field that receives it.
      IDOX_DATE_FIELDS = {
        received_from: 'date(applicationReceivedStart)',
        received_to: 'date(applicationReceivedEnd)',
        validated_from: 'date(applicationValidatedStart)',
        validated_to: 'date(applicationValidatedEnd)',
        decided_from: 'date(applicationDecisionStart)',
        decided_to: 'date(applicationDecisionEnd)'
      }.freeze

      # Form fields that only some councils have => search param written to them.
      # Bexley has no applicant name field; some sites (eg Bolton) call the
      # case type field 'searchCriteria.developmentType'.
      IDOX_OPTIONAL_FIELDS = {
        'searchCriteria.applicantName' => :applicant_name,
        'searchCriteria.caseType' => :application_type,
        'searchCriteria.developmentType' => :application_type
      }.freeze

      # Summary tab row label => app key, for values stored as plain text.
      IDOX_SUMMARY_TEXT_FIELDS = {
        'Reference' => :council_reference,
        'Alternative Reference' => :alternative_reference,
        'Planning Portal Reference' => :alternative_reference,
        'Address' => :address,
        'Proposal' => :description,
        'Status' => :status,
        'Decision' => :decision,
        'Appeal Status' => :appeal_status,
        'Appeal Decision' => :appeal_decision
      }.freeze

      # Summary tab row label => app key, for values parsed into a Date.
      IDOX_SUMMARY_DATE_FIELDS = {
        'Application Received' => :date_received,
        'Application Registered' => :date_received,
        'Application Validated' => :date_validated,
        'Decision Issued Date' => :date_decision
      }.freeze

      private_constant :IDOX_DATE_FORMAT, :IDOX_DATE_FIELDS, :IDOX_OPTIONAL_FIELDS,
                       :IDOX_SUMMARY_TEXT_FIELDS, :IDOX_SUMMARY_DATE_FIELDS

      private

      # Searches the Idox site at @url and scrapes every matching application.
      #
      # Submits the search form, walks all result pages via the pager's Next
      # link, then fetches each application's summary tab for further details.
      #
      # @param params [Hash] search criteria. Keys read here: :received_from,
      #   :received_to, :validated_from, :validated_to, :decided_from,
      #   :decided_to (objects responding to #strftime), :keywords,
      #   :applicant_name and :application_type.
      # @param options [Hash] :delay is passed to Kernel#sleep between requests.
      # @return [Array<Hash>] one hash per application; empty if the search
      #   form is missing from the page.
      def scrape_idox(params, options)
        puts "Using Idox scraper."
        base_url = @url.match(%r{(https?://.+?)/})[1]
        agent = Mechanize.new

        puts "Getting: #{@url}"
        page = agent.get(@url) # load the search form page

        # Check that the search form is actually present.
        # When Idox has an internal error it returns an error page with HTTP 200.
        unless (form = page.form('searchCriteriaForm'))
          puts "Error: Search form page failed to load due to Idox internal error."
          return []
        end

        idox_fill_search_form(form, params)
        page = form.submit

        apps = []
        loop do
          items = page.search('li.searchresult')
          puts "Found #{items.size} apps on this page."
          items.each do |item|
            data = idox_parse_search_result(item, base_url)
            apps << data if data
          end

          # Follow the Next button from the pager, if there is one
          next_button = page.at('a.next')
          break unless next_button

          next_url = base_url + next_button[:href]
          sleep options[:delay]
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        end

        # Scrape the summary tab for each app
        apps.each_with_index do |app, i|
          sleep options[:delay]
          puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
          res = agent.get(app[:info_url])
          if res.code == '200' # the response code is a String, not an Integer
            idox_parse_summary(res, app, base_url)
          else
            puts "Error: HTTP #{res.code}"
          end
        end

        apps
      end

      # Writes the search params into the Idox search form.
      #
      # Date fields are set only when the matching param is given; if the form
      # lacks such a field the setter raises, so a requested date filter is
      # never dropped silently. Fields in IDOX_OPTIONAL_FIELDS are set only when
      # the form has them.
      def idox_fill_search_form(form, params)
        IDOX_DATE_FIELDS.each do |param, field|
          date = params[param]
          form.send(field, date.strftime(IDOX_DATE_FORMAT)) if date
        end

        form.send('searchCriteria.description', params[:keywords])

        IDOX_OPTIONAL_FIELDS.each do |field, param|
          form.send(field, params[param]) if form.has_field?(field)
        end
      end

      # Builds the app hash for one 'li.searchresult' node.
      # Returns nil (after logging) when the result has no link to follow.
      def idox_parse_search_result(item, base_url)
        link = item.at('a')
        unless link && link['href']
          puts "Error: search result has no link, skipping."
          return nil
        end

        data = {}

        # The info line is a '|'-separated list of "Label: value" pairs.
        meta = item.at('p.metaInfo')
        bits = meta ? meta.inner_text.strip.split('|').map { |e| e.strip.delete("\r\n") } : []
        bits.each do |bit|
          if (matches = bit.match(/Ref\. No:\s+(.+)/))
            data[:council_reference] = matches[1]
          end

          if (matches = bit.match(/(Received|Registered):\s+(.+)/))
            date = idox_parse_date(matches[2])
            data[:date_received] = date if date
          end

          if (matches = bit.match(/Validated:\s+(.+)/))
            date = idox_parse_date(matches[1])
            data[:date_validated] = date if date
          end

          if (matches = bit.match(/Status:\s+(.+)/))
            data[:status] = matches[1]
          end
        end

        address = item.at('p.address')
        data.merge!(
          scraped_at: Time.now,
          info_url: base_url + link['href'],
          address: address ? address.inner_text.strip : nil,
          description: link.inner_text.strip
        )
      end

      # Updates +app+ in place from an application's summary tab page.
      def idox_parse_summary(res, app, base_url)
        app[:scraped_at] = Time.now

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
        # Bradford has #tab_documents but without the document count on it
        app[:documents_count] = 0
        app[:documents_url] = nil
        documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
        if documents_link && (count = documents_link.inner_text[/\d+/])
          app[:documents_count] = count.to_i
          href = documents_link[:href]
          app[:documents_url] = base_url + href if href
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
        res.search('#simpleDetailsTable tr').each do |row|
          label = row.at('th')
          cell = row.at('td')
          next unless label && cell # skip rows that are not label/value pairs

          key = label.inner_text.strip
          value = cell.inner_text.strip

          if (field = IDOX_SUMMARY_TEXT_FIELDS[key])
            app[field] = value
          elsif (field = IDOX_SUMMARY_DATE_FIELDS[key])
            date = idox_parse_date(value)
            app[field] = date if date
          else
            puts "Error: key '#{key}' not found"
          end
        end
      end

      # Parses +value+ as a Date, or returns nil when it contains no digit
      # (for example a placeholder such as "Not Available").
      def idox_parse_date(value)
        Date.parse(value) if value =~ /\d/
      end
    end # class
  end