A Ruby gem to get planning applications data from UK council websites.
require 'mechanize'
require 'date'
require 'pp'

module UKPlanningScraper
  def self.scrape_idox(search_url, params, options)
    puts "Using Idox scraper."
    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]

    apps = []
    agent = Mechanize.new
    puts "Getting: #{search_url}"
    page = agent.get(search_url) # load the search form page

    # Fill out and submit the search form
    form = page.form('searchCriteriaForm')
    # form.action = form.action + '&searchCriteria.resultsPerPage=100'

    # Some councils don't have the received from/to dates on their form, eg Newham
    form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
    form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]

    form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
    form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]

    form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
    form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]

    form.send(:"searchCriteria.description", params[:keywords])

    # Some councils don't have the applicant name on their form, eg Bexley
    form.send(:"searchCriteria.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'

    form.send(:"searchCriteria.caseType", params[:application_type])

    page = form.submit
    loop do
      # Parse the search results on this page
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."

      items.each do |app|
        data = {}

        # Parse the info line
        info_line = app.at("p.metaInfo").inner_text.strip
        bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }

        bits.each do |bit|
          if (matches = bit.match(/Ref\. No:\s+(.+)/))
            data[:council_reference] = matches[1]
          end

          if (matches = bit.match(/(Received|Registered):\s+(.+)/))
            data[:date_received] = Date.parse(matches[2])
          end

          if (matches = bit.match(/Validated:\s+(.+)/))
            data[:date_validated] = Date.parse(matches[1])
          end

          if (matches = bit.match(/Status:\s+(.+)/))
            data[:status] = matches[1]
          end
        end

        data.merge!({
          scraped_at: Time.now,
          info_url: base_url + app.at('a')['href'],
          address: app.at('p.address').inner_text.strip,
          description: app.at('a').inner_text.strip,
        })

        apps << data
      end

      # Get the Next button from the pager, if there is one
      if (next_button = page.at('a.next'))
        next_url = base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
        sleep options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      else
        break
      end
    end
    # Scrape the summary tab for each app
    apps.each_with_index do |app, i|
      sleep options[:delay]
      puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
      res = agent.get(app[:info_url])

      if res.code == '200' # Mechanize returns the HTTP status code as a String, not an Integer
        # Parse the summary tab for this app
        app[:scraped_at] = Time.now

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead).
        # Bradford has #tab_documents but without the document count on it.
        app[:documents_count] = 0
        app[:documents_url] = nil

        if (documents_link = res.at('.associateddocument a'))
          if documents_link.inner_text.match(/\d+/)
            app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
            app[:documents_url] = base_url + documents_link[:href]
          end
        elsif (documents_link = res.at('#tab_documents'))
          if documents_link.inner_text.match(/\d+/)
            app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
            app[:documents_url] = base_url + documents_link[:href]
          end
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on them.
        res.search('#simpleDetailsTable tr').each do |row|
          key = row.at('th').inner_text.strip
          value = row.at('td').inner_text.strip

          case key
          when 'Reference'
            app[:council_reference] = value
          when 'Alternative Reference'
            app[:alternative_reference] = value
          when 'Planning Portal Reference'
            app[:alternative_reference] = value
          when 'Application Received'
            app[:date_received] = Date.parse(value) if value.match(/\d/)
          when 'Application Registered'
            app[:date_received] = Date.parse(value) if value.match(/\d/)
          when 'Application Validated'
            app[:date_validated] = Date.parse(value) if value.match(/\d/)
          when 'Address'
            app[:address] = value
          when 'Proposal'
            app[:description] = value
          when 'Status'
            app[:status] = value
          when 'Decision'
            app[:decision] = value
          when 'Decision Issued Date'
            app[:date_decision] = Date.parse(value) if value.match(/\d/)
          when 'Appeal Status'
            app[:appeal_status] = value
          when 'Appeal Decision'
            app[:appeal_decision] = value
          else
            puts "Error: unrecognised key '#{key}'"
          end # case
        end # each row
      else
        puts "Error: HTTP #{res.code}"
      end # if
    end # scrape summary tab for each app

    apps
  end # scrape_idox
end
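
As a rough guide to how this fits together, below is a minimal usage sketch that calls scrape_idox directly. The require name, search URL, dates, keywords and delay are illustrative assumptions rather than values taken from this file; a real council's Idox "online-applications" search URL would be needed in place of the placeholder.

# Minimal usage sketch (illustrative values throughout).
require 'uk_planning_scraper' # assumed require name for this gem
require 'date'

# Placeholder Idox advanced-search URL; substitute a real council's site.
search_url = 'https://planning.example.gov.uk/online-applications/search.do?action=advanced'

params = {
  received_from:    Date.today - 7,  # applications received in the last week
  received_to:      Date.today,
  keywords:         'extension',     # free-text search of the proposal description
  application_type: ''               # case type; valid values depend on the council's form
}

options = { delay: 10 } # seconds to sleep between requests

apps = UKPlanningScraper.scrape_idox(search_url, params, options)

apps.each do |app|
  puts [app[:council_reference], app[:status], app[:address]].join(' | ')
end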