A Ruby gem for getting planning application data from UK council websites.
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 

80 linhas
2.9 KiB

  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  module UKPlanningScraper
    # Submits the search form on a council planning search page and collects
    # every application listed on the result pages, following the "next" pager
    # link until there is none.
    #
    # Progress messages are printed with +puts+. The merged options, the search
    # URL and the base URL are also stored in module-level instance variables
    # (@options, @search_url, @base_url), so concurrent calls share that state.
    #
    # @param search_url [String] URL of the page holding the
    #   'searchCriteriaForm' form; must contain an http(s) scheme and host.
    # @param criteria [Hash] search criteria. Recognised keys:
    #   :received_from, :received_to, :validated_from, :validated_to (objects
    #   responding to +strftime+, sent as dd/mm/yyyy), :description,
    #   :applicant_name (only sent when the form has that field) and
    #   :application_type.
    # @param options [Hash] :delay is the number of seconds to sleep before
    #   requesting each further result page (default 10).
    # @return [Array<Hash>] one hash per application with the keys
    #   :council_reference, :scraped_at, :date_received, :date_validated,
    #   :info_url, :address, :description and :status. The reference, both
    #   dates and the status are nil when the result's metadata line could not
    #   be parsed.
    # @raise [ArgumentError] if no http(s) base URL can be found in search_url.
    # @raise [RuntimeError] if the page has no 'searchCriteriaForm' form.
    def self.search(search_url, criteria, options = {})
      default_options = {
        delay: 10,
      }
      @options = default_options.merge(options) # The user-supplied options override the defaults
      @search_url = search_url

      # Scheme and host only. Result and pager hrefs are appended to this
      # verbatim, so they are assumed to be root-relative paths.
      base_url = search_url[%r{https?://[^/?#]+}]
      raise ArgumentError, "No http(s) base URL found in #{search_url.inspect}" unless base_url

      @base_url = base_url

      agent = Mechanize.new
      puts "Getting: #{@search_url}"
      page = agent.get(@search_url) # load the search form page

      form = page.form('searchCriteriaForm')
      raise "No 'searchCriteriaForm' form found at #{@search_url}" unless form

      # Disabled: form.action = form.action + '&searchCriteria.resultsPerPage=100'
      fill_search_form(form, criteria)
      page = form.submit

      apps = []
      loop do
        items = page.search('li.searchresult')
        puts "Found #{items.size} apps on this page."
        items.each { |item| apps << parse_search_result(item, base_url) }

        # Get the Next button from the pager, if there is one
        next_button = page.at('a.next')
        break unless next_button

        next_url = base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
        sleep @options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      end
      apps
    end

    # Copies the search criteria into the form's fields.
    #
    # Fields are set by sending the field name to the form, as the form object
    # exposes its fields as methods. NOTE(review): a date criterion supplied
    # for a form lacking that field (eg Newham has no received from/to dates)
    # is expected to raise NoMethodError rather than be silently dropped.
    def self.fill_search_form(form, criteria)
      date_fields = {
        received_from: 'date(applicationReceivedStart)',
        received_to: 'date(applicationReceivedEnd)',
        validated_from: 'date(applicationValidatedStart)',
        validated_to: 'date(applicationValidatedEnd)'
      }
      date_fields.each do |key, field_name|
        form.send(field_name, criteria[key].strftime('%d/%m/%Y')) if criteria[key]
      end

      form.send('searchCriteria.description', criteria[:description])
      # Some councils don't have the applicant name on their form, eg Bexley
      if form.has_field?('searchCriteria.applicantName')
        form.send('searchCriteria.applicantName', criteria[:applicant_name])
      end
      form.send('searchCriteria.caseType', criteria[:application_type])
    end
    private_class_method :fill_search_form

    # Builds the application hash for one 'li.searchresult' node.
    #
    # When the node's 'p.metaInfo' text is missing or does not match the
    # expected layout, the reference, dates and status are left nil and a
    # warning is written to stderr, instead of aborting the whole search.
    def self.parse_search_result(item, base_url)
      # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
      meta_regex = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

      link = item.at('a')
      info_url = base_url + link['href']

      meta_node = item.at('p.metaInfo')
      matches = meta_node && meta_node.inner_html.match(meta_regex)
      warn "Could not parse reference, dates and status for #{info_url}; leaving them nil" unless matches

      {
        council_reference: matches && matches[1].strip,
        scraped_at: Time.now,
        date_received: matches && Date.parse(matches[2]),
        date_validated: matches && Date.parse(matches[3]),
        info_url: info_url,
        address: item.at('p.address').inner_text.strip,
        description: link.inner_text.strip,
        status: matches && matches[4].strip
      }
    end
    private_class_method :parse_search_result
  end