A Ruby gem to get planning applications data from UK council websites.
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 

80 lignes
2.9 KiB

  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  # Scrapes planning application search results from a council planning
  # website that exposes a `searchCriteriaForm` search form and lists results
  # as `li.searchresult` items.
  module UKPlanningScraper
    # Matches a complete meta line: reference, received date, validated date
    # and status, each separated by one intervening line of markup.
    META_REGEX = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/

    # Per-label fallbacks, in the same order as the META_REGEX captures. Used
    # only when META_REGEX does not match, e.g. when the Received date is
    # missing from the meta line.
    META_FIELD_REGEXES = [
      /Ref\. No:\s+([^<|\r\n]+)/,
      /Received:\s+([^<|\r\n]+)/,
      /Validated:\s+([^<|\r\n]+)/,
      /Status:\s+([^<|\r\n]+)/
    ].freeze

    # Form field name => criteria key for the optional date filters.
    DATE_FIELDS = {
      'date(applicationReceivedStart)' => :received_from,
      'date(applicationReceivedEnd)' => :received_to,
      'date(applicationValidatedStart)' => :validated_from,
      'date(applicationValidatedEnd)' => :validated_to
    }.freeze

    private_constant :META_REGEX, :META_FIELD_REGEXES, :DATE_FIELDS

    # Runs a search and collects the applications from every page of results.
    #
    # @param search_url [String] URL of the search form page; must contain a
    #   path separator after the host so the base URL can be derived
    # @param criteria [Hash] search criteria. Recognised keys:
    #   :received_from, :received_to, :validated_from, :validated_to (objects
    #   responding to #strftime), :description, :applicant_name,
    #   :application_type
    # @param options [Hash] overrides for the defaults; :delay is the number
    #   of seconds to sleep before requesting each further page (default 10)
    # @return [Array<Hash>] one hash per application with the keys
    #   :council_reference, :scraped_at, :date_received, :date_validated,
    #   :info_url, :address, :description and :status. Meta fields that are
    #   absent from a result's meta line are nil.
    def self.search(search_url, criteria, options = {})
      default_options = { delay: 10 }
      @options = default_options.merge(options) # user-supplied options win
      @search_url = search_url
      @base_url = search_url.match(%r{(https?://.+?)/})[1]

      agent = Mechanize.new
      puts "Getting: #{@search_url}"
      page = agent.get(@search_url) # load the search form page
      page = submit_search_form(page.form('searchCriteriaForm'), criteria)

      apps = []
      loop do
        items = page.search('li.searchresult')
        puts "Found #{items.size} apps on this page."
        items.each { |item| apps << parse_search_result(item) }

        # Follow the pager's Next link until there isn't one.
        next_button = page.at('a.next')
        break unless next_button

        next_url = @base_url + next_button[:href]
        sleep @options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      end
      apps
    end

    # Fills in the search form from +criteria+ and submits it.
    # Returns the first page of results.
    def self.submit_search_form(form, criteria)
      # Date fields are only touched when the caller supplied the criterion,
      # because some councils' forms do not have them.
      DATE_FIELDS.each do |field_name, key|
        date = criteria[key]
        form.send(field_name, date.strftime('%d/%m/%Y')) if date
      end

      form.send(:'searchCriteria.description', criteria[:description])

      # Some councils don't have the applicant name on their form.
      if form.has_field?('searchCriteria.applicantName')
        form.send(:'searchCriteria.applicantName', criteria[:applicant_name])
      end

      form.send(:'searchCriteria.caseType', criteria[:application_type])
      form.submit
    end

    # Builds the application hash for a single `li.searchresult` node.
    def self.parse_search_result(item)
      meta = parse_meta(item.at('p.metaInfo'))
      link = item.at('a')
      {
        council_reference: meta[:council_reference],
        scraped_at: Time.now,
        date_received: meta[:date_received],
        date_validated: meta[:date_validated],
        info_url: @base_url + link['href'],
        address: item.at('p.address').inner_text.strip,
        description: link.inner_text.strip,
        status: meta[:status]
      }
    end

    # Extracts reference, dates and status from the meta node's HTML.
    # Tries META_REGEX first; if the line is incomplete, each label is looked
    # up on its own and missing ones come back as nil instead of raising.
    def self.parse_meta(meta_node)
      html = meta_node ? meta_node.inner_html : ''
      full = html.match(META_REGEX)

      reference, received, validated, status =
        if full
          full.captures
        else
          META_FIELD_REGEXES.map { |regex| html[regex, 1] }
        end

      {
        council_reference: reference && reference.strip,
        date_received: received && Date.parse(received),
        date_validated: validated && Date.parse(validated),
        status: status && status.strip
      }
    end

    private_class_method :submit_search_form, :parse_search_result, :parse_meta
  end