A Ruby gem to get planning applications data from UK council websites.
Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.
 
 
 

80 Zeilen
2.9 KiB

  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  # Scrapes planning application listings from a council's online search form.
  module UKPlanningScraper
    # Submits the search form found at +search_url+ and walks every page of
    # results, collecting one Hash per listed application.
    #
    # The merged options, the search URL and the derived base URL are stored in
    # instance variables on the module itself (@options, @search_url,
    # @base_url), so they are overwritten by each call.
    #
    # @param search_url [String] URL of the page containing the form named
    #   'searchCriteriaForm'. Must contain "http(s)://host/" so that relative
    #   result links can be made absolute.
    # @param criteria [Hash] search criteria. Recognised keys:
    #   :received_from, :received_to, :validated_from, :validated_to (objects
    #   responding to +strftime+; only sent when present), :description,
    #   :applicant_name (only sent when the form has that field) and
    #   :application_type.
    # @param options [Hash] :delay is the number of seconds slept before each
    #   follow-up results page is requested (default 10).
    # @return [Array<Hash>] one Hash per application with the keys
    #   :council_reference, :scraped_at, :date_received, :date_validated,
    #   :info_url, :address, :description and :status. The reference, dates and
    #   status are nil when the listing does not contain them.
    # @raise [ArgumentError] if no base URL can be extracted from +search_url+
    def self.search(search_url, criteria, options = {})
      default_options = {
        delay: 10,
      }
      @options = default_options.merge(options) # The user-supplied options override the defaults
      @search_url = search_url
      @base_url = extract_base_url(search_url)
      apps = []

      agent = Mechanize.new
      puts "Getting: #{@search_url}"
      page = agent.get(@search_url) # load the search form page

      # Fill out and submit search form
      form = page.form('searchCriteriaForm')

      # Some councils don't have the received from/to dates on their form, eg
      # Newham, so a date field is only touched when the caller asked for it.
      {
        received_from: 'date(applicationReceivedStart)',
        received_to: 'date(applicationReceivedEnd)',
        validated_from: 'date(applicationValidatedStart)',
        validated_to: 'date(applicationValidatedEnd)'
      }.each do |key, field_name|
        form.send(field_name, criteria[key].strftime("%d/%m/%Y")) if criteria[key]
      end

      form.send('searchCriteria.description', criteria[:description])

      # Some councils don't have the applicant name on their form, eg Bexley
      if form.has_field? 'searchCriteria.applicantName'
        form.send('searchCriteria.applicantName', criteria[:applicant_name])
      end

      form.send('searchCriteria.caseType', criteria[:application_type])
      page = form.submit

      loop do
        # Parse search results
        items = page.search('li.searchresult')
        puts "Found #{items.size} apps on this page."

        items.each do |app|
          meta_node = app.at('p.metaInfo')
          meta = parse_meta_info(meta_node ? meta_node.inner_html : '')
          link = app.at('a')

          apps << {
            council_reference: meta[:council_reference],
            scraped_at: Time.now,
            date_received: meta[:date_received],
            date_validated: meta[:date_validated],
            info_url: @base_url + link['href'],
            address: app.at('p.address').inner_text.strip,
            description: link.inner_text.strip,
            status: meta[:status]
          }
        end

        # Get the Next button from the pager, if there is one
        next_button = page.at('a.next')
        break unless next_button

        next_url = @base_url + next_button[:href]
        sleep @options[:delay]
        puts "Getting: #{next_url}"
        page = agent.get(next_url)
      end

      apps
    end

    # Returns the scheme-and-host prefix of +search_url+ (everything before the
    # first "/" that follows "http(s)://"), used to absolutise relative links.
    def self.extract_base_url(search_url)
      match = search_url.match(%r{(https?://.+?)/})
      raise ArgumentError, "Can't extract a base URL from: #{search_url.inspect}" unless match

      match[1]
    end
    private_class_method :extract_base_url

    # Extracts reference, received/validated dates and status from the HTML of
    # a result's meta paragraph.
    #
    # The combined pattern is tried first. It fails for listings that lack one
    # of the labels (eg Newham, Greenwich, Tower Hamlets have no Received
    # date), in which case each label is looked up on its own and missing ones
    # are returned as nil instead of aborting the whole search.
    def self.parse_meta_info(html)
      combined = html.match(/Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/)

      if combined
        reference, received, validated, status = combined.captures
      else
        # Stop each value at a tag, a "|" divider or a line break so that it
        # cannot swallow a neighbouring field.
        reference = html[/Ref\. No:\s+([^<|\r\n]+)/, 1]
        received = html[/Received:\s+([^<|\r\n]+)/, 1]
        validated = html[/Validated:\s+([^<|\r\n]+)/, 1]
        status = html[/Status:\s+([^<|\r\n]+)/, 1]
      end

      {
        council_reference: reference && reference.strip,
        date_received: received && Date.parse(received),
        date_validated: validated && Date.parse(validated),
        status: status && status.strip
      }
    end
    private_class_method :parse_meta_info
  end