A Ruby gem to get planning applications data from UK council websites.
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 

80 行
2.8 KiB

  1. require "uk_planning_scraper/version"
  2. require 'mechanize'
  3. require 'time'
  4. require 'logger'
  5. require 'pp'
  6. module UKPlanningScraper
  7. def self.search(search_url, params, options = {})
  8. default_options = {
  9. delay: 10,
  10. }
  11. @options = default_options.merge(options) # The user-supplied options override the defaults
  12. @search_url = search_url
  13. @base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
  14. apps = []
  15. # Regex doesn't work for Newham, Greenwich, Tower Hamlets which don't have the Received date in the text
  16. meta_regex = /Ref\. No:\s+(.+)\s+.+\s+Received:\s+(.+)\s+.+\s+Validated:\s+(.+)\s+.+\s+Status:\s+(.+)/
  17. agent = Mechanize.new
  18. puts "Getting: #{@search_url}"
  19. page = agent.get(@search_url) # load the search form page
  20. # Fill out and submit search form
  21. form = page.form('searchCriteriaForm')
  22. # form.action = form.action + '&searchCriteria.resultsPerPage=100'
  23. # Some councils don't have the received from/to dates on their form, eg Newham
  24. form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
  25. form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
  26. form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
  27. form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
  28. form.send(:"searchCriteria\.description", params[:description])
  29. # Some councils don't have the applicant name on their form, eg Bexley
  30. form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
  31. form.send(:"searchCriteria\.caseType", params[:application_type])
  32. page = form.submit
  33. loop do
  34. # Parse search results
  35. items = page.search('li.searchresult')
  36. puts "Found #{items.size} apps on this page."
  37. items.each do |app|
  38. matches = app.at("p.metaInfo").inner_html.match(meta_regex)
  39. data = {
  40. council_reference: matches[1].strip,
  41. scraped_at: Time.now,
  42. date_received: Date.parse(matches[2]),
  43. date_validated: Date.parse(matches[3]),
  44. info_url: @base_url + app.at('a')['href'],
  45. address: app.at('p.address').inner_text.strip,
  46. description: app.at('a').inner_text.strip,
  47. status: matches[4].strip
  48. }
  49. apps << data
  50. end
  51. # Get the Next button from the pager, if there is one
  52. if next_button = page.at('a.next')
  53. next_url = @base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
  54. sleep @options[:delay]
  55. puts "Getting: #{next_url}"
  56. page = agent.get(next_url)
  57. else
  58. break
  59. end
  60. end
  61. apps
  62. end
  63. end