RB Kingston upon Thames planning applications
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

scraper.rb 3.1 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. require 'bundler'
  2. Bundler.setup
  3. require 'scraperwiki'
  4. require 'mechanize'
  5. require 'pp'
  6. require 'time'
  7. require 'date'
  8. require 'breasal'
  # Record column names follow the planningalerts.org.au conventions:
  # https://www.planningalerts.org.au/how_to_write_a_scraper

  # Name of the local authority, stored on every record as 'la_name'.
  LA_NAME = "Kingston upon Thames".freeze
  # GSS code of the authority, stored on every record as 'la_gss'.
  # https://mapit.mysociety.org/area/2480.html
  LA_GSS = "E09000021".freeze
  # Base of the council's planning pages; relative links are appended to it.
  BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/".freeze
  # Parse a single planning application node and save it as one record.
  #
  # app - node for one application. It must respond to `at` and `search`
  #       with CSS selectors; the driver script passes the elements matched
  #       by "#planningApplication".
  #
  # The record is written with ScraperWiki.save_sqlite, keyed on
  # 'council_reference'. Returns whatever that call returns.
  #
  # Raises NoMethodError if the node has no <h4> heading or fewer than five
  # <span> elements, so that a change in page layout fails loudly.
  def parse(app)
    record = {}
    record['la_name'] = LA_NAME
    record['la_gss'] = LA_GSS

    # The heading reads "<reference> - <type>". Limiting the split to two
    # parts keeps a type that itself contains " - " intact.
    record['council_reference'], record['type'] = app.at("h4").inner_text.split(' - ', 2)

    app.search("a").each do |link|
      href = link['href']
      next if href.nil? # anchors without an href carry no URL to classify

      href = href.strip
      record['info_url'] = BASEURL + href if href =~ /Details/
      # The map link is stored as found; the other links get BASEURL prepended.
      record['map_url'] = href if href =~ /\?map=/
      record['images_url'] = BASEURL + href if href =~ /ImageMenu/
      record['comment_url'] = BASEURL + href if href =~ /PlanningComments/
    end

    # Coordinates are only available when the map link carries x/y values.
    coords = record['map_url'] && record['map_url'].match(/x=(\d+)&y=(\d+)/)
    if coords
      record['easting'] = coords[1].to_i
      record['northing'] = coords[2].to_i
      en = Breasal::EastingNorthing.new(easting: record['easting'], northing: record['northing'], type: :gb)
      wgs84 = en.to_wgs84 # convert once, read both values
      record['latitude'] = wgs84[:latitude]
      record['longitude'] = wgs84[:longitude]
    end

    spans = app.search("span")
    record['description'] = spans[0].inner_text
    record['address'] = spans[1].inner_text
    record['ward'] = spans[2].inner_text

    # Decision and decision date: only recorded when the fifth span holds
    # "<decision> <d/m/yyyy>".
    decision = spans[4].inner_text.match(/(.+?)\s+(\d{1,2}\/\d{1,2}\/\d{4})/)
    if decision
      record['decision'] = decision[1]
      record['date_decision'] = Date.parse(decision[2])
    end

    # Comments/consultation - consultation end date can change during lifetime of application
    app.search("dd").each do |dd|
      closing = dd.inner_text.match(/The current closing date for comments on this application is (\d{1,2}-[A-Z][a-z]{2}-\d{4})/)
      record['on_notice_to'] = Date.parse(closing[1]) if closing
    end

    # Date valid: keep the raw text instead when it does not parse as a date.
    date_valid_text = spans[3].inner_text
    begin
      record['date_valid'] = Date.parse(date_valid_text)
      record['date_valid_text'] = nil
    rescue ArgumentError
      record['date_valid'] = nil
      record['date_valid_text'] = date_valid_text
    end

    # Scraper timestamps
    record['updated_at'] = Time.now
    record['date_scraped'] = Date.today.to_s

    ScraperWiki.save_sqlite(['council_reference'], record)
  end
  # Driver: walk backwards from today through twelve consecutive 30-day
  # search windows and save every application listed in each one.
  agent = Mechanize.new
  # NOTE(review): certificate verification is switched off, so the HTTPS
  # connection is not authenticated. Presumably a workaround for the
  # council server's certificate - confirm it is still required.
  agent.verify_mode = OpenSSL::SSL::VERIFY_NONE

  # Get all valid applications for the last 12 * 30 days
  windows = 12
  d = Date.today
  windows.times do |i|
    # Each window is 30 days inclusive: (d - 29)..d
    d_start = (d - 29).strftime("%d/%m/%Y")
    d_end = d.strftime("%d/%m/%Y")

    if ENV['SCRAPER_LOCAL']
      # Block form closes the fixture file once Nokogiri has parsed it.
      page = File.open("page.html") { |f| Nokogiri::HTML(f) }
    else
      url = "#{BASEURL}Summary?weekListType=SRCH&recFrom=#{d_start}&recTo=#{d_end}&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=500"
      page = agent.get(url)
      puts url
    end

    apps = page.search("#planningApplication")
    puts apps.size, ''
    apps.each { |app| parse(app) }

    # The local fixture does not depend on the date window, so a single
    # pass saves the same records without the repeated parsing and sleeps.
    break if ENV['SCRAPER_LOCAL']

    d -= 30
    # Pause between requests; nothing follows the final window.
    sleep 5 unless i == windows - 1
  end
  82. # page = Nokogiri::HTML(open("page.html"))