Merton Council planning applications

require 'http'
require 'nokogiri'
require 'uri'
require 'scraperwiki'
require 'pp'
require_relative './parser'
require 'date'
require 'logger'
require 'securerandom'

# Northgate Planning Explorer
SITE_URL = 'http://planning.merton.gov.uk'
BASE_URL = SITE_URL + '/Northgate/PlanningExplorerAA/Generic/'
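# BASE_URL is the prefix for the relative application links found in the search results table.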

# Delay between requests, in seconds. Conservatively slow by default: scrapes approx 360 pages per hour.
# Note the parenthesised fallback: nil.to_f is 0, so ENV['SCRAPER_DELAY'].to_f || 10 would never fall back.
DELAY_S = (ENV['SCRAPER_DELAY'] || 10).to_f
USER_AGENT = ENV['SCRAPER_USER_AGENT']
DATE_REGEX = /\d{2}-\d{2}-\d{4}/

def crawl_delay
  sleep DELAY_S
end

$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
logger = Logger.new($stdout)
logger.level = (ENV['SCRAPER_LOG_LEVEL'] || Logger::INFO).to_i
logger.info "Scraper starts. Let's do this."
logger.info "Delay between requests is #{DELAY_S} seconds."
logger.info "User agent is: #{USER_AGENT}"
logger.info "Log level is: #{logger.level}"

# General search
URL = SITE_URL + '/Northgate/PlanningExplorerAA/GeneralSearch.aspx'

form_vars = {
  # 'cboStatusCode' => '4', # REGISTERED
  'cboSelectDateValue' => 'DATE_RECEIVED',
  # 'cboMonths' => '12', # 1..12
  'cboDays' => 1,
  'rbGroup' => 'rbDay',
  'csbtnSearch' => 'Search' # required
}
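# rbGroup => 'rbDay' with cboDays => 1 appears to ask for applications received in the last day;
# the commented-out cboMonths and cboStatusCode fields look like the hooks for a wider or
# status-filtered search.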
logger.info "Form variables: #{form_vars}"

headers = {
  'Origin' => SITE_URL,
  'Referer' => URL,
  'User-Agent' => USER_AGENT
}
logger.debug "HTTP request headers:"
logger.debug(headers.to_s)

logger.debug "GET: " + URL
response = HTTP.headers(headers).get(URL)
logger.debug "Response code: HTTP " + response.code.to_s
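
# The search page is an ASP.NET WebForms page, so the hidden __VIEWSTATE, __VIEWSTATEGENERATOR
# and __EVENTVALIDATION fields have to be read from the form and posted back with our search.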
if response.code == 200
  doc = Nokogiri::HTML(response.to_s)
  asp_vars = {
    '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
    '__VIEWSTATEGENERATOR' => doc.at('#__VIEWSTATEGENERATOR')['value'],
    '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
  }
else
  logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
  exit 1
end
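
# Carry the session cookies set by the search page through the rest of the crawl, so the server
# keeps treating our requests as one ASP.NET session.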
cookies = {}
response.cookies.each { |c| cookies[c.name] = c.value }

form_vars.merge!(asp_vars)

logger.debug "POST: " + URL
response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
logger.debug "Response code: HTTP " + response2.code.to_s
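
# A successful search POST answers with a 302 redirect to the results page. The http gem doesn't
# follow redirects by default, which suits us: we want to rewrite the results URL before fetching it.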
if response2.code == 302
  # Follow the redirect manually.
  # Set the page size (PS) to max so we don't have to page through search results.
  results_url = URI::encode(SITE_URL + response2.headers['Location'].gsub('PS=10', 'PS=99999'))
  logger.debug "GET: " + results_url
  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
  logger.debug "Response code: HTTP " + response3.code.to_s
  doc = Nokogiri::HTML(response3.to_s)
else
  logger.fatal "Didn't get redirected from search. Exiting."
  exit 1
end

rows = doc.search("table.display_table tr")
logger.info "Found #{rows.size - 1} applications in search results."

app_defaults = {
  'la_name' => 'Merton Borough Council',
  'la_slug' => 'merton',
  'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
  'date_details_scraped' => nil,
  'date_documents_scraped' => nil,
  'date_dates_scraped' => nil
}
logger.debug "Application defaults: "
logger.debug app_defaults.to_s
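
# Each result row's cells are, in order: reference, address, description, status, date received,
# decision. Rows are upserted into the applications table keyed on council_reference, so an
# application already seen on a previous run is updated rather than duplicated.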
# Iterate over search results
rows.each do |row|
  if row.at("td") # skip the header row, which only has th's
    cells = row.search("td")
    ref = cells[0].inner_text.strip

    app = app_defaults.merge(
      'created_at' => Time.now.to_s,
      'uuid' => SecureRandom.uuid
    )

    begin
      res = ScraperWiki.select("* from applications where council_reference=?", ref)
    rescue
      # The applications table doesn't exist yet on the first run; carry on with no existing record.
      res = nil
    end
    app = res[0] if res && res[0] # res will be nil if the table doesn't exist; [] if that record doesn't exist

    app['council_reference'] = ref
    app['info_url'] = URI::encode(BASE_URL + cells[0].at("a")['href'].strip)
    app['info_url'].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
    app['address'] = cells[1].inner_text.strip
    app['description'] = cells[2].inner_text.strip
    app['status'] = cells[3].inner_text.strip

    raw_date_received = cells[4].inner_text.strip
    if raw_date_received != '--'
      app['date_received'] = Date.parse(raw_date_received)
    else
      app['date_received'] = nil
    end

    app['decision'] = cells[5].inner_text.strip
    app['date_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
end
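
# Three follow-up passes fetch the details, key dates and documents pages for each application.
# Each pass is throttled by crawl_delay and stamps a date_*_scraped column, so applications
# already handled today are skipped and everything is revisited on a later day.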
# Scrape details for all apps that don't have them
apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping details for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping details for app: #{app['council_reference']}."
  crawl_delay

  # Scrape details page
  res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])
  if res.code == 200
    # Parse details page
    parsed_details = parse_details(res.to_s)
    app.merge!(parsed_details)
    app['date_details_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['info_url']} - HTTP " + res.code.to_s # FIXME improve message
  end
end
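
# parse_details (from parser.rb) presumably merges dates_url and documents_url into each
# application record; the next two passes rely on those fields being present.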
# Scrape dates page for apps that don't have them
apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping dates for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping dates for #{app['council_reference']}."
  crawl_delay

  # Scrape dates page
  res = HTTP.headers(headers).cookies(cookies).get(app['dates_url'])
  if res.code == 200
    # Parse dates page
    parsed_dates = parse_dates(res.to_s)
    app.merge!(parsed_dates)
    app['date_dates_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['dates_url']} - HTTP " + res.code.to_s
  end
end
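
# Documents go into their own 'documents' table, keyed on council_reference + url, so re-running
# the scraper updates existing document rows instead of duplicating them.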
# Scrape documents for apps that don't have them
apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
logger.info "Scraping documents for #{apps.size} applications."
i = 0
apps.each do |app|
  i += 1
  logger.info "#{i} of #{apps.size}: Scraping documents for #{app['council_reference']}."
  crawl_delay

  # Scrape documents page
  res = HTTP.headers(headers).cookies(cookies).get(app['documents_url'])
  if res.code == 200
    # Parse documents page
    docs = parse_documents(res.to_s)
    docs.each do |d|
      d['council_reference'] = app['council_reference']
      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
    end
    app['documents_qty'] = docs.size
    app['date_documents_scraped'] = Date.today.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  else
    logger.error "Failed to get #{app['documents_url']} - HTTP " + res.code.to_s
  end
end

logger.info "Scraper finishes. We did it."
logger.close