Merton Council planning applications
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper.rb 7.6 KiB

4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. require 'http'
  2. require 'nokogiri'
  3. require 'uri'
  4. require 'scraperwiki'
  5. require 'pp'
  6. require_relative './parser'
  7. require 'date'
  8. require 'logger'
  9. require 'securerandom'
  # Northgate Planning Explorer.
  # SITE_URL is the root of the council's planning site; BASE_URL is the prefix
  # that relative links taken from the search results are appended to.
  SITE_URL = 'https://planning.merton.gov.uk'
  BASE_URL = "#{SITE_URL}/Northgate/PlanningExplorerAA/Generic/"
  # Throttles the crawl: blocks for DELAY_S seconds.
  # Called before each per-application page request.
  def crawl_delay
    Kernel.sleep(DELAY_S)
  end
  # Seconds to wait between requests, taken from MORPH_DELAY.
  # Conservatively slow by default (10s). Scrapes approx 360 pages per hour.
  # nil.to_f is 0.0 and 0.0 is truthy in Ruby, so a trailing `|| 10` can never
  # supply the default; test for a missing or blank value explicitly instead.
  raw_delay = ENV['MORPH_DELAY'].to_s.strip
  DELAY_S = raw_delay.empty? ? 10.0 : raw_delay.to_f
  USER_AGENT = ENV['MORPH_USER_AGENT']
  DATE_REGEX = /\d{2}-\d{2}-\d{4}/

  $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
  logger = Logger.new($stdout)

  # Numeric Logger severity from MORPH_LOG_LEVEL. Same pitfall as the delay:
  # nil.to_i is 0 (Logger::DEBUG), so fall back to INFO explicitly when unset.
  raw_log_level = ENV['MORPH_LOG_LEVEL'].to_s.strip
  logger.level = raw_log_level.empty? ? Logger::INFO : raw_log_level.to_i

  logger.info "Scraper starts. Let's do this."
  logger.info "Delay between requests is #{DELAY_S} seconds."
  logger.info "User agent is: #{USER_AGENT}"
  logger.info "Log level is: #{logger.level}"
  # General search
  URL = "#{SITE_URL}/Northgate/PlanningExplorerAA/GeneralSearch.aspx"

  # Fields posted to the search form; the date window and status filter are
  # filled in from the environment below.
  form_vars = {
    'cboSelectDateValue' => 'DATE_RECEIVED',
    'csbtnSearch' => 'Search' # required
  }

  months_back = ENV['MORPH_MONTHS']
  days_back = ENV['MORPH_DAYS']

  # Without a date window there is nothing to search for.
  if days_back.nil? && months_back.nil?
    logger.fatal "Neither MORPH_MONTHS nor MORPH_DAYS set. Nothing to scrape. Exiting."
    exit 1
  end

  # Months are applied first so the day settings can overwrite them:
  # if both MORPH_DAYS and MORPH_MONTHS are set, MORPH_DAYS is used.
  if months_back
    form_vars['cboMonths'] = months_back
    form_vars['rbGroup'] = 'rbMonth'
  end

  if days_back
    form_vars['cboMonths'] = nil
    form_vars['cboDays'] = days_back
    form_vars['rbGroup'] = 'rbDay'
  end

  # Optional status filter.
  status_filter = ENV['MORPH_STATUS']
  form_vars['cboStatusCode'] = status_filter if status_filter

  logger.info "Form variables: #{form_vars}"
  # Headers sent with every request to the planning site.
  headers = {
    'Origin' => SITE_URL,
    'Referer' => URL,
    'User-Agent' => USER_AGENT
  }
  logger.debug "HTTP request headers:"
  logger.debug headers.to_s

  # Load the search page first: it supplies the session cookies and the hidden
  # ASP.NET form fields that are sent back with the search POST.
  logger.debug "GET: #{URL}"
  response = HTTP.headers(headers).get(URL)
  logger.debug "Response code: HTTP #{response.code}"

  unless response.code == 200
    logger.fatal "Bad response from search page. Response code: #{response.code}. Exiting."
    exit 1
  end

  doc = Nokogiri::HTML(response.to_s)
  asp_vars = %w[__VIEWSTATE __VIEWSTATEGENERATOR __EVENTVALIDATION].each_with_object({}) do |field, vars|
    # Each hidden input's id is the field name itself.
    vars[field] = doc.at("##{field}")['value']
  end
  # Carry the cookies from the initial GET over to all later requests.
  cookies = {}
  response.cookies.each { |c| cookies[c.name] = c.value }

  form_vars.merge!(asp_vars)

  # Submit the search form. A successful search answers with a redirect to the
  # results page.
  logger.debug "POST: " + URL
  response2 = HTTP.headers(headers).cookies(cookies).post(URL, :form => form_vars)
  logger.debug "Response code: HTTP " + response2.code.to_s

  location = response2.headers['Location']
  unless response2.code == 302 && location
    logger.fatal "Didn't get redirected from search. Exiting."
    exit 1
  end

  # Follow the redirect manually
  # Set the page size (PS) to max so we don't have to page through search results.
  # Non-destructive gsub is required here: gsub! returns nil when nothing was
  # replaced, which would make the string concatenation raise.
  # URI::DEFAULT_PARSER.escape replaces URI.encode, which was removed in Ruby 3.0.
  results_path = location.to_s.gsub('PS=10', 'PS=99999')
  results_url = URI::DEFAULT_PARSER.escape(SITE_URL + results_path)

  logger.debug "GET: " + results_url
  response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
  logger.debug "Response code: HTTP " + response3.code.to_s
  doc = Nokogiri::HTML(response3.to_s)
  rows = doc.search("table.display_table tr")

  # Count only rows that contain td cells (the header row only has th's).
  # Subtracting one from rows.size would report -1 when the results table is
  # missing from the page.
  application_count = rows.count { |row| row.at("td") }
  logger.info "Found #{application_count} applications in search results."

  # Values given to every application record that is not already stored.
  app_defaults = {
    'la_name' => 'Merton Borough Council',
    'la_slug' => 'merton',
    'la_gss' => 'E09000024', # https://mapit.mysociety.org/area/2500.html
    'date_details_scraped' => nil,
    'date_documents_scraped' => nil,
    'date_dates_scraped' => nil
  }

  logger.debug "Application defaults: "
  logger.debug app_defaults.to_s
  # Iterate over search results, saving one `applications` record per data row,
  # keyed on council_reference.
  rows.each do |row|
    next unless row.at("td") # skip header row which only has th's

    cells = row.search("td")
    ref = cells[0].inner_text.strip

    # Look for a record saved by an earlier run. The lookup raises when the
    # table doesn't exist, which it won't on first run; treat that as "not found".
    existing = begin
      ScraperWiki.select("* from applications where council_reference=?", ref)
    rescue StandardError
      nil
    end

    # Start from the stored record when there is one, so previously scraped
    # fields are kept; otherwise start from the defaults with a fresh uuid.
    app = existing && existing[0]
    app ||= app_defaults.merge(
      'created_at' => Time.now.to_s,
      'uuid' => SecureRandom.uuid
    )

    app['council_reference'] = ref

    # URI::DEFAULT_PARSER.escape replaces URI.encode, which was removed in Ruby 3.0.
    info_url = URI::DEFAULT_PARSER.escape(BASE_URL + cells[0].at("a")['href'].strip)
    app['info_url'] = info_url.gsub(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?

    app['address'] = cells[1].inner_text.strip
    app['description'] = cells[2].inner_text.strip
    app['status'] = cells[3].inner_text.strip

    # The results table shows '--' when there is no received date.
    raw_date_received = cells[4].inner_text.strip
    app['date_received'] = raw_date_received == '--' ? nil : Date.parse(raw_date_received)

    app['decision'] = cells[5].inner_text.strip
    app['date_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s

    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
  # Scrape details for all apps that don't have them, or whose details were
  # last scraped before today. Most recently received applications go first.
  apps = ScraperWiki.select("* from applications where date_details_scraped is null or date_details_scraped < ? order by date_received desc", Date.today.to_s)
  total = apps.size
  logger.info "Scraping details for #{total} applications."

  apps.each.with_index(1) do |app, position|
    logger.info "#{position} of #{total}: Scraping details for app: #{app['council_reference']}."
    crawl_delay

    # Scrape details page
    res = HTTP.headers(headers).cookies(cookies).get(app['info_url'])

    unless res.code == 200
      logger.error "Failed to get #{app['info_url']} - HTTP #{res.code}" # FIXME improve message
      next
    end

    # Parse details page and fold the parsed fields into the stored record
    app.merge!(parse_details(res.to_s))
    app['date_details_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
  # Scrape dates page for apps that don't have them, or whose dates were last
  # scraped before today. Most recently received applications go first.
  apps = ScraperWiki.select("* from applications where date_dates_scraped is null or date_dates_scraped < ? order by date_received desc", Date.today.to_s)
  logger.info "Scraping dates for #{apps.size} applications."

  apps.each.with_index(1) do |app, position|
    logger.info "#{position} of #{apps.size}: Scraping dates for #{app['council_reference']}."

    # This loop never sets dates_url itself (presumably it comes from the
    # details scrape - verify against parse_details). Skip records without one
    # rather than passing nil to the HTTP client and aborting the whole run.
    dates_url = app['dates_url']
    if dates_url.to_s.strip.empty?
      logger.warn "No dates URL stored for #{app['council_reference']}. Skipping."
      next
    end

    crawl_delay

    # Scrape dates page
    res = HTTP.headers(headers).cookies(cookies).get(dates_url)

    unless res.code == 200
      logger.error "Failed to get #{dates_url} - HTTP " + res.code.to_s
      next
    end

    # Parse dates page and fold the parsed fields into the stored record
    app.merge!(parse_dates(res.to_s))
    app['date_dates_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end
  # Scrape documents for apps that don't have them, or whose documents were
  # last scraped before today. Most recently received applications go first.
  apps = ScraperWiki.select("* from applications where date_documents_scraped is null or date_documents_scraped < ? order by date_received desc", Date.today.to_s)
  logger.info "Scraping documents for #{apps.size} applications."

  apps.each.with_index(1) do |app, position|
    logger.info "#{position} of #{apps.size}: Scraping documents for #{app['council_reference']}."

    # This loop never sets documents_url itself (presumably it comes from the
    # details scrape - verify against parse_details). Skip records without one
    # rather than passing nil to the HTTP client and aborting the whole run.
    documents_url = app['documents_url']
    if documents_url.to_s.strip.empty?
      logger.warn "No documents URL stored for #{app['council_reference']}. Skipping."
      next
    end

    crawl_delay

    # Scrape documents page
    res = HTTP.headers(headers).cookies(cookies).get(documents_url)

    unless res.code == 200
      logger.error "Failed to get #{documents_url} - HTTP " + res.code.to_s
      next
    end

    # Parse documents page; each document is stored against its application,
    # keyed on council_reference plus url.
    docs = parse_documents(res.to_s)
    docs.each do |d|
      d['council_reference'] = app['council_reference']
      ScraperWiki.save_sqlite(['council_reference', 'url'], d, 'documents')
    end

    app['documents_qty'] = docs.size
    app['date_documents_scraped'] = Date.today.to_s
    app['updated_at'] = Time.now.to_s # keep in step with the details and dates passes
    ScraperWiki.save_sqlite(['council_reference'], app, 'applications')
  end

  logger.info "Scraper finishes. We did it."
  logger.close