| @@ -1,55 +1,88 @@ | |||||
| # This is a template for a Ruby scraper on morph.io (https://morph.io) | |||||
| # including some code snippets below that you should find helpful | |||||
require 'bundler'
Bundler.setup

require 'active_support/all'
require 'date'
require 'mechanize'
require 'pp'
require 'scraperwiki'
require 'time'
| # Use the column names from planningalerts.org.au: | |||||
| # https://www.planningalerts.org.au/how_to_write_a_scraper | |||||
# Base URL for Kingston's planning pages; relative links found in the
# search results are resolved against this.
BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/"
# Parse and save a single planning application.
#
# Extracts the council reference, application type, related URLs,
# address/ward and date fields from one "#planningApplication" element
# and persists the record keyed on council_reference.
#
# app - Nokogiri element for one application (from the summary page).
def parse(app)
  record = {}
  record['title'] = app.at("h4").inner_text

  # Title looks like "17/12345/FUL - Full Application". Skip anything
  # that doesn't match rather than crashing on matches[1] (nil).
  matches = record['title'].match(/(\d+\/\d+\/\w+)\s+-\s+(.+)/)
  return if matches.nil?
  record['council_reference'] = matches[1]
  record['type'] = matches[2]

  # Related links: details page, map, document images, comment form.
  app.search("a").each do |link|
    href = link['href'].strip
    record['info_url'] = BASEURL + href if href.match(/Details/)
    record['map_url'] = href if href.match(/\?map=/)
    record['images_url'] = href if href.match(/ImageMenu/)
    record['comment_url'] = BASEURL + href if href.match(/PlanningComments/)
  end

  spans = app.search("span")
  record['description'] = spans[0].inner_text
  record['address'] = spans[1].inner_text
  record['ward'] = spans[2].inner_text

  # Decision and decision date, e.g. "Granted 01/02/2017". Guard the
  # span itself - undecided applications may not render it at all.
  if spans[4] && (matches = spans[4].inner_text.match(/(.+?)\s+(\d{1,2}\/\d{1,2}\/\d{4})/))
    record['decision'] = matches[1]
    record['date_decision'] = Date.parse(matches[2])
  end

  # Comments/consultation - the consultation end date can change during
  # the lifetime of an application, so take whatever is shown now.
  app.search("dd").each do |dd|
    if matches = dd.inner_text.match(/The current closing date for comments on this application is (\d{1,2}-[A-Z][a-z]{2}-\d{4})/)
      record['on_notice_to'] = Date.parse(matches[1])
    end
  end

  # Date valid - occasionally free text instead of a parseable date, in
  # which case keep the raw text and leave the date column nil.
  begin
    record['date_valid'] = Date.parse(spans[3].inner_text)
    record['date_valid_text'] = nil
  rescue ArgumentError
    record['date_valid'] = nil
    record['date_valid_text'] = spans[3].inner_text
  end

  # Scraper timestamps
  record['updated_at'] = Time.now
  record['date_scraped'] = Date.today.to_s

  ScraperWiki.save_sqlite(['council_reference'], record)
end
# Scrape every application received in the last 12 * 30 days, walking
# backwards from today in 30-day windows.
browser = Mechanize.new
browser.verify_mode = OpenSSL::SSL::VERIFY_NONE

window_end = Date.today
12.times do
  from = (window_end - 29.days).strftime("%d/%m/%Y")
  to = window_end.strftime("%d/%m/%Y")
  search_url = BASEURL +
               "Summary?weekListType=SRCH&recFrom=#{from}&recTo=#{to}" \
               "&ward=ALL&appTyp=ALL&wardTxt=All%20Wards" \
               "&appTypTxt=All%20Application%20Types&limit=500"
  puts search_url
  results = browser.get(search_url).search("#planningApplication")
  puts results.size, ''
  results.each { |result| parse(result) }
  window_end -= 30.days
  sleep 5
end
| # page = Nokogiri::HTML(open("page.html")) | |||||