# Scraper for Royal Borough of Kingston planning applications, run on
# morph.io (https://morph.io). Fetches 30-day summary listings and stores
# one row per application, keyed on the council reference.
require 'bundler'
Bundler.setup
require 'scraperwiki'
require 'mechanize'
require 'pp'
require 'time'
require 'date'
require 'active_support/all'

# Use the column names from planningalerts.org.au:
# https://www.planningalerts.org.au/how_to_write_a_scraper

# Base URL for the council's planning pages; relative hrefs scraped from
# the summary listing are resolved against this.
BASEURL = "https://maps.kingston.gov.uk/propertyServices/planning/".freeze
# Parse one planning application and save it to the local SQLite store.
#
# app - Nokogiri node for a single "#planningApplication" element from the
#       summary listing page.
#
# Extracts the council reference and application type from the <h4> title,
# related URLs from the anchors, description/address/ward and dates from the
# <span>s, and the consultation closing date from the <dd>s, then saves the
# record keyed on 'council_reference'.
def parse(app)
  record = {}
  record['title'] = app.at("h4").inner_text
  # Title looks like "17/12345/FUL - Full Application".
  matches = record['title'].match(/(\d+\/\d+\/\w+)\s+-\s+(.+)/)
  record['council_reference'] = matches[1]
  record['type'] = matches[2]

  # Classify each related link by its href pattern. Details/Comments links
  # are relative, so prefix them with BASEURL; map/image links are kept
  # as-is (matching the original behaviour).
  app.search("a").each do |link|
    href = link['href'].strip
    record['info_url']    = BASEURL + href if href.match(/Details/)
    record['map_url']     = href           if href.match(/\?map=/)
    record['images_url']  = href           if href.match(/ImageMenu/)
    record['comment_url'] = BASEURL + href if href.match(/PlanningComments/)
  end

  spans = app.search("span")
  record['description'] = spans[0].inner_text
  record['address'] = spans[1].inner_text
  record['ward'] = spans[2].inner_text

  # Decision and decision date, e.g. "Grant Permission 01/02/2017".
  if matches = spans[4].inner_text.match(/(.+?)\s+(\d{1,2}\/\d{1,2}\/\d{4})/)
    record['decision'] = matches[1]
    record['date_decision'] = Date.parse(matches[2])
  end

  # Comments/consultation - the consultation end date can change during the
  # lifetime of an application, so re-read it on every scrape.
  app.search("dd").each do |dd|
    if matches = dd.inner_text.match(/The current closing date for comments on this application is (\d{1,2}-[A-Z][a-z]{2}-\d{4})/)
      record['on_notice_to'] = Date.parse(matches[1])
    end
  end

  # Date valid: keep the raw text when it isn't a parseable date so no
  # information is lost.
  begin
    record['date_valid'] = Date.parse(spans[3].inner_text)
    record['date_valid_text'] = nil
  rescue ArgumentError
    record['date_valid'] = nil
    record['date_valid_text'] = spans[3].inner_text
  end

  # Scraper timestamps.
  record['updated_at'] = Time.now
  record['date_scraped'] = Date.today.to_s

  ScraperWiki.save_sqlite(['council_reference'], record)
end
# Driver: walk backwards through the last 12 * 30 days, one 30-day window
# at a time, scraping every application listed in each window's summary.
agent = Mechanize.new
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE

window_end = Date.today
12.times do
  rec_from = (window_end - 29.days).strftime("%d/%m/%Y")
  rec_to = window_end.strftime("%d/%m/%Y")
  url = "#{BASEURL}Summary?weekListType=SRCH&recFrom=#{rec_from}&recTo=#{rec_to}&ward=ALL&appTyp=ALL&wardTxt=All%20Wards&appTypTxt=All%20Application%20Types&limit=500"
  puts url

  listing = agent.get(url)
  applications = listing.search("#planningApplication")
  puts applications.size, ''
  applications.each { |application| parse(application) }

  window_end -= 30.days
  sleep 5 # be polite to the council's server between requests
end
# page = Nokogiri::HTML(open("page.html"))