require "uk_planning_scraper/version"
require 'mechanize'
require 'time'
require 'logger'
require 'pp'

# Scrapes planning applications from a council planning search site whose
# search page contains a form named 'searchCriteriaForm'.
module UKPlanningScraper
  # Date format used when filling in the date fields of the search form.
  SEARCH_DATE_FORMAT = "%d/%m/%Y"

  # Summary-table row labels whose cell text is stored verbatim.
  SUMMARY_TEXT_FIELDS = {
    'Reference' => :council_reference,
    'Alternative Reference' => :alternative_reference,
    'Planning Portal Reference' => :alternative_reference,
    'Address' => :address,
    'Proposal' => :description,
    'Status' => :status,
    'Decision' => :decision,
    'Appeal Status' => :appeal_status,
    'Appeal Decision' => :appeal_decision,
  }.freeze

  # Summary-table row labels whose cell text is parsed with Date.parse
  # (blank cells are skipped).
  SUMMARY_DATE_FIELDS = {
    'Application Received' => :date_received,
    'Application Registered' => :date_received,
    'Application Validated' => :date_validated,
    'Decision Issued Date' => :date_decision,
  }.freeze

  # Runs a search, walks every page of results, then visits the summary tab
  # of each application found and merges its details into the result.
  #
  # @param search_url [String] absolute http(s) URL of the search form page.
  # @param params [Hash] search criteria. Recognised keys:
  #   :received_from, :received_to, :validated_from, :validated_to (objects
  #   responding to #strftime), :description, :applicant_name and
  #   :application_type. A criterion is applied only when it is non-nil AND
  #   the council's form actually has the corresponding field.
  # @param options [Hash] :delay is the number of seconds slept before each
  #   follow-up request (default 10).
  # @return [Array<Hash>] one hash per application, keyed by symbols such as
  #   :council_reference, :info_url, :address, :description, :status,
  #   :date_received, :date_validated, :documents_count, :documents_url.
  # @raise [ArgumentError] if no scheme-and-host prefix can be extracted from
  #   search_url.
  # @raise [RuntimeError] if the page has no form named 'searchCriteriaForm'.
  def self.search(search_url, params, options = {})
    default_options = {
      delay: 10,
    }
    # The user-supplied options override the defaults.
    # These three are kept as module-level instance variables, as before, in
    # case other code reads them; the helpers below receive what they need
    # as explicit arguments instead.
    @options = default_options.merge(options)
    @search_url = search_url
    @base_url = search_url[%r{(https?://.+?)/}, 1]
    raise ArgumentError, "Cannot work out base URL from #{search_url.inspect}" unless @base_url

    base_url = @base_url
    delay = @options[:delay]
    agent = Mechanize.new

    puts "Getting: #{search_url}"
    page = agent.get(search_url) # load the search form page
    page = submit_search_form(page, params)

    apps = collect_search_results(agent, page, base_url, delay)

    # Scrape the summary tab for each app
    apps.each_with_index do |app, i|
      sleep delay
      puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
      res = agent.get(app[:info_url])

      if res.code == '200' # Mechanize reports the status code as a String
        parse_summary_page(res, app, base_url)
      else
        puts "Error: HTTP #{res.code}"
      end
    end

    apps
  end

  # Fills out the search form on +page+ from +params+ and returns the page
  # that results from submitting it.
  def self.submit_search_form(page, params)
    form = page.form('searchCriteriaForm')
    raise "Search form 'searchCriteriaForm' not found" unless form

    # Some councils don't have the received from/to dates on their form
    # (eg Newham) or the applicant name (eg Bexley), so every field is
    # optional on the form side as well as on the params side.
    fill_form_field(form, 'date(applicationReceivedStart)', format_search_date(params[:received_from]))
    fill_form_field(form, 'date(applicationReceivedEnd)', format_search_date(params[:received_to]))
    fill_form_field(form, 'date(applicationValidatedStart)', format_search_date(params[:validated_from]))
    fill_form_field(form, 'date(applicationValidatedEnd)', format_search_date(params[:validated_to]))
    fill_form_field(form, 'searchCriteria.description', params[:description])
    fill_form_field(form, 'searchCriteria.applicantName', params[:applicant_name])
    fill_form_field(form, 'searchCriteria.caseType', params[:application_type])

    form.submit
  end

  # Formats a date-like object for the search form; passes nil/false through
  # as nil so the field is left untouched.
  def self.format_search_date(date)
    date ? date.strftime(SEARCH_DATE_FORMAT) : nil
  end

  # Sets the form field called +name+ to +value+, doing nothing when the
  # value is nil or the form has no such field.
  def self.fill_form_field(form, name, value)
    return if value.nil?

    field = form.field_with(name: name)
    field.value = value if field
  end

  # Parses the current results page and every following page reached through
  # the pager's "next" link. Returns an array of application hashes.
  def self.collect_search_results(agent, page, base_url, delay)
    apps = []

    loop do
      items = page.search('li.searchresult')
      puts "Found #{items.size} apps on this page."
      items.each { |item| apps << parse_search_result(item, base_url) }

      # Get the Next button from the pager, if there is one
      next_button = page.at('a.next')
      break unless next_button

      next_url = base_url + next_button[:href]
      sleep delay
      puts "Getting: #{next_url}"
      page = agent.get(next_url)
    end

    apps
  end

  # Builds the hash for a single 'li.searchresult' node.
  def self.parse_search_result(item, base_url)
    meta = item.at('p.metaInfo')
    data = parse_info_line(meta ? meta.inner_text : '')

    link = item.at('a')
    address = item.at('p.address')
    data.merge!(
      scraped_at: Time.now,
      info_url: base_url + link['href'],
      address: address ? address.inner_text.strip : nil,
      description: link.inner_text.strip
    )
  end

  # Splits the '|'-separated meta line of a search result and extracts the
  # reference, received/registered date, validated date and status from it.
  def self.parse_info_line(info_line)
    data = {}

    info_line.strip.split('|').each do |raw|
      bit = raw.strip.delete("\r\n")

      # Each pattern is tried independently against every segment.
      if (matches = bit.match(/Ref\. No:\s+(.+)/))
        data[:council_reference] = matches[1]
      end

      if (matches = bit.match(/(Received|Registered):\s+(.+)/))
        data[:date_received] = Date.parse(matches[2])
      end

      if (matches = bit.match(/Validated:\s+(.+)/))
        data[:date_validated] = Date.parse(matches[1])
      end

      if (matches = bit.match(/Status:\s+(.+)/))
        data[:status] = matches[1]
      end
    end

    data
  end

  # Merges the details found on an application's summary page into +app+.
  def self.parse_summary_page(page, app, base_url)
    app[:scraped_at] = Time.now

    # The Documents tab may be absent, or may carry no count or link; only
    # record what is actually present instead of crashing.
    documents_tab = page.at('#tab_documents')
    if documents_tab
      count = documents_tab.inner_text[/\d+/]
      app[:documents_count] = count.to_i if count
      href = documents_tab[:href]
      app[:documents_url] = base_url + href if href
    end

    # We need to find values in the table by using the th labels.
    # The row indexes/positions change from site to site (or even app to app)
    # so we can't rely on that.
    page.search('#simpleDetailsTable tr').each do |row|
      header = row.at('th')
      cell = row.at('td')
      next unless header && cell # skip rows that are not label/value pairs

      key = header.inner_text.strip
      value = cell.inner_text.strip

      if (field = SUMMARY_TEXT_FIELDS[key])
        app[field] = value
      elsif (field = SUMMARY_DATE_FIELDS[key])
        app[field] = Date.parse(value) unless value.empty?
      else
        puts "Error: key '#{key}' not found"
      end
    end

    app
  end

  private_class_method :submit_search_form, :format_search_date,
                       :fill_form_field, :collect_search_results,
                       :parse_search_result, :parse_info_line,
                       :parse_summary_page
end