From 8f7e2d93e1fb23dd135c97b47a4c6db43c6e0b12 Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Wed, 16 Jun 2021 13:46:02 +0100 Subject: [PATCH] Scrape Northgate dates page --- lib/uk_planning_scraper/application.rb | 62 ++++++++++++++++++- .../authority_scrape_params.rb | 9 +++ lib/uk_planning_scraper/northgate.rb | 49 ++++++++++++++- 3 files changed, 116 insertions(+), 4 deletions(-) diff --git a/lib/uk_planning_scraper/application.rb b/lib/uk_planning_scraper/application.rb index 5c17550..4985c37 100644 --- a/lib/uk_planning_scraper/application.rb +++ b/lib/uk_planning_scraper/application.rb @@ -1,22 +1,79 @@ module UKPlanningScraper class Application + # Short authority name, eg Camden. attr_accessor :authority_name + + # The authority's own reference number for this application. attr_accessor :council_reference + + # Date the application was received by the authority. attr_accessor :date_received + + # Date the application was declared valid by the authority. attr_accessor :date_validated + + # The authority's own description of the application's current status. + # There is no nationally-mandated scheme for these values, so they vary + # according to local custom. attr_accessor :status + + # The datetime at which the application data was scraped from the + # authority's website. attr_accessor :scraped_at + + # The URL of the main details page for this application. attr_accessor :info_url + + # The site address for the application. attr_accessor :address + + # The applicant's own description of the proposal. attr_accessor :description + + # The number of documents associated with this application. + # According to local custom, this may include representations by official + # consultees and the public. + # Take care when using this as a proxy for the complexity of the application + # or the scale of the public response to it. attr_accessor :documents_count + + # The URL on the authority's website where the application's documents are. 
attr_accessor :documents_url + + # Used or not according to local custom. Some authorities use it for the + # Planning Portal reference number for the application. attr_accessor :alternative_reference + + # The authority's own description of the decision when made. + # There is no nationally-mandated standard for these codes and custom or + # consistency may vary even within an authority. attr_accessor :decision + + # The date the authority made the decision. + # This is a reliable proxy for which applications have been decided. attr_accessor :date_decision + + # attr_accessor :appeal_status + attr_accessor :appeal_decision + # Final day of the statutory notification/consultation period for this + # application. + # If there is more than one notification then this will be used + # according to local custom. + attr_accessor :consultation_end_date + + # Final day of the statutory determination period for this application. + # If the authority and applicant agree an extension of time this may be + # changed according to local custom. + attr_accessor :statutory_due_date + + # Final day of an agreed extension of the determination period for this + # application. + # This may change if there are subsequent extensions. 
+ attr_accessor :extended_expiry_date + def to_hash { scraped_at: @scraped_at, @@ -34,7 +91,10 @@ module UKPlanningScraper documents_url: @documents_url, alternative_reference: @alternative_reference, appeal_status: @appeal_status, - appeal_decision: @appeal_decision + appeal_decision: @appeal_decision, + consultation_end_date: @consultation_end_date, + statutory_due_date: @statutory_due_date, + extended_expiry_date: @extended_expiry_date } end diff --git a/lib/uk_planning_scraper/authority_scrape_params.rb b/lib/uk_planning_scraper/authority_scrape_params.rb index b3170e2..5632c18 100644 --- a/lib/uk_planning_scraper/authority_scrape_params.rb +++ b/lib/uk_planning_scraper/authority_scrape_params.rb @@ -73,6 +73,15 @@ module UKPlanningScraper self end + def include_dates + unless system == 'northgate' + raise NoMethodError.new("include_dates is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}.") + end + + @scrape_params[:include_dates] = true + self + end + def application_type(s) unless system == 'idox' raise NoMethodError.new("application_type is only implemented for \ diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb index cae1edf..d03f310 100644 --- a/lib/uk_planning_scraper/northgate.rb +++ b/lib/uk_planning_scraper/northgate.rb @@ -6,7 +6,12 @@ module UKPlanningScraper class Authority private def scrape_northgate(params, options) - puts "Using Northgate scraper." + logger = Logger.new($stdout) + logger.level = Logger::DEBUG + + logger.info "Using Northgate scraper." + logger.info "Will also scrape dates page." if params[:include_dates] + base_url = @url.match(/(https?:\/\/.+?)\//)[1] # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive? @@ -15,8 +20,6 @@ module UKPlanningScraper apps = [] $stdout.sync = true # Flush output buffer after every write so log messages appear immediately. 
- logger = Logger.new($stdout) - logger.level = Logger::DEBUG date_regex = /\d{2}-\d{2}-\d{4}/ @@ -133,6 +136,46 @@ module UKPlanningScraper apps << app end end + + if params[:include_dates] + apps.each do |app| + sleep options[:delay] + + # Do we need to return the dates_url as part of the Application object? Seems unnecessary. + dates_url = app.info_url.sub("PLDetails", "PLDetailsDates") + agent = Mechanize.new + # agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE + logger.info "Getting dates page for application #{app.council_reference}: #{dates_url}" + page = agent.get(dates_url) # load the dates page for this application + + if page.code == '200' + page.search(".dataview .list li").each do |element| + if bits = element.inner_html.match(/(.+)<\/span>.*?(\d{2}-\d{2}-\d{4})/) + # Some labels have tab characters (\t) in them; collapse every whitespace run, not just the first + label = bits[1].strip.downcase.gsub(/\s+/, ' ') + value = Date.strptime(bits[2], '%d-%m-%Y') + + case label + when 'consultation expiry' # eg Islington, Merton + app.consultation_end_date = value + when 'public consultation period ends' # eg Birmingham + app.consultation_end_date = value + + when 'stat cons expiry date' # eg Merton + app.statutory_due_date = value + when 'statutory expiry date' # eg Birmingham + app.statutory_due_date = value + + when 'extended expiry' # eg Merton, Islington + app.extended_expiry_date = value + end + end + end + + end + end + end + apps end end