From aac15c24ebc8a29c1fecce9eb84437999ba8546c Mon Sep 17 00:00:00 2001 From: Adrian Short Date: Thu, 2 Dec 2021 14:26:35 +0000 Subject: [PATCH] Scrape Northgate details page Currently we only parse: - application_type - location_easting - location_northing --- lib/uk_planning_scraper/application.rb | 15 +++++++- .../authority_scrape_params.rb | 9 +++++ lib/uk_planning_scraper/northgate.rb | 34 ++++++++++++++++++- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/lib/uk_planning_scraper/application.rb b/lib/uk_planning_scraper/application.rb index 4985c37..89835ed 100644 --- a/lib/uk_planning_scraper/application.rb +++ b/lib/uk_planning_scraper/application.rb @@ -74,6 +74,16 @@ module UKPlanningScraper # This may change if there are subsequent extensions. attr_accessor :extended_expiry_date + # Application type: Full planning permission, advertisement, + # LDC, prior approval etc. + # Codes are specific to each local planning authority although there will + # be a high degree of overlap between LPAs + attr_accessor :application_type + + attr_accessor :location_easting + attr_accessor :location_northing + + def to_hash { scraped_at: @scraped_at, @@ -94,7 +104,10 @@ module UKPlanningScraper appeal_decision: @appeal_decision, consultation_end_date: @consultation_end_date, statutory_due_date: @statutory_due_date, - extended_expiry_date: @extended_expiry_date + extended_expiry_date: @extended_expiry_date, + application_type: @application_type, + location_easting: @location_easting, + location_northing: @location_northing } end diff --git a/lib/uk_planning_scraper/authority_scrape_params.rb b/lib/uk_planning_scraper/authority_scrape_params.rb index 5632c18..b4083d7 100644 --- a/lib/uk_planning_scraper/authority_scrape_params.rb +++ b/lib/uk_planning_scraper/authority_scrape_params.rb @@ -73,6 +73,15 @@ module UKPlanningScraper self end + def include_details + unless system == 'northgate' + raise NoMethodError.new("include_details is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}.") + end + + @scrape_params[:include_details] = true + self + end + def include_dates unless system == 'northgate' raise NoMethodError.new("include_dates is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}.") diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb index d03f310..a8b6d69 100644 --- a/lib/uk_planning_scraper/northgate.rb +++ b/lib/uk_planning_scraper/northgate.rb @@ -10,6 +10,7 @@ module UKPlanningScraper logger.level = Logger::DEBUG logger.info "Using Northgate scraper." + logger.info "Will also scrape details page." if params[:include_details] logger.info "Will also scrape dates page." if params[:include_dates] base_url = @url.match(/(https?:\/\/.+?)\//)[1] @@ -137,6 +138,7 @@ module UKPlanningScraper end end + # Scrape dates page if required if params[:include_dates] apps.each do |app| sleep options[:delay] @@ -175,7 +177,37 @@ module UKPlanningScraper end end end - + + # Scrape details page if required + if params[:include_details] + apps.each do |app| + sleep options[:delay] + agent = Mechanize.new + + # agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE + logger.info "Getting details page for application #{app.council_reference}: #{app.info_url}" + page = agent.get(app.info_url) # load the search form page + + if page.code == '200' + page.search(".dataview")[2].search(".list li").each do |element| + if bits = element.inner_html.match(/(.+)<\/span>(.+)