Browse Source

Scrape Northgate details page

Currently we only parse:
	- application_type
	- location_easting
	- location_northing
northgate-dates
Adrian Short 2 years ago
parent
commit
aac15c24eb
3 changed files with 56 additions and 2 deletions
  1. +14
    -1
      lib/uk_planning_scraper/application.rb
  2. +9
    -0
      lib/uk_planning_scraper/authority_scrape_params.rb
  3. +33
    -1
      lib/uk_planning_scraper/northgate.rb

+ 14
- 1
lib/uk_planning_scraper/application.rb View File

@@ -74,6 +74,16 @@ module UKPlanningScraper
# This may change if there are subsequent extensions.
attr_accessor :extended_expiry_date

# Application type: Full planning permission, advertisement,
# LDC, prior approval etc.
# Codes are specific to each local planning authority although there will
# be a high degree of overlap between LPAs
attr_accessor :application_type

# Easting and northing of the application site, stored as integers.
# Currently populated by the Northgate details page scraper from the
# "Location Co ordinates" field.
# NOTE(review): presumably Ordnance Survey National Grid values in
# metres -- confirm against a live details page.
attr_accessor :location_easting
attr_accessor :location_northing


def to_hash
{
scraped_at: @scraped_at,
@@ -94,7 +104,10 @@ module UKPlanningScraper
appeal_decision: @appeal_decision,
consultation_end_date: @consultation_end_date,
statutory_due_date: @statutory_due_date,
extended_expiry_date: @extended_expiry_date
extended_expiry_date: @extended_expiry_date,
application_type: @application_type,
location_easting: @location_easting,
location_northing: @location_northing
}
end


+ 9
- 0
lib/uk_planning_scraper/authority_scrape_params.rb View File

@@ -73,6 +73,15 @@ module UKPlanningScraper
self
end

# Flags that each application's details page should be scraped as well
# as the search results.
#
# Only authorities whose system is 'northgate' support this; for any
# other system a NoMethodError naming the authority and its system is
# raised and the scrape params are left untouched.
#
# Returns self (allowing further calls to be chained).
def include_details
  if system != 'northgate'
    message = "include_details is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}."
    raise NoMethodError.new(message)
  end

  @scrape_params[:include_details] = true
  self
end

def include_dates
unless system == 'northgate'
raise NoMethodError.new("include_dates is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}.")


+ 33
- 1
lib/uk_planning_scraper/northgate.rb View File

@@ -10,6 +10,7 @@ module UKPlanningScraper
logger.level = Logger::DEBUG

logger.info "Using Northgate scraper."
logger.info "Will also scrape details page." if params[:include_details]
logger.info "Will also scrape dates page." if params[:include_dates]
base_url = @url.match(/(https?:\/\/.+?)\//)[1]
@@ -137,6 +138,7 @@ module UKPlanningScraper
end
end
# Scrape dates page if required
if params[:include_dates]
apps.each do |app|
sleep options[:delay]
@@ -175,7 +177,37 @@ module UKPlanningScraper
end
end
end
# Scrape details page if required.
# For every application found so far, fetch its info_url page and read the
# "<span>label</span>value" pairs from the third .dataview section.
# Only the "application type" and "location co ordinates" fields are
# currently parsed; all other labels are ignored.
if params[:include_details]
  apps.each do |app|
    sleep options[:delay]
    agent = Mechanize.new
    # agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    logger.info "Getting details page for application #{app.council_reference}: #{app.info_url}"
    page = agent.get(app.info_url) # load the application details page

    next unless page.code == '200'

    # The fields we want live in the third .dataview section. If the page
    # layout differs, skip this application instead of crashing the whole
    # scrape with a NoMethodError on nil.
    dataview = page.search(".dataview")[2]
    if dataview.nil?
      logger.warn "No details section found for application #{app.council_reference}: #{app.info_url}"
      next
    end

    dataview.search(".list li").each do |element|
      bits = element.inner_html.match(/<span>(.+)<\/span>(.+)</)
      next unless bits

      # Some labels have tab characters (\t) in them
      label = bits[1].downcase.gsub(/[[:space:]]+/, ' ').strip
      value = bits[2].gsub(/[[:space:]]+/, ' ').strip

      case label
      when 'application type'
        app.application_type = value
      when 'location co ordinates'
        # The value may be blank or malformed; leave the coordinates unset
        # in that case rather than indexing into a nil match.
        coords = value.match(/Easting.+?(\d+).+?(\d+)/)
        if coords
          app.location_easting = coords[1].to_i
          app.location_northing = coords[2].to_i
        end
      end
    end
  end
end
apps
end
end


Loading…
Cancel
Save