Przeglądaj źródła

Scrape Northgate dates page

northgate-dates
Adrian Short 3 lat temu
rodzic
commit
8f7e2d93e1
Podpisane przez niezaufanego użytkownika: adrian ID klucza GPG: 4B54F9AE91AB06BB
3 zmienionych plików z 116 dodań i 4 usunięć
  1. +61
    -1
      lib/uk_planning_scraper/application.rb
  2. +9
    -0
      lib/uk_planning_scraper/authority_scrape_params.rb
  3. +46
    -3
      lib/uk_planning_scraper/northgate.rb

+ 61
- 1
lib/uk_planning_scraper/application.rb Wyświetl plik

@@ -1,22 +1,79 @@
module UKPlanningScraper
class Application
# Short authority name, eg Camden.
attr_accessor :authority_name
# The authority's own reference number for this application.
attr_accessor :council_reference
# Date the application was received by the authority.
attr_accessor :date_received
# Date the application was declared valid by the authority.
attr_accessor :date_validated
# The authority's own description of the application's current status.
# There is no nationally-mandated scheme for these values, so they vary
# according to local custom.
attr_accessor :status

# The datetime at which the application data was scraped from the
# authority's website.
attr_accessor :scraped_at
# The URL of the main details page for this application.
attr_accessor :info_url
# The site address for the application.
attr_accessor :address
# The applicant's own description of the proposal.
attr_accessor :description
# The number of documents associated with this application.
# According to local custom, this may include representations by official
# consultees and the public.
# Take care when using this as a proxy for the complexity of the application
# or the scale of the public response to it.
attr_accessor :documents_count
# The URL on the authority's website where the application's documents are.
attr_accessor :documents_url
# Used or not according to local custom. Some authorities use it for the
# Planning Portal reference number for the application.
attr_accessor :alternative_reference
# The authority's own description of the decision when made.
# There is no nationally-mandated standard for these codes and custom or
# consistency may vary even within an authority.
attr_accessor :decision
# The date the authority made the decision.
# This is a reliable proxy for which applications have been decided.
attr_accessor :date_decision
#
attr_accessor :appeal_status
attr_accessor :appeal_decision

# Final day of the statutory notification/consultation period for this
# application.
# If there is more than one notification then this will be used
# according to local custom.
attr_accessor :consultation_end_date
# Final day of the statutory determination period for this application.
# If the authority and applicant agree an extension of time this may be
# changed according to local custom.
attr_accessor :statutory_due_date
# Final day of an agreed extension of the determination period for this
# application.
# This may change if there are subsequent extensions.
attr_accessor :extended_expiry_date

def to_hash
{
scraped_at: @scraped_at,
@@ -34,7 +91,10 @@ module UKPlanningScraper
documents_url: @documents_url,
alternative_reference: @alternative_reference,
appeal_status: @appeal_status,
appeal_decision: @appeal_decision
appeal_decision: @appeal_decision,
consultation_end_date: @consultation_end_date,
statutory_due_date: @statutory_due_date,
extended_expiry_date: @extended_expiry_date
}
end


+ 9
- 0
lib/uk_planning_scraper/authority_scrape_params.rb Wyświetl plik

@@ -73,6 +73,15 @@ module UKPlanningScraper
self
end

# Request that the dates page is scraped as well, by setting the
# :include_dates flag in the scrape parameters.
#
# Only available when this authority's system is 'northgate'.
#
# @return [self] the receiver, so that further calls can be chained
# @raise [NoMethodError] if the authority's system is anything other than
#   'northgate'
def include_dates
  if system != 'northgate'
    message = "include_dates is only implemented for Northgate. This authority (#{@name}) is #{system.capitalize}."
    raise NoMethodError, message
  end

  @scrape_params[:include_dates] = true
  self
end

def application_type(s)
unless system == 'idox'
raise NoMethodError.new("application_type is only implemented for \


+ 46
- 3
lib/uk_planning_scraper/northgate.rb Wyświetl plik

@@ -6,7 +6,12 @@ module UKPlanningScraper
class Authority
private
def scrape_northgate(params, options)
puts "Using Northgate scraper."
logger = Logger.new($stdout)
logger.level = Logger::DEBUG

logger.info "Using Northgate scraper."
logger.info "Will also scrape dates page." if params[:include_dates]
base_url = @url.match(/(https?:\/\/.+?)\//)[1]
# Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
@@ -15,8 +20,6 @@ module UKPlanningScraper
apps = []

$stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
logger = Logger.new($stdout)
logger.level = Logger::DEBUG

date_regex = /\d{2}-\d{2}-\d{4}/

@@ -133,6 +136,46 @@ module UKPlanningScraper
apps << app
end
end
# Optional second pass, enabled by params[:include_dates]: fetch each
# application's dates page and copy the dates we recognise onto the
# Application object. Unrecognised labels are ignored.
if params[:include_dates]
  apps.each do |app|
    sleep options[:delay]
    # Do we need to return the dates_url as part of the Application object? Seems unnecessary.
    dates_url = app.info_url.sub("PLDetails", "PLDetailsDates")
    agent = Mechanize.new
    # agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    logger.info "Getting dates page for application #{app.council_reference}: #{dates_url}"
    page = agent.get(dates_url) # load the dates page for this application

    if page.code == '200'
      page.search(".dataview .list li").each do |element|
        # Each list item is expected to hold a <span> label followed by a
        # dd-mm-yyyy date; items without a date are skipped.
        if (bits = element.inner_html.match(/<span>(.+)<\/span>.*?(\d{2}-\d{2}-\d{4})/))
          # Some labels have tab characters (\t) in them. Collapse EVERY run
          # of whitespace (gsub, not sub) so that multi-word labels match the
          # literals below wherever the stray whitespace appears.
          label = bits[1].strip.downcase.gsub(/\s+/, ' ')
          value = Date.strptime(bits[2], '%d-%m-%Y')
          case label
          when 'consultation expiry', # eg Islington, Merton
               'public consultation period ends' # eg Birmingham
            app.consultation_end_date = value
          when 'stat cons expiry date', # eg Merton
               'statutory expiry date' # eg Birmingham
            app.statutory_due_date = value
          when 'extended expiry' # eg Merton, Islington
            app.extended_expiry_date = value
          end
        end
      end
    end
  end
end
apps
end
end


Ładowanie…
Anuluj
Zapisz