|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- require 'csv'
-
module UKPlanningScraper
  # A planning authority whose website can be scraped for applications.
  #
  # An Authority is built from a name and the URL of its advanced search
  # page; the planning system it runs is inferred from that URL. The class
  # also holds a registry of authorities (filled by Authority.load) that can
  # be looked up by name or filtered by tag.
  class Authority
    # eg "Camden"
    attr_reader :name

    # URL of the advanced search page
    attr_reader :url

    # eg "idox", "northgate"
    attr_reader :system

    # Registry of the authorities created by Authority.load
    @@authorities = []

    # name - authority name; surrounding whitespace is stripped
    # url  - URL of the advanced search page; surrounding whitespace is
    #        stripped and the result is used to determine #system
    def initialize(name, url)
      @name = name.strip
      @url = url.strip
      @tags = [] # Strings in arbitrary order
      @applications = [] # Application objects
      @scrape_params = {}

      # Determine @system when Authority is created
      @system = detect_system(@url)
    end

    # Scrape this authority's website for applications.
    #
    # options - Hash merged over the defaults ({ delay: 10 }) and handed to
    #           the system-specific scraper together with the accumulated
    #           scrape params.
    #
    # Returns an Array of hashes, one per application that reports itself
    # valid. Raises SystemNotSupported when there is no scraper for #system.
    def scrape(options = {})
      # The user-supplied options override the defaults
      options = { delay: 10 }.merge(options)

      # Select which scraper to use. The raise happens before the
      # assignment, so @applications is untouched for unsupported systems.
      @applications =
        case system
        when 'idox' then scrape_idox(@scrape_params, options)
        when 'northgate' then scrape_northgate(@scrape_params, options)
        else
          raise SystemNotSupported,
                "Planning system not supported for #{@name} at URL: #{@url}"
        end

      # Post processing: stamp every application with this authority's name
      @applications.each { |app| app.authority_name = @name }

      # Output as an array of hashes
      # FIXME: silently ignores invalid apps. How should we handle them?
      output = @applications.select(&:valid?).map(&:to_hash)

      # Reset so that old params don't get used for new scrapes
      clear_scrape_params

      output # Single point of successful exit
    end

    # Return a sorted list of tags for this authority
    def tags
      @tags.sort
    end

    # Add multiple tags to existing tags
    def add_tags(tags)
      tags.each { |t| add_tag(t) }
    end

    # Add a single tag to existing tags.
    #
    # The tag is normalised first: surrounding whitespace stripped,
    # lowercased, and spaces removed ("Inner London" becomes "innerlondon").
    # Tags that are empty after normalisation are ignored, so a blank CSV
    # cell or stray whitespace never produces an empty-string tag.
    def add_tag(tag)
      clean_tag = tag.strip.downcase.delete(' ')
      return if clean_tag.empty?

      @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
    end

    # True when this authority carries exactly this tag. The argument is
    # compared as given; it is not normalised the way add_tag normalises.
    def tagged?(tag)
      @tags.include?(tag)
    end

    # The registry of authorities (the underlying array, not a copy)
    def self.all
      @@authorities
    end

    # List all the tags in use, de-duplicated and sorted
    def self.tags
      @@authorities.flat_map(&:tags).uniq.sort
    end

    # Find the authority whose name equals +name+ exactly.
    # Raises AuthorityNotFound when there is no such authority.
    def self.named(name)
      authority = @@authorities.find { |a| name == a.name }
      raise AuthorityNotFound unless authority

      authority
    end

    # Tagged x
    def self.tagged(tag)
      @@authorities.select { |a| a.tagged?(tag) }
    end

    # Not tagged x
    def self.not_tagged(tag)
      @@authorities.reject { |a| a.tagged?(tag) }
    end

    # Authorities with no tags
    def self.untagged
      @@authorities.select { |a| a.tags.empty? }
    end

    # Fill the registry from authorities.csv (columns read: authority_name,
    # url, tags). Each authority gets the whitespace-separated tags from its
    # row plus a tag naming its detected system.
    def self.load
      # Don't run this method more than once
      return unless @@authorities.empty?

      csv_path = File.join(File.dirname(__dir__), 'uk_planning_scraper',
                           'authorities.csv')

      CSV.foreach(csv_path, headers: true) do |line|
        auth = Authority.new(line['authority_name'], line['url'])

        # String#split without a pattern ignores leading whitespace, so a
        # padded cell does not yield an empty first element
        auth.add_tags(line['tags'].split) if line['tags']

        auth.add_tag(auth.system)
        @@authorities << auth
      end
    end

    private

    # Map a search page URL to the name of the planning system behind it.
    # Patterns are tried in order; the first match wins.
    def detect_system(url)
      case url
      when /search\.do\?action=advanced/i then 'idox'
      when /generalsearch\.aspx/i then 'northgate'
      when /ocellaweb/i then 'ocellaweb'
      when /\/apas\// then 'agileplanning'
      else 'unknownsystem'
      end
    end
  end
end
-
# Fill the authority registry from authorities.csv as soon as this file is loaded
- UKPlanningScraper::Authority.load
|