diff --git a/lib/uk_planning_scraper/application.rb b/lib/uk_planning_scraper/application.rb index 5c17550..5fced94 100644 --- a/lib/uk_planning_scraper/application.rb +++ b/lib/uk_planning_scraper/application.rb @@ -37,7 +37,7 @@ module UKPlanningScraper appeal_decision: @appeal_decision } end - + def valid? return true if @authority_name && @council_reference && @info_url false diff --git a/lib/uk_planning_scraper/authority.rb b/lib/uk_planning_scraper/authority.rb index 621e7a3..a4c08d6 100644 --- a/lib/uk_planning_scraper/authority.rb +++ b/lib/uk_planning_scraper/authority.rb @@ -3,7 +3,7 @@ require 'csv' module UKPlanningScraper class Authority attr_reader :name, :url - + @@authorities = [] def initialize(name, url) @@ -31,7 +31,7 @@ module UKPlanningScraper raise SystemNotSupported.new("Planning system not supported for \ #{@name} at URL: #{@url}") end - + # Post processing @applications.each do |app| app.authority_name = @name @@ -41,32 +41,32 @@ module UKPlanningScraper output = [] # FIXME - silently ignores invalid apps. How should we handle them? @applications.each { |app| output << app.to_hash if app.valid? } - + # Reset so that old params don't get used for new scrapes clear_scrape_params - + output # Single point of successful exit end - + def tags @tags.sort end - + # Add multiple tags to existing tags def add_tags(tags) tags.each { |t| add_tag(t) } end - + # Add a single tag to existing tags def add_tag(tag) clean_tag = tag.strip.downcase.gsub(' ', '') @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates end - + def tagged?(tag) @tags.include?(tag) end - + def system if @url.match(/search\.do\?action=advanced/i) 'idox' @@ -84,18 +84,18 @@ module UKPlanningScraper def self.all @@authorities end - + # List all the tags in use def self.tags tags = [] @@authorities.each { |a| tags << a.tags } tags.flatten.uniq.sort end - + def self.named(name) authority = @@authorities.find { |a| name == a.name } raise AuthorityNotFound if authority.nil? - authority + authority end # Tagged x @@ -125,11 +125,11 @@ module UKPlanningScraper CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \ 'authorities.csv'), :headers => true) do |line| auth = Authority.new(line['authority_name'], line['url']) - + if line['tags'] auth.add_tags(line['tags'].split(/\s+/)) end - + auth.add_tag(auth.system) @@authorities << auth end diff --git a/lib/uk_planning_scraper/authority_scrape_params.rb b/lib/uk_planning_scraper/authority_scrape_params.rb index 819c6e2..0024732 100644 --- a/lib/uk_planning_scraper/authority_scrape_params.rb +++ b/lib/uk_planning_scraper/authority_scrape_params.rb @@ -4,7 +4,7 @@ module UKPlanningScraper class Authority # Parameter methods for Authority#scrape # Desgined to be method chained, eg: - # + # # applications = UKPlanningScraper::Authority.named("Barnet"). \ # development_type("Q22").keywords("illuminat"). \ # validated_days(30).scrape @@ -17,7 +17,7 @@ module UKPlanningScraper unless n > 0 raise ArgumentError.new("validated_days must be greater than 0") end - + validated_from(Date.today - (n - 1)) validated_to(Date.today) self @@ -31,7 +31,7 @@ module UKPlanningScraper unless n > 0 raise ArgumentError.new("received_days must be greater than 0") end - + received_from(Date.today - (n - 1)) received_to(Date.today) self @@ -45,18 +45,18 @@ module UKPlanningScraper unless n > 0 raise ArgumentError.new("decided_days must be greater than 0") end - + decided_from(Date.today - (n - 1)) decided_to(Date.today) self end - + def applicant_name(s) unless system == 'idox' raise NoMethodError.new("applicant_name is only implemented for Idox. \ This authority (#{@name}) is #{system.capitalize}.") end - + check_class(s, String) @scrape_params[:applicant_name] = s.strip self @@ -67,7 +67,7 @@ module UKPlanningScraper raise NoMethodError.new("application_type is only implemented for \ Idox. This authority (#{@name}) is #{system.capitalize}.") end - + check_class(s, String) @scrape_params[:application_type] = s.strip self @@ -78,14 +78,14 @@ module UKPlanningScraper raise NoMethodError.new("development_type is only implemented for \ Idox. This authority (#{@name}) is #{system.capitalize}.") end - + check_class(s, String) @scrape_params[:development_type] = s.strip self end private - + # Handle the simple params with this def method_missing(method_name, *args) sc_params = { @@ -97,18 +97,18 @@ module UKPlanningScraper decided_to: Date, keywords: String } - + value = args[0] - + if sc_params[method_name] check_class(value, sc_params[method_name], method_name.to_s) value.strip! if value.class == String - + if value.class == Date && value > Date.today raise ArgumentError.new("#{method_name} can't be a date in the " + \ "future (#{value.to_s})") end - + @scrape_params[method_name] = value self else @@ -119,7 +119,7 @@ module UKPlanningScraper def clear_scrape_params @scrape_params = {} end - + # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method def check_class( param_value, diff --git a/lib/uk_planning_scraper/idox.rb b/lib/uk_planning_scraper/idox.rb index 20cd434..5ba773d 100644 --- a/lib/uk_planning_scraper/idox.rb +++ b/lib/uk_planning_scraper/idox.rb @@ -7,7 +7,7 @@ module UKPlanningScraper def scrape_idox(params, options) puts "Using Idox scraper." base_url = @url.match(/(https?:\/\/.+?)\//)[1] - + apps = [] agent = Mechanize.new @@ -31,7 +31,7 @@ module UKPlanningScraper }.each { |f| form.add_field!(f) unless form.has_field?(f) } date_format = "%d/%m/%Y" - + form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from] form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to] @@ -42,12 +42,12 @@ module UKPlanningScraper form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to] form.send(:"searchCriteria\.description", params[:keywords]) - + # Some councils don't have the applicant name on their form, eg Bexley form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' - + form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType' - + # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' @@ -56,7 +56,7 @@ module UKPlanningScraper if page.search('.errors').inner_text.match(/Too many results found/i) raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.") end - + loop do # Parse search results items = page.search('li.searchresult') @@ -69,7 +69,7 @@ module UKPlanningScraper # Parse info line info_line = app.at("p.metaInfo").inner_text.strip bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } - + bits.each do |bit| if matches = bit.match(/Ref\. No:\s+(.+)/) data.council_reference = matches[1] @@ -78,7 +78,7 @@ module UKPlanningScraper if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) data.date_received = Date.parse(matches[2]) end - + if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) data.date_validated = Date.parse(matches[1]) end @@ -92,10 +92,10 @@ module UKPlanningScraper data.info_url = base_url + app.at('a')['href'] data.address = app.at('p.address').inner_text.strip data.description = app.at('a').inner_text.strip - + apps << data end - + # Get the Next button from the pager, if there is one if next_button = page.at('a.next') next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' @@ -106,13 +106,13 @@ module UKPlanningScraper break end end - + # Scrape the summary tab for each app apps.each_with_index do |app, i| sleep options[:delay] puts "#{i + 1} of #{apps.size}: #{app.info_url}" res = agent.get(app.info_url) - + if res.code == '200' # That's a String not an Integer, ffs # Parse the summary tab for this app @@ -133,14 +133,14 @@ module UKPlanningScraper app.documents_url = base_url + documents_link[:href] end end - + # We need to find values in the table by using the th labels. # The row indexes/positions change from site to site (or even app to app) so we can't rely on that. res.search('#simpleDetailsTable tr').each do |row| key = row.at('th').inner_text.strip value = row.at('td').inner_text.strip - + case key when 'Reference' app.council_reference = value diff --git a/lib/uk_planning_scraper/northgate.rb b/lib/uk_planning_scraper/northgate.rb index c979e77..1ab923f 100644 --- a/lib/uk_planning_scraper/northgate.rb +++ b/lib/uk_planning_scraper/northgate.rb @@ -8,10 +8,10 @@ module UKPlanningScraper def scrape_northgate(params, options) puts "Using Northgate scraper." base_url = @url.match(/(https?:\/\/.+?)\//)[1] - + # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive? generic_url = @url.match(/.+\//)[0] + 'Generic/' - + apps = [] $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.