Nuke whitespace

5 年之前 · 74e3121f28
--- a/lib/uk_planning_scraper/application.rb
+++ b/lib/uk_planning_scraper/application.rb
@@ -37,7 +37,7 @@ module UKPlanningScraper
        appeal_decision: @appeal_decision
      }
    end
    

    def valid?
      return true if @authority_name && @council_reference && @info_url
      false
--- a/lib/uk_planning_scraper/authority.rb
+++ b/lib/uk_planning_scraper/authority.rb
@@ -3,7 +3,7 @@ require 'csv'
 module UKPlanningScraper
  class Authority
    attr_reader :name, :url
    

    @@authorities = []

    def initialize(name, url)
@@ -31,7 +31,7 @@ module UKPlanningScraper
        raise SystemNotSupported.new("Planning system not supported for \
          #{@name} at URL: #{@url}")
      end
      

      # Post processing
      @applications.each do |app|
        app.authority_name = @name
@@ -41,32 +41,32 @@ module UKPlanningScraper
      output = []
      # FIXME - silently ignores invalid apps. How should we handle them?
      @applications.each { |app| output << app.to_hash if app.valid? }
      

      # Reset so that old params don't get used for new scrapes
      clear_scrape_params
      

      output  # Single point of successful exit
    end
    

    # Returns this authority's tags in sorted order.
    # The sorted result is a new collection; @tags itself is not reordered.
    def tags
      sorted_tags = @tags.sort
      sorted_tags
    end
    

    # Add multiple tags to existing tags
    #
    # Every element of +tags+ is handed to add_tag in turn, so the same
    # clean-up and duplicate check apply to each one.
    # Returns whatever tags.each returns (the receiver itself for an Array).
    def add_tags(tags)
      tags.each do |raw_tag|
        add_tag(raw_tag)
      end
    end
    

    # Add a single tag to existing tags
    #
    # The tag is cleaned up first: surrounding whitespace is stripped, it is
    # lower-cased and any remaining spaces are removed. A tag that is empty
    # after clean-up is ignored, and a tag that is already present is not
    # added a second time.
    #
    # Returns the result of appending to @tags, or nil when nothing was added.
    def add_tag(tag)
      clean_tag = tag.strip.downcase.delete(' ')

      # Blank input (for example the leading "" that " a b".split(/\s+/)
      # yields) must not end up stored as an empty-string tag.
      return if clean_tag.empty?

      @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
    end
    

    # True when +tag+ is already present in @tags, false otherwise.
    # Comparison is by ==, so callers should pass the tag in the same
    # cleaned-up form that add_tag stores.
    def tagged?(tag)
      @tags.any? { |existing| existing == tag }
    end
    

    def system
      if @url.match(/search\.do\?action=advanced/i)
        'idox'
@@ -84,18 +84,18 @@ module UKPlanningScraper
    # Returns the class-level collection of authorities held in
    # @@authorities. The collection itself is returned, not a copy.
    def self.all
      @@authorities
    end
    

    # List all the tags in use
    #
    # Gathers the tags of every authority in @@authorities, flattens them
    # into one list, drops duplicates and returns the result sorted.
    def self.tags
      @@authorities.map(&:tags).flatten.uniq.sort
    end
    

    # Look up an authority by name.
    #
    # Returns the first entry in @@authorities whose name equals +name+.
    # Raises AuthorityNotFound when there is no such entry.
    def self.named(name)
      match = @@authorities.find { |candidate| name == candidate.name }
      raise AuthorityNotFound unless match

      match
    end

    # Tagged x
@@ -125,11 +125,11 @@ module UKPlanningScraper
      CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \
          'authorities.csv'), :headers => true) do |line|
        auth = Authority.new(line['authority_name'], line['url'])
        

        if line['tags']
          auth.add_tags(line['tags'].split(/\s+/))
        end
        

        auth.add_tag(auth.system)
        @@authorities << auth
      end
--- a/lib/uk_planning_scraper/authority_scrape_params.rb
+++ b/lib/uk_planning_scraper/authority_scrape_params.rb
@@ -4,7 +4,7 @@ module UKPlanningScraper
  class Authority
    # Parameter methods for Authority#scrape
    # Designed to be method chained, eg:
    # 
    #
    # applications = UKPlanningScraper::Authority.named("Barnet"). \
    # development_type("Q22").keywords("illuminat"). \
    # validated_days(30).scrape
@@ -17,7 +17,7 @@ module UKPlanningScraper
      unless n > 0
        raise ArgumentError.new("validated_days must be greater than 0")
      end
      

      validated_from(Date.today - (n - 1))
      validated_to(Date.today)
      self
@@ -31,7 +31,7 @@ module UKPlanningScraper
      unless n > 0
        raise ArgumentError.new("received_days must be greater than 0")
      end
      

      received_from(Date.today - (n - 1))
      received_to(Date.today)
      self
@@ -45,18 +45,18 @@ module UKPlanningScraper
      unless n > 0
        raise ArgumentError.new("decided_days must be greater than 0")
      end
      

      decided_from(Date.today - (n - 1))
      decided_to(Date.today)
      self
    end
    

    def applicant_name(s)
      unless system == 'idox'
        raise NoMethodError.new("applicant_name is only implemented for Idox. \
          This authority (#{@name}) is #{system.capitalize}.")
      end
      

      check_class(s, String)
      @scrape_params[:applicant_name] = s.strip
      self
@@ -67,7 +67,7 @@ module UKPlanningScraper
        raise NoMethodError.new("application_type is only implemented for \
          Idox. This authority (#{@name}) is #{system.capitalize}.")
      end
      

      check_class(s, String)
      @scrape_params[:application_type] = s.strip
      self
@@ -78,14 +78,14 @@ module UKPlanningScraper
        raise NoMethodError.new("development_type is only implemented for \
          Idox. This authority (#{@name}) is #{system.capitalize}.")
      end
      

      check_class(s, String)
      @scrape_params[:development_type] = s.strip
      self
    end

    private
    

    # Handle the simple params with this
    def method_missing(method_name, *args)
      sc_params = {
@@ -97,18 +97,18 @@ module UKPlanningScraper
        decided_to: Date,
        keywords: String
      }
      

      value = args[0]
      

      if sc_params[method_name]
        check_class(value, sc_params[method_name], method_name.to_s)
        value.strip! if value.class == String
        

        if value.class == Date && value > Date.today
          raise ArgumentError.new("#{method_name} can't be a date in the " + \
            "future (#{value.to_s})")
        end
        

        @scrape_params[method_name] = value
        self
      else
@@ -119,7 +119,7 @@ module UKPlanningScraper
    # Discard any accumulated scrape parameters by pointing @scrape_params
    # at a fresh, empty Hash. Returns that new Hash.
    def clear_scrape_params
      @scrape_params = {}
    end
    

    # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
    def check_class(
      param_value,
--- a/lib/uk_planning_scraper/idox.rb
+++ b/lib/uk_planning_scraper/idox.rb
@@ -7,7 +7,7 @@ module UKPlanningScraper
    def scrape_idox(params, options)
      puts "Using Idox scraper."
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
      

      apps = []

      agent = Mechanize.new
@@ -31,7 +31,7 @@ module UKPlanningScraper
      }.each { |f| form.add_field!(f) unless form.has_field?(f) }

      date_format = "%d/%m/%Y"
      

      form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
      form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

@@ -42,12 +42,12 @@ module UKPlanningScraper
      form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

      form.send(:"searchCriteria\.description", params[:keywords])
      

      # Some councils don't have the applicant name on their form, eg Bexley
      form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
      

      form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
      

      # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
      form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

@@ -56,7 +56,7 @@ module UKPlanningScraper
      if page.search('.errors').inner_text.match(/Too many results found/i)
        raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
      end
      

      loop do
        # Parse search results
        items = page.search('li.searchresult')
@@ -69,7 +69,7 @@ module UKPlanningScraper
          # Parse info line
          info_line = app.at("p.metaInfo").inner_text.strip
          bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
          

          bits.each do |bit|
            if matches = bit.match(/Ref\. No:\s+(.+)/)
              data.council_reference = matches[1]
@@ -78,7 +78,7 @@ module UKPlanningScraper
            if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_received = Date.parse(matches[2])
            end
            

            if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_validated = Date.parse(matches[1])
            end
@@ -92,10 +92,10 @@ module UKPlanningScraper
          data.info_url = base_url + app.at('a')['href']
          data.address = app.at('p.address').inner_text.strip
          data.description = app.at('a').inner_text.strip
          

          apps << data
        end
        

        # Get the Next button from the pager, if there is one
        if next_button = page.at('a.next')
          next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
@@ -106,13 +106,13 @@ module UKPlanningScraper
          break
        end
      end
      

      # Scrape the summary tab for each app
      apps.each_with_index do |app, i|
        sleep options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app.info_url}"
        res = agent.get(app.info_url)
        

        if res.code == '200' # That's a String not an Integer, ffs
          # Parse the summary tab for this app

@@ -133,14 +133,14 @@ module UKPlanningScraper
              app.documents_url = base_url + documents_link[:href]
            end
          end
          

          # We need to find values in the table by using the th labels.
          # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

          res.search('#simpleDetailsTable tr').each do |row|
            key = row.at('th').inner_text.strip
            value = row.at('td').inner_text.strip
            

            case key
              when 'Reference'
                app.council_reference = value
--- a/lib/uk_planning_scraper/northgate.rb
+++ b/lib/uk_planning_scraper/northgate.rb
@@ -8,10 +8,10 @@ module UKPlanningScraper
    def scrape_northgate(params, options)
      puts "Using Northgate scraper."
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
      

      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
      generic_url = @url.match(/.+\//)[0] + 'Generic/'
      

      apps = []

      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.