Extract summary scraping into separate method

7 年之前 · 1efb7c57d4
--- a/lib/uk_planning_scraper/idox.rb
+++ b/lib/uk_planning_scraper/idox.rb
@@ -4,13 +4,20 @@ require 'pp'
 module UKPlanningScraper
  class Authority
    private

    # Scheme-plus-host prefix of @url (everything before the first "/" that
    # follows "http://" or "https://"). Computed once and cached in @base_url.
    def base_url
      return @base_url if @base_url

      origin = @url.match(%r{(https?://.+?)/})
      @base_url = origin[1]
    end

    # Shared Mechanize instance for this object; built on first use and
    # reused on every later call.
    def agent
      return @agent if @agent

      @agent = Mechanize.new
    end

    def scrape_idox(params, options)
      puts "Using Idox scraper."
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      apps = []

      agent = Mechanize.new
      puts "Getting: #{@url}"
      page = agent.get(@url) # load the search form page

@@ -111,72 +118,77 @@ module UKPlanningScraper
      apps.each_with_index do |app, i|
        sleep options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app.info_url}"
        res = agent.get(app.info_url)

        if res.code == '200' # That's a String not an Integer, ffs
          # Parse the summary tab for this app
        parse_info_url(app) if app.info_url
      end # scrape summary tab for apps
      apps
    end # scrape_idox

          app.scraped_at = Time.now
    # Fetches the summary ("info") page for one application and copies the
    # details found there onto +app+.
    #
    # On an HTTP 200 response this sets app.scraped_at, app.documents_count
    # (0 unless a documents link containing a number is found) and, when a
    # count is found, app.documents_url. It then walks the rows of
    # #simpleDetailsTable and assigns the attribute matching each row label.
    # Any other status code is reported with puts and +app+ is not modified.
    #
    # @param app application object exposing info_url and the writers used below
    def parse_info_url(app)
      res = agent.get(app.info_url)

      if res.code == '200' # That's a String not an Integer, ffs
        app.scraped_at = Time.now

        # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
        # Bradford has #tab_documents but without the document count on it
        app.documents_count = 0

        # #tab_documents is only consulted when the first link is absent.
        documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
        count = documents_link && documents_link.inner_text.match(/\d+/)
        if count
          app.documents_count = count[0].to_i
          app.documents_url = base_url + documents_link[:href]
        end

        # We need to find values in the table by using the th labels.
        # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
        res.search('#simpleDetailsTable tr').each do |row|
          label_cell = row.at('th')
          value_cell = row.at('td')
          # A row without both cells carries no key/value pair; skip it
          # rather than calling inner_text on nil.
          next unless label_cell && value_cell

          key = label_cell.inner_text.strip
          value = value_cell.inner_text.strip

          case key
          when 'Reference'
            app.council_reference = value
          when 'Alternative Reference', 'Planning Portal Reference'
            app.alternative_reference = value unless value.empty?
          when 'Application Received', 'Application Registered'
            app.date_received = Date.parse(value) if value.match(/\d/)
          when 'Application Validated'
            app.date_validated = Date.parse(value) if value.match(/\d/)
          when 'Address'
            app.address = value unless value.empty?
          when 'Proposal'
            app.description = value unless value.empty?
          when 'Status'
            app.status = value unless value.empty?
          when 'Decision'
            app.decision = value unless value.empty?
          when 'Decision Issued Date'
            app.date_decision = Date.parse(value) if value.match(/\d/)
          when 'Appeal Status'
            app.appeal_status = value unless value.empty?
          when 'Appeal Decision'
            app.appeal_decision = value unless value.empty?
          else
            puts "Error: key '#{key}' not found"
          end # case
        end # each row
      else
        puts "Error: HTTP #{res.code}"
      end # if
    end
  end # class
 end