
Make scrapers private instance methods in Authority

tags/v0.4.5
Adrian Short committed 5 years ago
commit 2e100bc3c8
3 changed files with 276 additions and 270 deletions
  1. lib/uk_planning_scraper/authority.rb  +2 -2
  2. lib/uk_planning_scraper/idox.rb  +157 -154
  3. lib/uk_planning_scraper/northgate.rb  +117 -114
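
The change is purely structural: each scraper moves from a module-level function that takes the search URL as an argument to a private instance method on Authority that reads @url from the instance. A minimal sketch of the before and after shapes (simplified; the full methods appear in the diffs below):

# Before: module functions, with the URL passed in explicitly.
module UKPlanningScraper
  def self.scrape_idox(search_url, params, options)
    # ... uses search_url ...
  end
end

# After: private instance methods on Authority, reading @url from the instance.
module UKPlanningScraper
  class Authority
    private
    def scrape_idox(params, options)
      # ... uses @url ...
    end
  end
end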

+2 -2  lib/uk_planning_scraper/authority.rb

@@ -40,9 +40,9 @@ module UKPlanningScraper
       # Select which scraper to use based on the URL
       if @url.match(/search\.do\?action=advanced/i)
-        apps = UKPlanningScraper.scrape_idox(@url, params, options)
+        apps = scrape_idox(params, options)
       elsif @url.match(/generalsearch\.aspx/i)
-        apps = UKPlanningScraper.scrape_northgate(@url, params, options)
+        apps = scrape_northgate(params, options)
       else
         # Not supported
         raise SystemNotSupportedError.new("Planning system not supported for #{@name} at URL: #{@url}")
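
With the scrapers private, external callers reach them only through the public scrape method, which dispatches on the search URL as shown above. A hypothetical usage sketch (Authority.named and the exact params/options hashes are illustrative assumptions, not part of this diff):

# Hypothetical caller: the lookup helper and hash keys are assumptions.
authority = UKPlanningScraper::Authority.named("Westminster")
# scrape inspects @url and calls the now-private scrape_idox or
# scrape_northgate; calling either directly raises NoMethodError.
apps = authority.scrape({ keywords: "extension" }, { delay: 10 })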


+157 -154  lib/uk_planning_scraper/idox.rb (whitespace-only indentation changes hidden)

@@ -2,170 +2,173 @@ require 'mechanize'
 require 'pp'
 
 module UKPlanningScraper
-  def self.scrape_idox(search_url, params, options)
+  class Authority
+    private
+    def scrape_idox(params, options)
       puts "Using Idox scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
       apps = []
 
       agent = Mechanize.new
-    puts "Getting: #{search_url}"
-    page = agent.get(search_url) # load the search form page
+      puts "Getting: #{@url}"
+      page = agent.get(@url) # load the search form page
 
       # Check that the search form is actually present.
       # When Idox has an internal error it returns an error page with HTTP 200.
       unless form = page.form('searchCriteriaForm')
         puts "Error: Search form page failed to load due to Idox internal error."
         return []
       end
       # form.action = form.action + '&searchCriteria.resultsPerPage=100'
 
       # Fill out and submit search form
 
       # Some councils don't have the received from/to dates on their form, eg Newham
       form.send(:"date(applicationReceivedStart)", params[:received_from].strftime("%d/%m/%Y")) if params[:received_from]
       form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime("%d/%m/%Y")) if params[:received_to]
 
       form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime("%d/%m/%Y")) if params[:validated_from]
       form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime("%d/%m/%Y")) if params[:validated_to]
 
       form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime("%d/%m/%Y")) if params[:decided_from]
       form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime("%d/%m/%Y")) if params[:decided_to]
 
       form.send(:"searchCriteria\.description", params[:keywords])
       # Some councils don't have the applicant name on their form, eg Bexley
       form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
       form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
       # Some Idox sites (eg Bolton) call this 'searchCriteria.developmentType'
       form.send(:"searchCriteria\.developmentType", params[:application_type]) if form.has_field? 'searchCriteria.developmentType'
       page = form.submit
 
       loop do
         # Parse search results
         items = page.search('li.searchresult')
 
         puts "Found #{items.size} apps on this page."
 
         items.each do |app|
           data = {}
 
           # Parse info line
           info_line = app.at("p.metaInfo").inner_text.strip
           bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
           bits.each do |bit|
             if matches = bit.match(/Ref\. No:\s+(.+)/)
               data[:council_reference] = matches[1]
             end
 
             if matches = bit.match(/(Received|Registered):\s+(.+)/)
               data[:date_received] = Date.parse(matches[2])
             end
             if matches = bit.match(/Validated:\s+(.+)/)
               data[:date_validated] = Date.parse(matches[1])
             end
 
             if matches = bit.match(/Status:\s+(.+)/)
               data[:status] = matches[1]
             end
           end
 
           data.merge!({
             scraped_at: Time.now,
             info_url: base_url + app.at('a')['href'],
             address: app.at('p.address').inner_text.strip,
             description: app.at('a').inner_text.strip,
           })
           apps << data
         end
 
         # Get the Next button from the pager, if there is one
         if next_button = page.at('a.next')
           next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
           sleep options[:delay]
           puts "Getting: #{next_url}"
           page = agent.get(next_url)
         else
           break
         end
       end
       # Scrape the summary tab for each app
       apps.each_with_index do |app, i|
         sleep options[:delay]
         puts "#{i + 1} of #{apps.size}: #{app[:info_url]}"
         res = agent.get(app[:info_url])
         if res.code == '200' # That's a String not an Integer, ffs
           # Parse the summary tab for this app
 
           app[:scraped_at] = Time.now
 
           # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
           # Bradford has #tab_documents but without the document count on it
           app[:documents_count] = 0
           app[:documents_url] = nil
 
           if documents_link = res.at('.associateddocument a')
             if documents_link.inner_text.match(/\d+/)
               app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
               app[:documents_url] = base_url + documents_link[:href]
             end
           elsif documents_link = res.at('#tab_documents')
             if documents_link.inner_text.match(/\d+/)
               app[:documents_count] = documents_link.inner_text.match(/\d+/)[0].to_i
               app[:documents_url] = base_url + documents_link[:href]
             end
           end
 
           # We need to find values in the table by using the th labels.
           # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
 
           res.search('#simpleDetailsTable tr').each do |row|
             key = row.at('th').inner_text.strip
             value = row.at('td').inner_text.strip
             case key
               when 'Reference'
                 app[:council_reference] = value
               when 'Alternative Reference'
                 app[:alternative_reference] = value
               when 'Planning Portal Reference'
                 app[:alternative_reference] = value
               when 'Application Received'
                 app[:date_received] = Date.parse(value) if value.match(/\d/)
               when 'Application Registered'
                 app[:date_received] = Date.parse(value) if value.match(/\d/)
               when 'Application Validated'
                 app[:date_validated] = Date.parse(value) if value.match(/\d/)
               when 'Address'
                 app[:address] = value
               when 'Proposal'
                 app[:description] = value
               when 'Status'
                 app[:status] = value
               when 'Decision'
                 app[:decision] = value
               when 'Decision Issued Date'
                 app[:date_decision] = Date.parse(value) if value.match(/\d/)
               when 'Appeal Status'
                 app[:appeal_status] = value
               when 'Appeal Decision'
                 app[:appeal_decision] = value
               else
                 puts "Error: key '#{key}' not found"
             end # case
           end # each row
         else
           puts "Error: HTTP #{res.code}"
         end # if
       end # scrape summary tab for apps
       apps
     end # scrape_idox
+  end # class
 end
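
One practical consequence of making scrape_idox private: it can no longer be called from outside the class, including from a console or a spec. If you do need to poke at it directly, Ruby's send bypasses the access check. A minimal sketch (authority is an Authority instance as in the earlier sketch; the params and options hashes are illustrative):

# Illustrative only: send sidesteps private visibility, which is handy
# for debugging but shouldn't appear in production call sites.
authority.send(:scrape_idox,
  { decided_from: Date.today - 30, decided_to: Date.today },
  { delay: 5 })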

+117 -114  lib/uk_planning_scraper/northgate.rb (whitespace-only indentation changes hidden)

@@ -3,137 +3,140 @@ require 'nokogiri'
 require 'logger'
 
 module UKPlanningScraper
-  def self.scrape_northgate(search_url, params, options)
+  class Authority
+    private
+    def scrape_northgate(params, options)
       puts "Using Northgate scraper."
-    base_url = search_url.match(/(https?:\/\/.+?)\//)[1]
+      base_url = @url.match(/(https?:\/\/.+?)\//)[1]
       # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
-    generic_url = search_url.match(/.+\//)[0] + 'Generic/'
+      generic_url = @url.match(/.+\//)[0] + 'Generic/'
       apps = []
 
       $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
       logger = Logger.new($stdout)
       logger.level = Logger::DEBUG
 
       date_regex = /\d{2}-\d{2}-\d{4}/
 
       form_vars = {
         'csbtnSearch' => 'Search' # required
       }
 
       form_vars['txtProposal'] = params[:keywords]
 
       # Date received from and to
       if params[:received_from] || params[:received_to]
         form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
       end
 
       # Date validated from and to
       if params[:validated_from] || params[:validated_to]
         form_vars['cboSelectDateValue'] = 'DATE_VALID'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
       end
 
       # Date decided from and to
       if params[:decided_from] || params[:decided_to]
         form_vars['cboSelectDateValue'] = 'DATE_DECISION'
         form_vars['rbGroup'] = 'rbRange'
         form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
         form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
       end
 
       # form_vars.merge!({ 'cboStatusCode' => ENV['MORPH_STATUS']}) if ENV['MORPH_STATUS']
 
       logger.info "Form variables: #{form_vars.to_s}"
 
       headers = {
         'Origin' => base_url,
-      'Referer' => search_url,
+        'Referer' => @url,
       }
 
       logger.debug "HTTP request headers:"
       logger.debug(headers.to_s)
 
-    logger.debug "GET: " + search_url
-    response = HTTP.headers(headers).get(search_url)
+      logger.debug "GET: " + @url
+      response = HTTP.headers(headers).get(@url)
       logger.debug "Response code: HTTP " + response.code.to_s
 
       if response.code == 200
         doc = Nokogiri::HTML(response.to_s)
         asp_vars = {
           '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
           '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
         }
       else
         logger.fatal "Bad response from search page. Response code: #{response.code.to_s}. Exiting."
         exit 1
       end
 
       cookies = {}
       response.cookies.each { |c| cookies[c.name] = c.value }
 
       form_vars.merge!(asp_vars)
 
-    logger.debug "POST: " + search_url
-    response2 = HTTP.headers(headers).cookies(cookies).post(search_url, :form => form_vars)
+      logger.debug "POST: " + @url
+      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
       logger.debug "Response code: HTTP " + response2.code.to_s
 
       if response2.code == 302
         # Follow the redirect manually
         # Set the page size (PS) to max so we don't have to page through search results
         logger.debug "Location: #{response2.headers['Location']}"
         # exit
         results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
         logger.debug "GET: " + results_url
         response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
         logger.debug "Response code: HTTP " + response3.code.to_s
         doc = Nokogiri::HTML(response3.to_s)
       else
         logger.fatal "Didn't get redirected from search. Exiting."
         exit 1
       end
 
       rows = doc.search("table.display_table tr")
       logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row
 
       # Iterate over search results
       rows.each do |row|
         if row.at("td") # skip header row which only has th's
           cells = row.search("td")
           ref = cells[0].inner_text.strip
 
           app = {
             scraped_at: Time.now,
             # date_scraped: Date.today # FIXME - Planning Alerts compatibility?
           }
 
           app[:council_reference] = ref
           app[:info_url] = URI::encode(generic_url + cells[0].at('a')[:href].strip)
           app[:info_url].gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
           app[:address] = cells[1].inner_text.strip
           app[:description] = cells[2].inner_text.strip
           app[:status] = cells[3].inner_text.strip
           raw_date_received = cells[4].inner_text.strip
 
           if raw_date_received != '--'
             app[:date_received] = Date.parse(raw_date_received)
           else
             app[:date_received] = nil
           end
 
           app[:decision] = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
           apps << app
         end
       end
       apps
     end
+  end
 end
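
The Northgate scraper's HTTP exchange above is the standard ASP.NET WebForms pattern: GET the search page to harvest __VIEWSTATE and __EVENTVALIDATION, POST them back with the form fields and session cookies, then follow the 302 redirect by hand. Distilled to a skeleton (a sketch using the same http and nokogiri gems; search_url stands in for @url, and the headers, extra form fields, and error handling of the real method are omitted):

require 'http'
require 'nokogiri'

base_url = search_url[%r{https?://[^/]+}] # scheme + host, as in the method above

# 1. GET the form page and pull out the ASP.NET state fields.
page = HTTP.get(search_url)
doc  = Nokogiri::HTML(page.to_s)
form = {
  'csbtnSearch'       => 'Search',
  '__VIEWSTATE'       => doc.at('#__VIEWSTATE')['value'],
  '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value'],
}

# 2. Carry the session cookies over to the postback.
cookies = {}
page.cookies.each { |c| cookies[c.name] = c.value }

# 3. POST the form; a successful search answers with a 302 to the results page.
res = HTTP.cookies(cookies).post(search_url, form: form)
results = HTTP.cookies(cookies).get(base_url + res.headers['Location']) if res.code == 302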
