| @@ -4,13 +4,20 @@ require 'pp' | |||
| module UKPlanningScraper | |||
| class Authority | |||
| private | |||
# Returns the scheme-and-host prefix of @url (for example
# "https://example.org" for "https://example.org/search.do"), memoized in
# @base_url after the first successful call.
#
# A URL with nothing after the host (no trailing "/") is accepted and
# returned as-is rather than failing on a nil match.
#
# @return [String] the part of @url from "http(s)://" up to, but not
#   including, the next "/"
# @raise [ArgumentError] if @url contains no http:// or https:// origin
def base_url
  @base_url ||= begin
    origin = @url.to_s[%r{https?://[^/]+}]
    raise ArgumentError, "Cannot derive base URL from #{@url.inspect}" unless origin

    origin
  end
end
# Returns the Mechanize client for this authority, constructing it on first
# use and caching it in @agent so later calls reuse the same instance.
def agent
  return @agent if @agent

  @agent = Mechanize.new
end
| def scrape_idox(params, options) | |||
| puts "Using Idox scraper." | |||
| base_url = @url.match(/(https?:\/\/.+?)\//)[1] | |||
| apps = [] | |||
| agent = Mechanize.new | |||
| puts "Getting: #{@url}" | |||
| page = agent.get(@url) # load the search form page | |||
| @@ -111,72 +118,77 @@ module UKPlanningScraper | |||
| apps.each_with_index do |app, i| | |||
| sleep options[:delay] | |||
| puts "#{i + 1} of #{apps.size}: #{app.info_url}" | |||
| res = agent.get(app.info_url) | |||
| if res.code == '200' # That's a String not an Integer, ffs | |||
| # Parse the summary tab for this app | |||
| parse_info_url(app) if app.info_url | |||
| end # scrape summary tab for apps | |||
| apps | |||
| end # scrape_idox | |||
| app.scraped_at = Time.now | |||
# Fetches an application's info page (app.info_url) through +agent+ and
# copies the details found there onto +app+.
#
# On an HTTP 200 response this sets app.scraped_at, app.documents_count and
# (when a document count is shown) app.documents_url, then reads every row of
# #simpleDetailsTable and assigns the attribute that matches the row's th
# label. For any other response code an error line is printed to stdout and
# +app+ is not modified.
#
# @param app [Object] application record; must respond to info_url and to
#   the attribute writers used below
# @return [void]
def parse_info_url(app)
  res = agent.get(app.info_url)

  unless res.code == '200' # The response code is a String, not an Integer
    puts "Error: HTTP #{res.code}"
    return
  end

  # Parse the summary tab for this app
  app.scraped_at = Time.now
  parse_info_documents(app, res)

  # We need to find values in the table by using the th labels.
  # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
  res.search('#simpleDetailsTable tr').each do |row|
    label_cell = row.at('th')
    value_cell = row.at('td')
    # A row without both a th and a td carries no label/value pair to read.
    next unless label_cell && value_cell

    assign_info_detail(app, label_cell.inner_text.strip, value_cell.inner_text.strip)
  end
end

# Sets app.documents_count (0 when no count is shown) and, when a count is
# present in the link text, app.documents_url built from base_url + href.
def parse_info_documents(app, res)
  # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
  # Bradford has #tab_documents but without the document count on it
  app.documents_count = 0

  # Prefer the associated-documents link; only look at the tab when that
  # link is absent altogether.
  documents_link = res.at('.associateddocument a') || res.at('#tab_documents')
  return unless documents_link

  count = documents_link.inner_text[/\d+/]
  return unless count

  app.documents_count = count.to_i
  app.documents_url = base_url + documents_link[:href]
end

# Stores one label/value pair from the details table on the matching
# attribute of +app+; unknown labels are reported on stdout.
def assign_info_detail(app, key, value)
  case key
  when 'Reference'
    app.council_reference = value
  when 'Alternative Reference', 'Planning Portal Reference'
    app.alternative_reference = value unless value.empty?
  when 'Application Received', 'Application Registered'
    app.date_received = Date.parse(value) if value.match(/\d/)
  when 'Application Validated'
    app.date_validated = Date.parse(value) if value.match(/\d/)
  when 'Address'
    app.address = value unless value.empty?
  when 'Proposal'
    app.description = value unless value.empty?
  when 'Status'
    app.status = value unless value.empty?
  when 'Decision'
    app.decision = value unless value.empty?
  when 'Decision Issued Date'
    app.date_decision = Date.parse(value) if value.match(/\d/)
  when 'Appeal Status'
    app.appeal_status = value unless value.empty?
  when 'Appeal Decision'
    app.appeal_decision = value unless value.empty?
  else
    puts "Error: key '#{key}' not found"
  end
end
| end # class | |||
| end | |||