| @@ -4,13 +4,20 @@ require 'pp' | |||||
| module UKPlanningScraper | module UKPlanningScraper | ||||
| class Authority | class Authority | ||||
| private | private | ||||
# Scheme and host portion of @url (e.g. "https://example.org"), memoized.
#
# Used as the prefix for hrefs scraped from result pages.
# The host is matched up to, but not including, the first "/" after the
# scheme, so a URL with no path component ("https://example.org") also
# works instead of raising NoMethodError on a nil match.
#
# Still raises NoMethodError when @url contains no http(s) URL at all.
def base_url
  @base_url ||= @url.match(%r{https?://[^/]+})[0]
end
# Shared Mechanize instance for this object, created on first use and
# reused on every later call.
def agent
  @agent = Mechanize.new unless @agent
  @agent
end
| def scrape_idox(params, options) | def scrape_idox(params, options) | ||||
| puts "Using Idox scraper." | puts "Using Idox scraper." | ||||
| base_url = @url.match(/(https?:\/\/.+?)\//)[1] | |||||
| apps = [] | apps = [] | ||||
| agent = Mechanize.new | |||||
| puts "Getting: #{@url}" | puts "Getting: #{@url}" | ||||
| page = agent.get(@url) # load the search form page | page = agent.get(@url) # load the search form page | ||||
| @@ -111,72 +118,77 @@ module UKPlanningScraper | |||||
| apps.each_with_index do |app, i| | apps.each_with_index do |app, i| | ||||
| sleep options[:delay] | sleep options[:delay] | ||||
| puts "#{i + 1} of #{apps.size}: #{app.info_url}" | puts "#{i + 1} of #{apps.size}: #{app.info_url}" | ||||
| res = agent.get(app.info_url) | |||||
| if res.code == '200' # That's a String not an Integer, ffs | |||||
| # Parse the summary tab for this app | |||||
| parse_info_url(app) if app.info_url | |||||
| end # scrape summary tab for apps | |||||
| apps | |||||
| end # scrape_idox | |||||
| app.scraped_at = Time.now | |||||
# Fetches the summary page at app.info_url and copies its details onto +app+.
#
# On an HTTP 200 response this sets app.scraped_at, the document count/URL,
# and every recognised field of the #simpleDetailsTable table. On any other
# status it prints "Error: HTTP <code>" and leaves +app+ untouched.
#
# @param app [Object] application record exposing info_url and the writers
#   assigned below (scraped_at=, documents_count=, council_reference=, ...)
def parse_info_url(app)
  res = agent.get(app.info_url)

  if res.code == '200' # The status code is compared as a String, not an Integer
    app.scraped_at = Time.now
    parse_documents_link(app, res)

    # We need to find values in the table by using the th labels.
    # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
    res.search('#simpleDetailsTable tr').each do |row|
      label = row.at('th')
      cell = row.at('td')
      # Skip rows lacking a th or td rather than raising NoMethodError on nil.
      next unless label && cell

      assign_summary_field(app, label.inner_text.strip, cell.inner_text.strip)
    end
  else
    puts "Error: HTTP #{res.code}"
  end
end

# Sets app.documents_count (default 0) and, when a count is shown, app.documents_url.
def parse_documents_link(app, res)
  # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
  # Bradford has #tab_documents but without the document count on it
  app.documents_count = 0

  # '#tab_documents' is only consulted when '.associateddocument a' is absent.
  link = res.at('.associateddocument a') || res.at('#tab_documents')
  count = link && link.inner_text.match(/\d+/)
  return unless count

  app.documents_count = count[0].to_i
  app.documents_url = base_url + link[:href]
end

# Stores one labelled summary-table value on the matching attribute of +app+.
# Blank text values and date values containing no digit are ignored.
def assign_summary_field(app, key, value)
  case key
  when 'Reference'
    app.council_reference = value
  when 'Alternative Reference', 'Planning Portal Reference'
    app.alternative_reference = value unless value.empty?
  when 'Application Received', 'Application Registered'
    app.date_received = Date.parse(value) if value.match(/\d/)
  when 'Application Validated'
    app.date_validated = Date.parse(value) if value.match(/\d/)
  when 'Address'
    app.address = value unless value.empty?
  when 'Proposal'
    app.description = value unless value.empty?
  when 'Status'
    app.status = value unless value.empty?
  when 'Decision'
    app.decision = value unless value.empty?
  when 'Decision Issued Date'
    app.date_decision = Date.parse(value) if value.match(/\d/)
  when 'Appeal Status'
    app.appeal_status = value unless value.empty?
  when 'Appeal Decision'
    app.appeal_decision = value unless value.empty?
  else
    puts "Error: key '#{key}' not found"
  end
end
| end # class | end # class | ||||
| end | end | ||||