| @@ -37,7 +37,7 @@ module UKPlanningScraper | |||||
| appeal_decision: @appeal_decision | appeal_decision: @appeal_decision | ||||
| } | } | ||||
| end | end | ||||
| def valid? | def valid? | ||||
| return true if @authority_name && @council_reference && @info_url | return true if @authority_name && @council_reference && @info_url | ||||
| false | false | ||||
| @@ -3,7 +3,7 @@ require 'csv' | |||||
| module UKPlanningScraper | module UKPlanningScraper | ||||
| class Authority | class Authority | ||||
| attr_reader :name, :url | attr_reader :name, :url | ||||
| @@authorities = [] | @@authorities = [] | ||||
| def initialize(name, url) | def initialize(name, url) | ||||
| @@ -31,7 +31,7 @@ module UKPlanningScraper | |||||
| raise SystemNotSupported.new("Planning system not supported for \ | raise SystemNotSupported.new("Planning system not supported for \ | ||||
| #{@name} at URL: #{@url}") | #{@name} at URL: #{@url}") | ||||
| end | end | ||||
| # Post processing | # Post processing | ||||
| @applications.each do |app| | @applications.each do |app| | ||||
| app.authority_name = @name | app.authority_name = @name | ||||
| @@ -41,32 +41,32 @@ module UKPlanningScraper | |||||
| output = [] | output = [] | ||||
| # FIXME - silently ignores invalid apps. How should we handle them? | # FIXME - silently ignores invalid apps. How should we handle them? | ||||
| @applications.each { |app| output << app.to_hash if app.valid? } | @applications.each { |app| output << app.to_hash if app.valid? } | ||||
| # Reset so that old params don't get used for new scrapes | # Reset so that old params don't get used for new scrapes | ||||
| clear_scrape_params | clear_scrape_params | ||||
| output # Single point of successful exit | output # Single point of successful exit | ||||
| end | end | ||||
# Returns this authority's tags in sorted order.
#
# A new array is returned; the underlying @tags list is left untouched.
#
# @return [Array] sorted copy of @tags
def tags
  sorted = @tags.dup
  sorted.sort!
end
| # Add multiple tags to existing tags | # Add multiple tags to existing tags | ||||
# Feeds every element of +tags+ through #add_tag, in order.
#
# @param tags [Array<String>] raw tags to add
# @return [Array<String>] the +tags+ collection that was passed in
def add_tags(tags)
  tags.each(&method(:add_tag))
end
| # Add a single tag to existing tags | # Add a single tag to existing tags | ||||
# Normalises +tag+ and appends it to @tags unless it is already present.
#
# Normalisation strips surrounding whitespace, lower-cases the tag and
# removes every space character. A tag that is empty after normalisation
# is ignored, so blank input never ends up stored as an empty-string tag.
#
# @param tag [String] raw tag text
# @return [Array<String>, nil] @tags when the tag was added, otherwise nil
def add_tag(tag)
  clean_tag = tag.strip.downcase.delete(' ')
  return if clean_tag.empty? # a blank tag carries no information

  @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
end
# Reports whether +tag+ is already in this authority's tag list.
#
# The comparison is exact: +tag+ is not normalised before the lookup.
#
# @param tag [String] tag to look for
# @return [Boolean] true when @tags contains +tag+
def tagged?(tag)
  @tags.member?(tag)
end
| def system | def system | ||||
| if @url.match(/search\.do\?action=advanced/i) | if @url.match(/search\.do\?action=advanced/i) | ||||
| 'idox' | 'idox' | ||||
| @@ -84,18 +84,18 @@ module UKPlanningScraper | |||||
# Returns the class-level list of authorities.
#
# This is the @@authorities array itself, not a copy.
#
# @return [Array] the @@authorities array
def self.all
  @@authorities
end
| # List all the tags in use | # List all the tags in use | ||||
# Collects the tags of every authority in @@authorities into one sorted
# list with duplicates removed.
#
# @return [Array] sorted, de-duplicated tags across all authorities
def self.tags
  @@authorities.flat_map(&:tags).uniq.sort
end
# Looks up the first authority in @@authorities whose name equals +name+.
#
# @param name [String] authority name to match exactly
# @return [Authority] the matching authority
# @raise [AuthorityNotFound] when no authority has that name
def self.named(name)
  match = @@authorities.find { |candidate| name == candidate.name }
  return match unless match.nil?

  raise AuthorityNotFound
end
| # Tagged x | # Tagged x | ||||
| @@ -125,11 +125,11 @@ module UKPlanningScraper | |||||
| CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \ | CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \ | ||||
| 'authorities.csv'), :headers => true) do |line| | 'authorities.csv'), :headers => true) do |line| | ||||
| auth = Authority.new(line['authority_name'], line['url']) | auth = Authority.new(line['authority_name'], line['url']) | ||||
| if line['tags'] | if line['tags'] | ||||
| auth.add_tags(line['tags'].split(/\s+/)) | auth.add_tags(line['tags'].split(/\s+/)) | ||||
| end | end | ||||
| auth.add_tag(auth.system) | auth.add_tag(auth.system) | ||||
| @@authorities << auth | @@authorities << auth | ||||
| end | end | ||||
| @@ -4,7 +4,7 @@ module UKPlanningScraper | |||||
| class Authority | class Authority | ||||
| # Parameter methods for Authority#scrape | # Parameter methods for Authority#scrape | ||||
| # Designed to be method chained, eg: | # Designed to be method chained, eg: | ||||
| # | |||||
| # | |||||
| # applications = UKPlanningScraper::Authority.named("Barnet"). \ | # applications = UKPlanningScraper::Authority.named("Barnet"). \ | ||||
| # development_type("Q22").keywords("illuminat"). \ | # development_type("Q22").keywords("illuminat"). \ | ||||
| # validated_days(30).scrape | # validated_days(30).scrape | ||||
| @@ -17,7 +17,7 @@ module UKPlanningScraper | |||||
| unless n > 0 | unless n > 0 | ||||
| raise ArgumentError.new("validated_days must be greater than 0") | raise ArgumentError.new("validated_days must be greater than 0") | ||||
| end | end | ||||
| validated_from(Date.today - (n - 1)) | validated_from(Date.today - (n - 1)) | ||||
| validated_to(Date.today) | validated_to(Date.today) | ||||
| self | self | ||||
| @@ -31,7 +31,7 @@ module UKPlanningScraper | |||||
| unless n > 0 | unless n > 0 | ||||
| raise ArgumentError.new("received_days must be greater than 0") | raise ArgumentError.new("received_days must be greater than 0") | ||||
| end | end | ||||
| received_from(Date.today - (n - 1)) | received_from(Date.today - (n - 1)) | ||||
| received_to(Date.today) | received_to(Date.today) | ||||
| self | self | ||||
| @@ -45,18 +45,18 @@ module UKPlanningScraper | |||||
| unless n > 0 | unless n > 0 | ||||
| raise ArgumentError.new("decided_days must be greater than 0") | raise ArgumentError.new("decided_days must be greater than 0") | ||||
| end | end | ||||
| decided_from(Date.today - (n - 1)) | decided_from(Date.today - (n - 1)) | ||||
| decided_to(Date.today) | decided_to(Date.today) | ||||
| self | self | ||||
| end | end | ||||
| def applicant_name(s) | def applicant_name(s) | ||||
| unless system == 'idox' | unless system == 'idox' | ||||
| raise NoMethodError.new("applicant_name is only implemented for Idox. \ | raise NoMethodError.new("applicant_name is only implemented for Idox. \ | ||||
| This authority (#{@name}) is #{system.capitalize}.") | This authority (#{@name}) is #{system.capitalize}.") | ||||
| end | end | ||||
| check_class(s, String) | check_class(s, String) | ||||
| @scrape_params[:applicant_name] = s.strip | @scrape_params[:applicant_name] = s.strip | ||||
| self | self | ||||
| @@ -67,7 +67,7 @@ module UKPlanningScraper | |||||
| raise NoMethodError.new("application_type is only implemented for \ | raise NoMethodError.new("application_type is only implemented for \ | ||||
| Idox. This authority (#{@name}) is #{system.capitalize}.") | Idox. This authority (#{@name}) is #{system.capitalize}.") | ||||
| end | end | ||||
| check_class(s, String) | check_class(s, String) | ||||
| @scrape_params[:application_type] = s.strip | @scrape_params[:application_type] = s.strip | ||||
| self | self | ||||
| @@ -78,14 +78,14 @@ module UKPlanningScraper | |||||
| raise NoMethodError.new("development_type is only implemented for \ | raise NoMethodError.new("development_type is only implemented for \ | ||||
| Idox. This authority (#{@name}) is #{system.capitalize}.") | Idox. This authority (#{@name}) is #{system.capitalize}.") | ||||
| end | end | ||||
| check_class(s, String) | check_class(s, String) | ||||
| @scrape_params[:development_type] = s.strip | @scrape_params[:development_type] = s.strip | ||||
| self | self | ||||
| end | end | ||||
| private | private | ||||
| # Handle the simple params with this | # Handle the simple params with this | ||||
| def method_missing(method_name, *args) | def method_missing(method_name, *args) | ||||
| sc_params = { | sc_params = { | ||||
| @@ -97,18 +97,18 @@ module UKPlanningScraper | |||||
| decided_to: Date, | decided_to: Date, | ||||
| keywords: String | keywords: String | ||||
| } | } | ||||
| value = args[0] | value = args[0] | ||||
| if sc_params[method_name] | if sc_params[method_name] | ||||
| check_class(value, sc_params[method_name], method_name.to_s) | check_class(value, sc_params[method_name], method_name.to_s) | ||||
| value.strip! if value.class == String | value.strip! if value.class == String | ||||
| if value.class == Date && value > Date.today | if value.class == Date && value > Date.today | ||||
| raise ArgumentError.new("#{method_name} can't be a date in the " + \ | raise ArgumentError.new("#{method_name} can't be a date in the " + \ | ||||
| "future (#{value.to_s})") | "future (#{value.to_s})") | ||||
| end | end | ||||
| @scrape_params[method_name] = value | @scrape_params[method_name] = value | ||||
| self | self | ||||
| else | else | ||||
| @@ -119,7 +119,7 @@ module UKPlanningScraper | |||||
# Replaces @scrape_params with a fresh empty hash, discarding any
# parameters that were set before.
#
# @return [Hash] the new, empty parameter hash
def clear_scrape_params
  @scrape_params = Hash.new
end
| # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method | # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method | ||||
| def check_class( | def check_class( | ||||
| param_value, | param_value, | ||||
| @@ -7,7 +7,7 @@ module UKPlanningScraper | |||||
| def scrape_idox(params, options) | def scrape_idox(params, options) | ||||
| puts "Using Idox scraper." | puts "Using Idox scraper." | ||||
| base_url = @url.match(/(https?:\/\/.+?)\//)[1] | base_url = @url.match(/(https?:\/\/.+?)\//)[1] | ||||
| apps = [] | apps = [] | ||||
| agent = Mechanize.new | agent = Mechanize.new | ||||
| @@ -31,7 +31,7 @@ module UKPlanningScraper | |||||
| }.each { |f| form.add_field!(f) unless form.has_field?(f) } | }.each { |f| form.add_field!(f) unless form.has_field?(f) } | ||||
| date_format = "%d/%m/%Y" | date_format = "%d/%m/%Y" | ||||
| form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from] | form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from] | ||||
| form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to] | form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to] | ||||
| @@ -42,12 +42,12 @@ module UKPlanningScraper | |||||
| form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to] | form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to] | ||||
| form.send(:"searchCriteria\.description", params[:keywords]) | form.send(:"searchCriteria\.description", params[:keywords]) | ||||
| # Some councils don't have the applicant name on their form, eg Bexley | # Some councils don't have the applicant name on their form, eg Bexley | ||||
| form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' | form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName' | ||||
| form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType' | form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType' | ||||
| # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter | # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter | ||||
| form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' | form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType' | ||||
| @@ -56,7 +56,7 @@ module UKPlanningScraper | |||||
| if page.search('.errors').inner_text.match(/Too many results found/i) | if page.search('.errors').inner_text.match(/Too many results found/i) | ||||
| raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.") | raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.") | ||||
| end | end | ||||
| loop do | loop do | ||||
| # Parse search results | # Parse search results | ||||
| items = page.search('li.searchresult') | items = page.search('li.searchresult') | ||||
| @@ -69,7 +69,7 @@ module UKPlanningScraper | |||||
| # Parse info line | # Parse info line | ||||
| info_line = app.at("p.metaInfo").inner_text.strip | info_line = app.at("p.metaInfo").inner_text.strip | ||||
| bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } | bits = info_line.split('|').map { |e| e.strip.delete("\r\n") } | ||||
| bits.each do |bit| | bits.each do |bit| | ||||
| if matches = bit.match(/Ref\. No:\s+(.+)/) | if matches = bit.match(/Ref\. No:\s+(.+)/) | ||||
| data.council_reference = matches[1] | data.council_reference = matches[1] | ||||
| @@ -78,7 +78,7 @@ module UKPlanningScraper | |||||
| if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | ||||
| data.date_received = Date.parse(matches[2]) | data.date_received = Date.parse(matches[2]) | ||||
| end | end | ||||
| if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/) | ||||
| data.date_validated = Date.parse(matches[1]) | data.date_validated = Date.parse(matches[1]) | ||||
| end | end | ||||
| @@ -92,10 +92,10 @@ module UKPlanningScraper | |||||
| data.info_url = base_url + app.at('a')['href'] | data.info_url = base_url + app.at('a')['href'] | ||||
| data.address = app.at('p.address').inner_text.strip | data.address = app.at('p.address').inner_text.strip | ||||
| data.description = app.at('a').inner_text.strip | data.description = app.at('a').inner_text.strip | ||||
| apps << data | apps << data | ||||
| end | end | ||||
| # Get the Next button from the pager, if there is one | # Get the Next button from the pager, if there is one | ||||
| if next_button = page.at('a.next') | if next_button = page.at('a.next') | ||||
| next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' | next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100' | ||||
| @@ -106,13 +106,13 @@ module UKPlanningScraper | |||||
| break | break | ||||
| end | end | ||||
| end | end | ||||
| # Scrape the summary tab for each app | # Scrape the summary tab for each app | ||||
| apps.each_with_index do |app, i| | apps.each_with_index do |app, i| | ||||
| sleep options[:delay] | sleep options[:delay] | ||||
| puts "#{i + 1} of #{apps.size}: #{app.info_url}" | puts "#{i + 1} of #{apps.size}: #{app.info_url}" | ||||
| res = agent.get(app.info_url) | res = agent.get(app.info_url) | ||||
| if res.code == '200' # That's a String not an Integer, ffs | if res.code == '200' # That's a String not an Integer, ffs | ||||
| # Parse the summary tab for this app | # Parse the summary tab for this app | ||||
| @@ -133,14 +133,14 @@ module UKPlanningScraper | |||||
| app.documents_url = base_url + documents_link[:href] | app.documents_url = base_url + documents_link[:href] | ||||
| end | end | ||||
| end | end | ||||
| # We need to find values in the table by using the th labels. | # We need to find values in the table by using the th labels. | ||||
| # The row indexes/positions change from site to site (or even app to app) so we can't rely on that. | # The row indexes/positions change from site to site (or even app to app) so we can't rely on that. | ||||
| res.search('#simpleDetailsTable tr').each do |row| | res.search('#simpleDetailsTable tr').each do |row| | ||||
| key = row.at('th').inner_text.strip | key = row.at('th').inner_text.strip | ||||
| value = row.at('td').inner_text.strip | value = row.at('td').inner_text.strip | ||||
| case key | case key | ||||
| when 'Reference' | when 'Reference' | ||||
| app.council_reference = value | app.council_reference = value | ||||
| @@ -8,10 +8,10 @@ module UKPlanningScraper | |||||
| def scrape_northgate(params, options) | def scrape_northgate(params, options) | ||||
| puts "Using Northgate scraper." | puts "Using Northgate scraper." | ||||
| base_url = @url.match(/(https?:\/\/.+?)\//)[1] | base_url = @url.match(/(https?:\/\/.+?)\//)[1] | ||||
| # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive? | # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive? | ||||
| generic_url = @url.match(/.+\//)[0] + 'Generic/' | generic_url = @url.match(/.+\//)[0] + 'Generic/' | ||||
| apps = [] | apps = [] | ||||
| $stdout.sync = true # Flush output buffer after every write so log messages appear immediately. | $stdout.sync = true # Flush output buffer after every write so log messages appear immediately. | ||||