North Ayrshire, Redbridge: added

пре 18 година · 825ac7d147
--- a/python_scrapers/NorthAyrshire.cgi
+++ b/python_scrapers/NorthAyrshire.cgi
@@ -0,0 +1,85 @@
 #!/usr/bin/perl -w

 use strict;
 use HTML::TreeBuilder;
 use File::Temp qw(tempfile);
 use LWP::Simple;
 use POSIX;
 use Encode;
 use CGI;
 use CGI::Carp;

 # Validate a CGI parameter that must be a non-negative decimal integer.
 #
 # Argument: $var - the raw parameter value (may be undef when absent).
 # Returns:  1 when $var is defined and consists solely of ASCII digits,
 #           0 otherwise.
 #
 # \A and \z anchor the whole string; a plain `$` would also accept a value
 # with a trailing newline such as "12\n".
 sub sanity_check {
 	my ($var) = @_;
 	return 0 unless defined $var;
 	return $var =~ /\A[0-9]+\z/ ? 1 : 0;
 }

 # Print a complete CGI response containing a North Ayrshire <planning>
 # document with an empty <applications> list, then abort the script.
 #
 # Arguments: $y, $m, $d - requested year, month and day; used only in the
 #                          die message
 #            $reason     - short explanation of why no results are returned
 #
 # Does not return: the final die() ends the script after the response has
 # been printed.
 sub no_results {
 	my ($y, $m, $d, $reason) = @_;
 	# Header, blank separator line and XML body are emitted in one heredoc.
 	print <<NIL;
 Content-type: text/xml

 <?xml version="1.0" encoding="UTF-8"?>
 <planning>
  <authority_name>North Ayrshire Council</authority_name>
  <authority_short_name>North Ayrshire</authority_short_name>
  <applications>
  </applications>
 </planning>
 NIL
 	# The message ends in "\n", so no "at FILE line N" suffix is appended.
 	die "$y/$m/$d failed: $reason\n";
 }

 # Main CGI flow:
 #   1. Validate the year/month/day query parameters.
 #   2. Find the "week ending" link for that date on the council index page.
 #   3. Find the PDF linked from that weekly page and download it.
 #   4. Run ./NorthAyrshire.pl on the downloaded file; it prints the XML body.
 my $cgi = CGI->new;

 my $year = $cgi->param("year");
 my $month = $cgi->param("month");
 my $day = $cgi->param("day");

 unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
 	print <<ERROR;
 Content-type: text/plain

 Need year, month, day parameters
 ERROR
 	exit 0;
 }

 my $tree = HTML::TreeBuilder->new;
 $tree->parse(decode_utf8(get('http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2||') or die "couldn't fetch index page"));
 $tree->eof;

 # Link text to search for, built from the requested date
 # (day of month, full month name, four-digit year).
 my $re = strftime('Planning Applications Received week ending %d %B %Y', 0, 0, 0, $day, $month-1, $year-1900);

 my ($day_link) = $tree->look_down(
 	"_tag", "a",
 	sub {  $_[0]->as_text =~ /$re/i }
 );
 $day_link or no_results($year, $month, $day, "Cannot find day link");

 my $day_absurl = 'http://www.north-ayrshire.gov.uk'.$day_link->attr('href');

 my $day_tree = HTML::TreeBuilder->new;
 $day_tree->parse(decode_utf8(get($day_absurl) or die "couldn't fetch day page"));
 $day_tree->eof;

 # The PDF is located via an <img> whose alt text ends in ".pdf"; the link
 # is taken to be that image's parent element.
 my ($pdf_img) = $day_tree->look_down(
 	"_tag", "img",
 	"alt", qr/\.pdf$/i
 );
 $pdf_img or die "couldn't find pdf image on day page";
 my $pdf_link = $pdf_img->parent;
 $pdf_link or die "couldn't find pdf link on day page";

 my $pdf_absurl = 'http://www.north-ayrshire.gov.uk'.$pdf_link->attr('href');

 # Fetch before creating the temp file so a failed download leaves nothing
 # behind; get() returns undef on failure.
 my $pdf_data = get($pdf_absurl);
 defined $pdf_data or no_results($year, $month, $day, "Cannot fetch pdf $pdf_absurl");

 my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
 binmode $fh; # PDF is binary data
 unless (print({$fh} $pdf_data) and close($fh)) {
 	my $write_error = $!;
 	unlink $filename;
 	die "cannot write temporary file $filename: $write_error";
 }

 # Unbuffer STDOUT so the header is certain to precede the helper's output,
 # which is written straight to the shared file descriptor.
 $| = 1;
 print "Content-type: text/xml\n\n";
 my $status = system("./NorthAyrshire.pl", $filename, $pdf_absurl);
 # system() returns -1 (reason in $!) when the program could not be started,
 # otherwise the wait status that is also available in $?.
 my $child_error = $status == -1 ? "could not run helper: $!" : "wait status $?";

 # Remove the temp file whether or not the helper succeeded.
 my $unlinked = unlink $filename;
 my $unlink_error = $!;

 $status and die "system failed: $child_error";
 $unlinked or die "cannot unlink temporary file $filename: $unlink_error";
--- a/python_scrapers/NorthAyrshire.pl
+++ b/python_scrapers/NorthAyrshire.pl
@@ -0,0 +1,66 @@
 #!/usr/bin/perl -w

 use strict;
 use XML::Writer;
 use Date::Parse;
 use POSIX;

 # Usage: NorthAyrshire.pl PDF_FILE INFO_URL
 #
 # Runs pdftotext on PDF_FILE and writes a <planning> XML document with one
 # <application> element per "Application No:" entry found in the text.
 # INFO_URL is copied verbatim into every entry's <info_url>.
 my $file = $ARGV[0];
 my $info_url = $ARGV[1];

 my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

 $writer->xmlDecl("UTF-8");

 $writer->startTag("planning");
 $writer->dataElement("authority_name", "North Ayrshire Council");
 $writer->dataElement("authority_short_name", "North Ayrshire");
 $writer->startTag("applications");

 # List-form pipe open: no shell is involved, so $file needs no quoting.
 open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
 while (my $line = <$fh>) {
 	if ($line =~ /^\s*Application No:\s*(\S+)/) {
 		my $refno = $1;
 		my $address = ""; my $proposal = ""; my $date_received = "";
 		# Reference to the field currently being accumulated; undef means
 		# the current text belongs to a field that is not recorded.
 		my $cur_field;
 		while (1) {
 			chomp $line;
 			$line =~ s/^\s+//; $line =~ s/\s+$//;
 			# A recognised label switches the target field and is stripped
 			# from the line; unlabelled lines continue the current field.
 			if ($line =~ s/^Location://) {
 				$cur_field = \$address;
 			} elsif ($line =~ s/^Description://) {
 				$cur_field = \$proposal;
 			} elsif ($line =~ s/^Date Registered://) {
 				$cur_field = \$date_received;
 			} elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^Ward://)) {
 				$cur_field = undef;
 			}
 			$line =~ s/^\s+//;
 			if (defined $cur_field) {
 				$$cur_field .= " " if $$cur_field ne "" and $line ne "";
 				$$cur_field .= $line;
 			}
 			# The entry ends at end of input or at a line starting with
 			# lowercase "application:".
 			# NOTE(review): this match is case-sensitive, so a following
 			# "Application No:" line does not end the entry - confirm
 			# against a real pdftotext dump.
 			last unless defined ($line = <$fh>);
 			last if $line =~ /^\s*application:/;
 		}
 		my $postcode = "None";
 		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
 			$postcode = $1;
 		}
 		# strptime() returns an empty list when it cannot parse the text and
 		# strftime() croaks when given only a format, which would abort the
 		# script in the middle of the XML document.  Emit an empty date
 		# instead.  Only the first six values (sec .. year) are passed on.
 		my @date_parts = strptime($date_received);
 		my $norm_date_received = "";
 		if (@date_parts) {
 			$norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @date_parts[0 .. 5]);
 		}

 		$writer->startTag("application");
 		$writer->dataElement("council_reference", $refno);
 		$writer->dataElement("address", $address);
 		$writer->dataElement("postcode", $postcode);
 		$writer->dataElement("description", $proposal);
 		$writer->dataElement("info_url", $info_url);
 		$writer->dataElement("comment_url", 'dcontrol@north-ayrshire.gov.uk');
 		$writer->dataElement("date_received", $norm_date_received);
 		$writer->endTag;
 	}
 }

 # Reap pdftotext.  A failure is only reported on STDERR so that the XML
 # document below is still closed properly.
 close $fh or warn "pdftotext failed: " . ($! ? $! : "exit status " . ($? >> 8)) . "\n";

 $writer->endTag;
 $writer->endTag;
 $writer->end;
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -38,3 +38,7 @@
 "Highland.cgi", "493"
 "Newport.pl", "493"
 "Newport.cgi", "493"
 "NorthAyrshire.pl", "493"
 "NorthAyrshire.cgi", "493"
 "Redbridge.pl", "493"
 "Redbridge.cgi", "493"
--- a/python_scrapers/Redbridge.cgi
+++ b/python_scrapers/Redbridge.cgi
@@ -0,0 +1,85 @@
 #!/usr/bin/perl -w

 use strict;
 use HTML::TreeBuilder;
 use File::Temp qw(tempfile);
 use LWP::Simple;
 use POSIX;
 use Encode;
 use CGI;
 use CGI::Carp;

 # Validate a CGI parameter that must be a non-negative decimal integer.
 #
 # Argument: $var - the raw parameter value (may be undef when absent).
 # Returns:  1 when $var is defined and consists solely of ASCII digits,
 #           0 otherwise.
 #
 # \A and \z anchor the whole string; a plain `$` would also accept a value
 # with a trailing newline such as "12\n".
 sub sanity_check {
 	my ($var) = @_;
 	return 0 unless defined $var;
 	return $var =~ /\A[0-9]+\z/ ? 1 : 0;
 }

 # Print a complete CGI response containing a Redbridge <planning> document
 # with an empty <applications> list, then abort the script.
 #
 # Arguments: $y, $m, $d - requested year, month and day; used only in the
 #                          die message
 #            $reason     - short explanation of why no results are returned
 #
 # Does not return: the final die() ends the script after the response has
 # been printed.
 sub no_results {
 	my ($y, $m, $d, $reason) = @_;
 	# Header, blank separator line and XML body are emitted in one heredoc.
 	print <<NIL;
 Content-type: text/xml

 <?xml version="1.0" encoding="UTF-8"?>
 <planning>
  <authority_name>London Borough of Redbridge</authority_name>
  <authority_short_name>Redbridge</authority_short_name>
  <applications>
  </applications>
 </planning>
 NIL
 	# The message ends in "\n", so no "at FILE line N" suffix is appended.
 	die "$y/$m/$d failed: $reason\n";
 }

 # Main CGI flow:
 #   1. Validate the year/month/day query parameters.
 #   2. Find the "Planning Applications Received <year>" link on the index page.
 #   3. On that year page, find the link for the requested day and download
 #      the PDF it points to.
 #   4. Run ./Redbridge.pl on the downloaded file; it prints the XML body.
 my $cgi = CGI->new;

 my $year = $cgi->param("year");
 my $month = $cgi->param("month");
 my $day = $cgi->param("day");

 unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
 	print <<ERROR;
 Content-type: text/plain

 Need year, month, day parameters
 ERROR
 	exit 0;
 }

 my $tree = HTML::TreeBuilder->new;
 $tree->parse(decode_utf8(get('http://www.redbridge.gov.uk/cms/environment__planning/planning_and_regeneration/planning_dc.aspx') or die "couldn't fetch index page"));
 $tree->eof;

 my $re = sprintf('Planning Applications Received %d', $year);

 my ($year_link) = $tree->look_down(
 	"_tag", "a",
 	sub {  $_[0]->as_text =~ /$re/i }
 );
 $year_link or no_results($year, $month, $day, "Cannot find year link");

 my $year_absurl = 'http://www.redbridge.gov.uk'.$year_link->attr('href');

 my $year_tree = HTML::TreeBuilder->new;
 $year_tree->parse(decode_utf8(get($year_absurl) or die "couldn't fetch year page"));
 $year_tree->eof;

 # Pattern for the day's link text: day of month followed by optional
 # letters (an ordinal suffix), month name and year.  %e pads single-digit
 # days with a space, so every run of spaces is turned into \s+.
 my $day_re = strftime('Received %e[a-z]* %B %Y', 0, 0, 0, $day, $month-1, $year-1900);
 $day_re =~ s/ +/\\s+/g;
 my ($pdf_link) = $year_tree->look_down(
 	"_tag", "a",
 	sub { $_[0]->as_text =~ /$day_re/i }
 );
 $pdf_link or no_results($year, $month, $day, "Cannot find day link");

 my $pdf_absurl = 'http://www.redbridge.gov.uk/cms/'.$pdf_link->attr('href');

 # Fetch before creating the temp file so a failed download leaves nothing
 # behind; get() returns undef on failure.
 my $pdf_data = get($pdf_absurl);
 defined $pdf_data or no_results($year, $month, $day, "Cannot fetch pdf $pdf_absurl");

 my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
 binmode $fh; # PDF is binary data
 unless (print({$fh} $pdf_data) and close($fh)) {
 	my $write_error = $!;
 	unlink $filename;
 	die "cannot write temporary file $filename: $write_error";
 }

 # Unbuffer STDOUT so the header is certain to precede the helper's output,
 # which is written straight to the shared file descriptor.
 $| = 1;
 print "Content-type: text/xml\n\n";
 my $status = system("./Redbridge.pl", $filename, $pdf_absurl);
 # system() returns -1 (reason in $!) when the program could not be started,
 # otherwise the wait status that is also available in $?.
 my $child_error = $status == -1 ? "could not run helper: $!" : "wait status $?";

 # Remove the temp file whether or not the helper succeeded.
 my $unlinked = unlink $filename;
 my $unlink_error = $!;

 $status and die "system failed: $child_error";
 $unlinked or die "cannot unlink temporary file $filename: $unlink_error";
--- a/python_scrapers/Redbridge.pl
+++ b/python_scrapers/Redbridge.pl
@@ -0,0 +1,115 @@
 #!/usr/bin/perl -w

 use strict;
 use XML::Writer;
 use Date::Parse;
 use POSIX;

 # Usage: Redbridge.pl PDF_FILE INFO_URL
 #
 # Runs pdftotext -layout on PDF_FILE and writes a <planning> XML document
 # with one <application> element per "App.No: / Location: / Agent" table
 # entry.  Fields are cut out of each text line by column offset.
 # INFO_URL is copied verbatim into every entry's <info_url>.
 my $file = $ARGV[0];
 my $info_url = $ARGV[1];

 my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

 $writer->xmlDecl("UTF-8");

 $writer->startTag("planning");
 $writer->dataElement("authority_name", "London Borough of Redbridge");
 $writer->dataElement("authority_short_name", "Redbridge");
 $writer->startTag("applications");

 # List-form pipe open: no shell is involved, so $file needs no quoting.
 open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
 while (my $line = <$fh>) {
 	# Strip page-break form feeds.  The pattern is spelled \f explicitly:
 	# an empty pattern (s///) would make Perl reuse the last successfully
 	# matched regex instead of matching the control character.
 	$line =~ s/\f//g;
 	chomp $line;
 	if ($line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/) {
 		# Start offsets of the three column headings on the header line;
 		# the rows that follow are sliced at these positions.
 		my $ofs_col1 = $-[1];
 		my $ofs_col2 = $-[2];
 		my $ofs_col3 = $-[3];
 		my $col1_full = ""; # sometimes col1 headings break up onto 2 lines so grab the whole thing and match on it
 		my $address = ""; my $proposal = "";
 		# Column 2 holds the address first; after the first empty column-2
 		# line it holds the proposal, and after the next one it is ignored.
 		my $cur_field = \$address;
 		my $near_end = 0; my $redo_outer = 0;
 		while ($line = <$fh>) {
 			# A form feed means a new page started.  Alignment may have
 			# changed for the new page, so take care of adjustments of at
 			# most 2.  We may lose a few characters but luckily a page break
 			# during an entry doesn't happen very often (only 4 times during
 			# 2005-July 2008).
 			if ($line =~ s/\f//g) {
 				$ofs_col2 -= 2;
 				$ofs_col3 -= 2;
 			}

 			# The next entry's header line ends this entry; the outer loop
 			# is re-run on the same line via redo further down.
 			if ($line =~ /^\s*App\.No:/) {
 				$redo_outer = 1;
 				last;
 			}

 			chomp $line;
 			if ($line =~ /^\s{10,}[0-9]+\s*$/) { # a line with a page number... ignore unless near end
 				if ($near_end) {
 					last;
 				} else {
 					next;
 				}
 			}
 			my $col1; my $col2;
 			if (length $line > $ofs_col1) {
 				$col1 = substr $line, $ofs_col1, $ofs_col2-$ofs_col1;
 			} else {
 				$col1 = "";
 			}
 			if (length $line > $ofs_col2) {
 				$col2 = substr $line, $ofs_col2, $ofs_col3-$ofs_col2;
 			} else {
 				$col2 = "";
 			}
 			
 			$col2 =~ s/\s{10,}\S.*//g; # remove any obvious spillover text (only needed for a page break during an entry)

 			$col1 =~ s/^\s+//; $col1 =~ s/\s+$//;
 			$col2 =~ s/^\s+//; $col2 =~ s/\s+$//;

 			$col1_full .= " " if $col1_full ne "" and $col1 ne "";
 			$col1_full .= $col1;

 			if ($col2 eq "") {
 				if ($proposal eq "") {
 					$cur_field = \$proposal;
 				} else {
 					$cur_field = undef;
 				}
 			} elsif (defined $cur_field) {
 				$$cur_field .= " " if $$cur_field ne "" and $col2 ne "";
 				$$cur_field .= $col2;
 			}
 			# Once "Case Officer" has been seen in column 1, the next blank
 			# line (or page-number line, above) closes the entry.
 			last if $near_end and $line =~ /^\s*$/;
 			$near_end = 1 if $col1_full =~ /Case Officer/;
 		}
 		# Column 1 is expected to read "<reference> Deposit Date: <date> ...".
 		my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/);
 		$refno ||= ""; $date_received ||= "";
 		# Re-join a reference that was wrapped across lines: drop spaces
 		# together with a hyphen directly before them.
 		$refno =~ s/-? //g;

 		my $postcode = "None";
 		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
 			$postcode = $1;
 		}
 		# strptime() returns an empty list when it cannot parse the text
 		# (including the "" fallback above) and strftime() croaks when given
 		# only a format, which would abort the script in the middle of the
 		# XML document.  Emit an empty date instead.  Only the first six
 		# values (sec .. year) are passed on.
 		my @date_parts = strptime($date_received);
 		my $norm_date_received = "";
 		if (@date_parts) {
 			$norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @date_parts[0 .. 5]);
 		}

 		$writer->startTag("application");
 		$writer->dataElement("council_reference", $refno);
 		$writer->dataElement("address", $address);
 		$writer->dataElement("postcode", $postcode);
 		$writer->dataElement("description", $proposal);
 		$writer->dataElement("info_url", $info_url);
 		$writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk');
 		$writer->dataElement("date_received", $norm_date_received);
 		$writer->endTag;

 		# $line still holds the header of the next entry: restart the outer
 		# loop body on it without reading another line.
 		if ($redo_outer) {
 			$redo_outer = 0;
 			redo;
 		}
 	}
 }

 # Reap pdftotext.  A failure is only reported on STDERR so that the XML
 # document below is still closed properly.
 close $fh or warn "pdftotext failed: " . ($! ? $! : "exit status " . ($? >> 8)) . "\n";

 $writer->endTag;
 $writer->endTag;
 $writer->end;