Add Peter's scrapers for Highland, Glasgow, and Brentwood.

před 18 roky · 9a3b38c325
--- a/python_scrapers/Brentwood.cgi
+++ b/python_scrapers/Brentwood.cgi
@@ -0,0 +1,55 @@
 #!/usr/bin/perl -w

 use strict;

 use LWP::Simple;
 use File::Temp qw(tempfile);
 use POSIX;
 use CGI;

 # CGI entry point for Brentwood.  Given year/month/day query parameters it
 # looks on the council's index page for a PDF link followed by that date,
 # downloads the PDF to a temporary file and runs ./Brentwood.pl on it; the
 # helper writes the XML document to stdout.
 my $cgi = CGI->new;

 my $year = $cgi->param("year");
 my $month = $cgi->param("month");
 my $day = $cgi->param("day");

 # The three values go straight into strftime/sprintf arithmetic, so require
 # plain digits rather than merely "defined".
 my $bad_params = grep { !defined $_ || $_ !~ /\A[0-9]+\z/ } ($year, $month, $day);
 if ($bad_params) {
 	print <<ERROR;
 Content-type: text/plain

 Need year, month, day parameters
 ERROR
 	exit 0;
 }

 my $html = get('http://www.brentwood-council.gov.uk/index.php?cid=573');

 my $date = strftime("%d %B %Y", 0, 0, 0, $day, $month-1, $year-1900);

 # quick and dirty: take the PDF link that is followed (possibly after some
 # tags) by the requested date.  get() returns undef on failure, so only
 # attempt the match when the index page was actually fetched.
 my $url;
 ($url) = ($html =~ /(http:\/\/[^"]*\.pdf)[^<]*(<[^>]*>)*[^<]*$date/) if defined $html;

 # Download the PDF before committing to a response, so that a missing link
 # and a failed download both end in the same empty result document.
 my $pdf = defined $url ? get($url) : undef;
 unless (defined $pdf) {
 	print <<NIL;
 Content-type: text/xml

 <?xml version="1.0" encoding="UTF-8"?>
 <planning>
  <authority_name>Brentwood Borough Council</authority_name>
  <authority_short_name>Brentwood</authority_short_name>
  <applications>
  </applications>
 </planning>
 NIL
 	exit 0;
 }

 my $dmy = sprintf("%02d/%02d/%04d", $day, $month, $year);

 # UNLINK => 1 removes the file at exit even if we bail out before the
 # explicit unlink below.
 my ($fh, $filename) = tempfile(SUFFIX => ".pdf", UNLINK => 1);
 binmode $fh;	# PDF data is binary
 print $fh $pdf;
 close($fh) or die "cannot write $filename: $!";

 print "Content-type: text/xml\n\n";
 # List form: no shell is involved, so the URL cannot be misparsed.
 system("./Brentwood.pl", $filename, $url, $dmy) == 0
 	or warn "Brentwood.pl failed: $?\n";

 unlink $filename;
--- a/python_scrapers/Brentwood.pl
+++ b/python_scrapers/Brentwood.pl
@@ -0,0 +1,72 @@
 #!/usr/bin/perl -w

 use strict;
 use XML::Writer;

 # Usage: Brentwood.pl <pdf-file> <info-url> <date>
 # Converts the PDF to text with "pdftotext -layout" and writes a <planning>
 # XML document to stdout with one <application> per entry found.  The
 # info-url and date arguments are copied verbatim into every application.
 my $file = $ARGV[0];
 my $info_url = $ARGV[1];
 my $date = $ARGV[2];

 my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

 $writer->xmlDecl("UTF-8");

 $writer->startTag("planning");
 $writer->dataElement("authority_name", "Brentwood Borough Council");
 $writer->dataElement("authority_short_name", "Brentwood");
 $writer->startTag("applications");

 # List-form pipe open: the file name never passes through a shell, and a
 # failure to start pdftotext is reported instead of being ignored.
 open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
 while (my $line = <$fh>) {
 	chomp $line;
 	# Strip form feeds explicitly.  An empty pattern (s///) would re-apply
 	# whichever regex last matched and could eat leading spaces, breaking
 	# the column offsets used below.
 	$line =~ s/\f//g;
 	if ($line =~ /Address:/) {
 		# An entry starts on the line holding "Address:".  Everything left
 		# of that label is the reference number; $-[0] is the offset at
 		# which the label matched and is used as the start of column 2.
 		my $ofs_col2 = $-[0];
 		my $refno = substr $line, 0, $ofs_col2;
 		$refno =~ s/ +$//g;
 		my $address = ""; my $proposal = "";
 		my $cur_field;	# reference to the field continuation lines append to
 		while (1) {
 			if (length($line) > $ofs_col2) {
 				my $col2 = substr $line, $ofs_col2;
 				$col2 =~ s/^ +//;
 				# The patterns accept labels with up to three leading
 				# characters missing (e.g. "dress:"), presumably because a
 				# label can start slightly left of the column offset.
 				if ($col2 =~ s/^((A?d)?d)?ress://) {
 					$cur_field = \$address;
 				} elsif ($col2 =~ s/^((P?r)?o)?posal://) {
 					$cur_field = \$proposal;
 				} elsif ($col2 =~ s/^((A?p)?p)?licant://) {
 					$cur_field = undef;
 				} elsif ($col2 =~ s/^((A?g)?e)?nt://) {
 					$cur_field = undef;
 				}
 				$col2 =~ s/^ +//; $col2 =~ s/ +$//;
 				if (defined $cur_field) {
 					$$cur_field .= " " if $$cur_field ne "";
 					$$cur_field .= $col2;
 				}
 			}
 			last unless defined ($line = <$fh>);
 			chomp $line;
 			$line =~ s/\f//g;
 			# A blank line ends the entry.
 			last if length $line == 0;
 		}
 		# First thing in the address that looks like a UK postcode.
 		my $postcode = "None";
 		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
 			$postcode = $1;
 		}

 		$writer->startTag("application");
 		$writer->dataElement("council_reference", $refno);
 		$writer->dataElement("address", $address);
 		$writer->dataElement("postcode", $postcode);
 		$writer->dataElement("description", $proposal);
 		$writer->dataElement("info_url", $info_url);
 		$writer->dataElement("comment_url", "planning\@brentwood.gov.uk");
 		$writer->dataElement("date_received", $date);
 		$writer->endTag;
 	}
 }
 # Closing a pipe handle reports the child's exit status; still emit a
 # well-formed document, but make the failure visible on stderr.
 close($fh) or warn "pdftotext did not exit cleanly (status $?)\n";

 $writer->endTag;
 $writer->endTag;
 $writer->end;
--- a/python_scrapers/Glasgow.cgi
+++ b/python_scrapers/Glasgow.cgi
@@ -0,0 +1,54 @@
 #!/usr/bin/perl -w

 use strict;

 use LWP::Simple;
 use File::Temp qw(tempfile);
 use POSIX;
 use CGI;

 # CGI entry point for Glasgow.  Given year/month/day query parameters it
 # looks on the council's index page for a PDF link whose text contains a
 # date range ending on that date, downloads the PDF to a temporary file and
 # runs ./Glasgow.pl on it; the helper writes the XML document to stdout.
 my $cgi = CGI->new;

 my $year = $cgi->param("year");
 my $month = $cgi->param("month");
 my $day = $cgi->param("day");

 # The three values go straight into sprintf arithmetic, so require plain
 # digits rather than merely "defined".
 my $bad_params = grep { !defined $_ || $_ !~ /\A[0-9]+\z/ } ($year, $month, $day);
 if ($bad_params) {
 	print <<ERROR;
 Content-type: text/plain

 Need year, month, day parameters
 ERROR
 	exit 0;
 }

 my $html = get('http://www.glasgow.gov.uk/en/Business/Planning_Development/DevelopmentControl/Sitehistorysearches/');

 my $date = sprintf("%02d/%02d/%02d", $day, $month, $year % 100);

 # quick and dirty: site-relative PDF link followed by "dd/mm/yy - <date>".
 # get() returns undef on failure, so only attempt the match when the index
 # page was actually fetched.
 my $url;
 ($url) = ($html =~ /href="(\/[^"]*\.pdf)[^<]*[0-9]{2}\/[0-9]{2}\/[0-9]{2} - $date/) if defined $html;

 # Download the PDF before committing to a response, so that a missing link
 # and a failed download both end in the same empty result document.
 my $absurl;
 my $pdf;
 if (defined $url) {
 	$absurl = "http://www.glasgow.gov.uk$url";
 	$pdf = get($absurl);
 }
 unless (defined $pdf) {
 	print <<NIL;
 Content-type: text/xml

 <?xml version="1.0" encoding="UTF-8"?>
 <planning>
  <authority_name>Glasgow City Council</authority_name>
  <authority_short_name>Glasgow</authority_short_name>
  <applications>
  </applications>
 </planning>
 NIL
 	exit 0;
 }

 # UNLINK => 1 removes the file at exit even if we bail out before the
 # explicit unlink below.
 my ($fh, $filename) = tempfile(SUFFIX => ".pdf", UNLINK => 1);
 binmode $fh;	# PDF data is binary
 print $fh $pdf;
 close($fh) or die "cannot write $filename: $!";

 print "Content-type: text/xml\n\n";
 # List form: no shell is involved, so the URL cannot be misparsed.
 system("./Glasgow.pl", $filename, $absurl) == 0
 	or warn "Glasgow.pl failed: $?\n";

 unlink $filename;
--- a/python_scrapers/Glasgow.pl
+++ b/python_scrapers/Glasgow.pl
@@ -0,0 +1,63 @@
 #!/usr/bin/perl -w

 use strict;
 use XML::Writer;

 # Usage: Glasgow.pl <pdf-file> <info-url>
 # Converts the PDF to text with "pdftotext -layout" and writes a <planning>
 # XML document to stdout with one <application> per "Reference:" entry.
 # The info-url argument is copied verbatim into every application.
 my $file = $ARGV[0];
 my $info_url = $ARGV[1];

 my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

 $writer->xmlDecl("UTF-8");

 $writer->startTag("planning");
 $writer->dataElement("authority_name", "Glasgow City Council");
 $writer->dataElement("authority_short_name", "Glasgow");
 $writer->startTag("applications");

 # List-form pipe open: the file name never passes through a shell, and a
 # failure to start pdftotext is reported instead of being ignored.
 open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
 while (my $line = <$fh>) {
 	if ($line =~ /^\s*Reference:\s*(\S+)/) {
 		my $refno = $1;
 		# date_received starts as "" so an entry without a "Date Received:"
 		# line yields an empty element rather than an undef warning.
 		my $address = ""; my $proposal = ""; my $date_received = "";
 		my $cur_field;	# reference to the field continuation lines append to
 		while (1) {
 			chomp $line;
 			$line =~ s/^\s+//; $line =~ s/\s+$//;
 			# A recognised label switches the field being collected; the
 			# label itself is removed so only its value is appended.
 			if ($line =~ s/^Address://) {
 				$cur_field = \$address;
 			} elsif ($line =~ s/^Proposal://) {
 				$cur_field = \$proposal;
 			} elsif ($line =~ /^Date Received:\s*(\S+)/) {
 				$date_received = $1;
 				# Dots in the date are rewritten as slashes.
 				$date_received =~ s#\.#/#g;
 				$cur_field = undef;
 			}
 			$line =~ s/^\s+//;
 			if (defined $cur_field) {
 				$$cur_field .= " " if $$cur_field ne "";
 				$$cur_field .= $line;
 			}
 			# "Map Reference:" marks the last line of an entry.
 			last if $line =~ /Map Reference:/;
 			last unless defined ($line = <$fh>);
 		}
 		# First thing in the address that looks like a UK postcode.
 		my $postcode = "None";
 		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
 			$postcode = $1;
 		}

 		$writer->startTag("application");
 		$writer->dataElement("council_reference", $refno);
 		$writer->dataElement("address", $address);
 		$writer->dataElement("postcode", $postcode);
 		$writer->dataElement("description", $proposal);
 		$writer->dataElement("info_url", $info_url);
 		$writer->dataElement("comment_url", "planning.representations\@drs.glasgow.gov.uk");
 		$writer->dataElement("date_received", $date_received);
 		$writer->endTag;
 	}
 }
 # Closing a pipe handle reports the child's exit status; still emit a
 # well-formed document, but make the failure visible on stderr.
 close($fh) or warn "pdftotext did not exit cleanly (status $?)\n";

 $writer->endTag;
 $writer->endTag;
 $writer->end;
--- a/python_scrapers/Highland.cgi
+++ b/python_scrapers/Highland.cgi
@@ -0,0 +1,82 @@
 #!/usr/bin/perl -w

 use strict;
 use HTML::TreeBuilder;
 use File::Temp qw(tempfile);
 use LWP::Simple;
 use POSIX;
 use Encode;
 use CGI;
 use CGI::Carp;

 # Returns 1 when $var is defined and consists solely of one or more ASCII
 # digits, 0 otherwise.
 sub sanity_check {
 	my ($var) = @_;
 	return 0 unless defined $var;
 	# \A...\z instead of ^...$: "$" also matches just before a trailing
 	# newline, which would let a value such as "12\n" through.
 	return $var =~ /\A[0-9]+\z/ ? 1 : 0;
 }

 # Print a complete response containing an empty application list for
 # Highland, then die with a message naming the requested date and the
 # reason.  Arguments: year, month, day, reason text.
 sub no_results {
 	my $req_year = shift;
 	my $req_month = shift;
 	my $req_day = shift;
 	my $why = shift;
 	print <<EMPTY_RESULT;
 Content-type: text/xml

 <?xml version="1.0" encoding="UTF-8"?>
 <planning>
  <authority_name>Highland Council</authority_name>
  <authority_short_name>Highland</authority_short_name>
  <applications>
  </applications>
 </planning>
 EMPTY_RESULT
 	# The trailing newline keeps Perl from appending file/line information.
 	die "$req_year/$req_month/$req_day failed: $why\n";
 }

 # CGI entry point for Highland.  Given numeric year/month/day query
 # parameters it finds the month heading on the council's weekly-list index
 # page, then the link for the requested day in the element that follows the
 # heading, downloads that PDF to a temporary file and runs ./Highland.pl on
 # it; the helper writes the XML document to stdout.
 my $cgi = CGI->new;

 my $year = $cgi->param("year");
 my $month = $cgi->param("month");
 my $day = $cgi->param("day");

 unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
 	print <<ERROR;
 Content-type: text/plain

 Need year, month, day parameters
 ERROR
 	exit 0;
 }

 my $tree = HTML::TreeBuilder->new;
 # $tree->parse_file('weekly-planning-bw-lists.htm');
 $tree->parse(decode_utf8(get('http://www.highland.gov.uk/yourenvironment/planning/planningapplications/weekly-planning-bw-lists.htm') or die "couldn't fetch index page"));
 $tree->eof;

 # Month name and year separated by a space or a non-breaking space.
 my $monthyear_re = strftime('%B[ \xa0]%Y', 0, 0, 0, 1, $month-1, $year-1900);

 my ($month_h2) = $tree->look_down(
 	"_tag", "h2",
 	sub {  $_[0]->as_text =~ /$monthyear_re/ }
 );
 $month_h2 or no_results($year, $month, $day, "Cannot find month header");

 # The sibling after the heading is searched for the day link.  It can be
 # missing or a bare text node, neither of which has a look_down method.
 my $month_list = $month_h2->right;
 ref $month_list or no_results($year, $month, $day, "Cannot find month list");

 my $day_re = strftime('Planning Applications (?:[A-Za-z0-9 ]*?to )?%b[a-z]* ?%e[a-z]', 0, 0, 0, $day, $month-1, $year-1900);

 my ($day_link) = $month_list->look_down(
 	"_tag", "a",
 	sub {  $_[0]->as_text =~ /$day_re/ }
 );
 $day_link or no_results($year, $month, $day, "Cannot find day link");

 my $day_absurl = "http://www.highland.gov.uk".$day_link->attr('href');

 # Fetch before creating the temporary file; get() returns undef on failure.
 my $pdf = get($day_absurl);
 defined $pdf or no_results($year, $month, $day, "Cannot fetch $day_absurl");

 my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
 binmode $fh;	# PDF data is binary
 print $fh $pdf;
 close($fh);

 print "Content-type: text/xml\n\n";
 # Keep the status and clean up first, so a failing helper does not leave
 # the temporary file behind.
 my $status = system "./Highland.pl", $filename, $day_absurl;

 unlink $filename or die "cannot unlink temporary file $filename: $!";
 # Report the helper's wait status (the value system returned).
 $status and die "system failed: $status";
--- a/python_scrapers/Highland.pl
+++ b/python_scrapers/Highland.pl
@@ -0,0 +1,72 @@
 #!/usr/bin/perl -w

 use strict;
 use XML::Writer;

 # Usage: Highland.pl <pdf-file> <info-url>
 # Runs "pdftotext -layout" on the PDF and writes a <planning> XML document
 # to stdout with one <application> per "Ref Number" entry.  The info-url
 # argument is copied verbatim into every application.
 my ($file, $info_url) = @ARGV[0, 1];

 my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

 $writer->xmlDecl("UTF-8");

 $writer->startTag("planning");
 $writer->dataElement("authority_name", "Highland Council");
 $writer->dataElement("authority_short_name", "Highland");
 $writer->startTag("applications");

 open (my $pdftext, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
 while (my $row = <$pdftext>) {
 	# Skip everything up to the next entry header.
 	next unless $row =~ /^\s*Ref Number\s*(\S+)/;

 	my $refno = $1;
 	my ($address, $proposal, $case_officer) = ("", "", "");
 	my $date_received;
 	my $target;		# reference to the field continuation lines append to
 	my $officer_seen;	# set once a "Case Officer" line has been read
 	for (;;) {
 		chomp $row;
 		$row =~ s/^\s+//;
 		$row =~ s/\s+$//;

 		# A recognised label switches (or switches off) the field being
 		# collected; labels matched with s/// are removed from the line.
 		if ($row =~ s/^Location of Works//) {
 			$target = \$address;
 		} elsif ($row =~ s/^Description of Works//) {
 			$target = \$proposal;
 		} elsif ($row =~ s/^Case Officer//) {
 			$target = \$case_officer;
 		} elsif (($row =~ s/^Community Council//) || ($row =~ s/^Applicant Name//) || ($row =~ s/^Applicant Address//)) {
 			$target = undef;
 		} elsif ($row =~ /^Validation Date\s*(\S+)/) {
 			$date_received = $1;
 			$target = undef;
 		}

 		$row =~ s/^\s+//;
 		if (defined $target) {
 			# Space-separate continuation lines within a field.
 			$$target = $$target eq "" ? $row : "$$target $row";
 		}

 		$row = <$pdftext>;
 		last unless defined $row;
 		# After the case officer block, a line holding only its newline
 		# ends the entry.
 		last if $officer_seen && length($row) == 1;
 		$officer_seen = 1 if $row =~ /^\s*Case Officer/;
 	}

 	# First thing in the address that looks like a UK postcode.
 	my $postcode = $address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/ ? $1 : "None";
 	# First thing in the case officer text that looks like an e-mail address.
 	my $comment_url = $case_officer =~ /([A-Za-z0-9\.]+\@[A-Za-z0-9\.]+)/ ? $1 : "None";

 	$writer->startTag("application");
 	$writer->dataElement("council_reference", $refno);
 	$writer->dataElement("address", $address);
 	$writer->dataElement("postcode", $postcode);
 	$writer->dataElement("description", $proposal);
 	$writer->dataElement("info_url", $info_url);
 	$writer->dataElement("comment_url", $comment_url);
 	$writer->dataElement("date_received", $date_received);
 	$writer->endTag;
 }

 # Close <applications> and <planning>, then finish the document.
 $writer->endTag;
 $writer->endTag;
 $writer->end;
--- a/python_scrapers/Ocella.py
+++ b/python_scrapers/Ocella.py
@@ -208,7 +208,7 @@ if __name__ == '__main__':
 #    parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
    parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")
    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")


    print parser.getResults(21,5,2008)
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -31,3 +31,9 @@
 "Maldon.py", "420"
 "Medway.py", "420"
 "Shropshire.py", "420"
 "Brentwood.pl", "493"
 "Brentwood.cgi", "493"
 "Glasgow.pl", "493"
 "Glasgow.cgi", "493"
 "Highland.pl", "493"
 "Highland.cgi", "493"