From 9a3b38c3256c9c656330ad2907e12090f2c624ba Mon Sep 17 00:00:00 2001
From: "duncan.parkes" <duncan.parkes@cf68f80a-222c-0410-aabe-5f79d0504a29>
Date: Thu, 17 Jul 2008 10:46:43 +0000
Subject: [PATCH] Add Peter's scrapers for Highland, Glasgow, and Brentwood.

---
 python_scrapers/Brentwood.cgi        | 55 +++++++++++++++++++
 python_scrapers/Brentwood.pl         | 72 ++++++++++++++++++++++++
 python_scrapers/Glasgow.cgi          | 54 ++++++++++++++++++
 python_scrapers/Glasgow.pl           | 63 +++++++++++++++++++++
 python_scrapers/Highland.cgi         | 82 ++++++++++++++++++++++++++++
 python_scrapers/Highland.pl          | 72 ++++++++++++++++++++++++
 python_scrapers/Ocella.py            |  2 +-
 python_scrapers/OtherFilesToCopy.csv |  6 ++
 8 files changed, 405 insertions(+), 1 deletion(-)
 create mode 100644 python_scrapers/Brentwood.cgi
 create mode 100644 python_scrapers/Brentwood.pl
 create mode 100644 python_scrapers/Glasgow.cgi
 create mode 100644 python_scrapers/Glasgow.pl
 create mode 100644 python_scrapers/Highland.cgi
 create mode 100644 python_scrapers/Highland.pl
diff --git a/python_scrapers/Brentwood.cgi b/python_scrapers/Brentwood.cgi
new file mode 100644
index 0000000..38b5f79
--- /dev/null
+++ b/python_scrapers/Brentwood.cgi
@@ -0,0 +1,75 @@
+#!/usr/bin/perl -w
+
+# CGI wrapper for the Brentwood Borough Council planning scraper.
+#
+# Takes numeric "year", "month" and "day" query parameters, looks on the
+# council's index page for a PDF link followed by that date, downloads the
+# PDF to a temporary file and runs ./Brentwood.pl on it to print the XML.
+
+use strict;
+
+use LWP::Simple;
+use File::Temp qw(tempfile);
+use POSIX;
+use CGI;
+
+my $cgi = CGI->new;
+
+my $year = $cgi->param("year");
+my $month = $cgi->param("month");
+my $day = $cgi->param("day");
+
+# All three parameters must be present and consist of digits only; they are
+# used as numbers by strftime/sprintf below.
+unless (3 == grep { defined($_) && /^[0-9]+$/ } $year, $month, $day) {
+	print <<ERROR;
+Content-type: text/plain
+
+Need year, month, day parameters
+ERROR
+	exit 0;
+}
+
+my $html = get('http://www.brentwood-council.gov.uk/index.php?cid=573');
+
+# The date in the form it is searched for on the index page.
+my $date = strftime("%d %B %Y", 0, 0, 0, $day, $month-1, $year-1900);
+
+# quick and dirty: capture the PDF link that is followed (possibly after
+# some tags) by the requested date.  get() returns undef on failure, so a
+# failed index fetch is handled like "no link found".
+my ($url) = defined($html)
+	? ($html =~ /(http:\/\/[^"]*\.pdf)[^<]*(<[^>]*>)*[^<]*\Q$date\E/)
+	: ();
+my $pdf = defined($url) ? get($url) : undef;
+
+# Nothing to parse (no link for that date, or a download failed): reply
+# with an empty but well-formed result.
+unless (defined $pdf) {
+	print <<NIL;
+Content-type: text/xml
+
+<?xml version="1.0" encoding="UTF-8"?>
+<planning>
+  <authority_name>Brentwood Borough Council</authority_name>
+  <authority_short_name>Brentwood</authority_short_name>
+  <applications>
+  </applications>
+</planning>
+NIL
+	exit 0;
+}
+
+# Passed to Brentwood.pl, which writes it as each application's date_received.
+my $dmy = sprintf("%02d/%02d/%04d", $day, $month, $year);
+
+my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
+binmode $fh;
+print $fh $pdf;
+close($fh);
+
+print "Content-type: text/xml\n\n";
+system("./Brentwood.pl", $filename, $url, $dmy) == 0
+	or warn "Brentwood.pl failed: $?\n";
+
+unlink $filename;
diff --git a/python_scrapers/Brentwood.pl b/python_scrapers/Brentwood.pl
new file mode 100644
index 0000000..0ef848e
--- /dev/null
+++ b/python_scrapers/Brentwood.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/perl -w
+
+# Converts a Brentwood planning list PDF into planning XML on stdout.
+#
+# Usage: Brentwood.pl <pdf file> <info url> <date received>
+#
+# The PDF is run through "pdftotext -layout"; every line containing
+# "Address:" starts a record that runs until the next empty line.  The info
+# url and date are copied unchanged into every application.
+
+use strict;
+use XML::Writer;
+
+my $file = $ARGV[0];
+my $info_url = $ARGV[1];
+my $date = $ARGV[2];
+
+my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);
+
+$writer->xmlDecl("UTF-8");
+
+$writer->startTag("planning");
+$writer->dataElement("authority_name", "Brentwood Borough Council");
+$writer->dataElement("authority_short_name", "Brentwood");
+$writer->startTag("applications");
+
+# List-form pipe open: the file name is passed to pdftotext as a single
+# argument without going through a shell.
+open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
+while (my $line = <$fh>) {
+	chomp $line;
+	# Strip form feeds (presumably pdftotext's page breaks); \f is spelled out
+	# because an empty pattern would re-use the last successful match.
+	$line =~ s/\f//g;
+	if ($line =~ /Address:/) {
+		# The column where "Address:" starts splits each line into the
+		# reference number (left) and the labelled fields (right).
+		my $ofs_col2 = $-[0];
+		my $refno = substr $line, 0, $ofs_col2;
+		$refno =~ s/ +$//g;
+		my $address = ""; my $proposal = "";
+		my $cur_field;	# reference to the field being appended to, if any
+		while (1) {
+			if (length($line) > $ofs_col2) {
+				my $col2 = substr $line, $ofs_col2;
+				$col2 =~ s/^ +//;
+				# A label switches the current field; the patterns also
+				# accept a label missing up to three leading characters.
+				if ($col2 =~ s/^((A?d)?d)?ress://) {
+					$cur_field = \$address;
+				} elsif ($col2 =~ s/^((P?r)?o)?posal://) {
+					$cur_field = \$proposal;
+				} elsif ($col2 =~ s/^((A?p)?p)?licant://) {
+					$cur_field = undef;
+				} elsif ($col2 =~ s/^((A?g)?e)?nt://) {
+					$cur_field = undef;
+				}
+				$col2 =~ s/^ +//; $col2 =~ s/ +$//;
+				# Unlabelled lines continue the current field.
+				if (defined $cur_field) {
+					$$cur_field .= " " if $$cur_field ne "";
+					$$cur_field .= $col2;
+				}
+			}
+			last unless defined ($line = <$fh>);
+			chomp $line;
+			$line =~ s/\f//g;
+			last if length $line == 0;
+		}
+		# First thing in the address that looks like a UK postcode.
+		my $postcode = "None";
+		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
+			$postcode = $1;
+		}
+
+		$writer->startTag("application");
+		$writer->dataElement("council_reference", $refno);
+		$writer->dataElement("address", $address);
+		$writer->dataElement("postcode", $postcode);
+		$writer->dataElement("description", $proposal);
+		$writer->dataElement("info_url", $info_url);
+		$writer->dataElement("comment_url", "planning\@brentwood.gov.uk");
+		$writer->dataElement("date_received", $date);
+		$writer->endTag;
+	}
+}
+
+# close() on a pipe waits for pdftotext and returns false if it failed.
+my $pdftotext_ok = close($fh);
+my $pdftotext_status = $?;
+
+$writer->endTag;
+$writer->endTag;
+$writer->end;
+
+# The document above is always completed; a pdftotext failure is reported
+# through the exit status and stderr.
+$pdftotext_ok or die "pdftotext failed on $file (status $pdftotext_status)\n";
diff --git a/python_scrapers/Glasgow.cgi b/python_scrapers/Glasgow.cgi
new file mode 100644
index 0000000..20fa4af
--- /dev/null
+++ b/python_scrapers/Glasgow.cgi
@@ -0,0 +1,74 @@
+#!/usr/bin/perl -w
+
+# CGI wrapper for the Glasgow City Council planning scraper.
+#
+# Takes numeric "year", "month" and "day" query parameters, looks on the
+# council's index page for a PDF link labelled with a date range ending on
+# that date, downloads the PDF to a temporary file and runs ./Glasgow.pl on
+# it to print the XML.
+
+use strict;
+
+use LWP::Simple;
+use File::Temp qw(tempfile);
+use POSIX;
+use CGI;
+
+my $cgi = CGI->new;
+
+my $year = $cgi->param("year");
+my $month = $cgi->param("month");
+my $day = $cgi->param("day");
+
+# All three parameters must be present and consist of digits only; they are
+# used as numbers by sprintf below.
+unless (3 == grep { defined($_) && /^[0-9]+$/ } $year, $month, $day) {
+	print <<ERROR;
+Content-type: text/plain
+
+Need year, month, day parameters
+ERROR
+	exit 0;
+}
+
+my $html = get('http://www.glasgow.gov.uk/en/Business/Planning_Development/DevelopmentControl/Sitehistorysearches/');
+
+# The end date in the form it is searched for on the index page: dd/mm/yy.
+my $date = sprintf("%02d/%02d/%02d", $day, $month, $year % 100);
+
+# quick and dirty: capture the site-relative PDF link that is followed,
+# before the next tag, by "dd/mm/yy - <requested date>".  get() returns
+# undef on failure, so a failed index fetch is handled like "no link found".
+my ($url) = defined($html)
+	? ($html =~ /href="(\/[^"]*\.pdf)[^<]*[0-9]{2}\/[0-9]{2}\/[0-9]{2} - \Q$date\E/)
+	: ();
+my $absurl = defined($url) ? "http://www.glasgow.gov.uk$url" : undef;
+my $pdf = defined($absurl) ? get($absurl) : undef;
+
+# Nothing to parse (no link for that date, or a download failed): reply
+# with an empty but well-formed result.
+unless (defined $pdf) {
+	print <<NIL;
+Content-type: text/xml
+
+<?xml version="1.0" encoding="UTF-8"?>
+<planning>
+  <authority_name>Glasgow City Council</authority_name>
+  <authority_short_name>Glasgow</authority_short_name>
+  <applications>
+  </applications>
+</planning>
+NIL
+	exit 0;
+}
+
+my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
+binmode $fh;
+print $fh $pdf;
+close($fh);
+
+print "Content-type: text/xml\n\n";
+system("./Glasgow.pl", $filename, $absurl) == 0
+	or warn "Glasgow.pl failed: $?\n";
+
+unlink $filename;
diff --git a/python_scrapers/Glasgow.pl b/python_scrapers/Glasgow.pl
new file mode 100644
index 0000000..470b4c9
--- /dev/null
+++ b/python_scrapers/Glasgow.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl -w
+
+# Converts a Glasgow planning list PDF into planning XML on stdout.
+#
+# Usage: Glasgow.pl <pdf file> <info url>
+#
+# The PDF is run through "pdftotext -layout"; every line starting with
+# "Reference:" opens a record that runs up to a "Map Reference:" line.  The
+# info url is copied unchanged into every application.
+
+use strict;
+use XML::Writer;
+
+my $file = $ARGV[0];
+my $info_url = $ARGV[1];
+
+my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);
+
+$writer->xmlDecl("UTF-8");
+
+$writer->startTag("planning");
+$writer->dataElement("authority_name", "Glasgow City Council");
+$writer->dataElement("authority_short_name", "Glasgow");
+$writer->startTag("applications");
+
+# List-form pipe open: the file name is passed to pdftotext as a single
+# argument without going through a shell.
+open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
+while (my $line = <$fh>) {
+	if ($line =~ /^\s*Reference:\s*(\S+)/) {
+		my $refno = $1;
+		my $address = ""; my $proposal = ""; my $date_received;
+		my $cur_field;	# reference to the field being appended to, if any
+		while (1) {
+			chomp $line;
+			$line =~ s/^\s+//; $line =~ s/\s+$//;
+			# A label at the start of the line selects which field (if
+			# any) this and the following unlabelled lines are added to.
+			if ($line =~ s/^Address://) {
+				$cur_field = \$address;
+			} elsif ($line =~ s/^Proposal://) {
+				$cur_field = \$proposal;
+			} elsif ($line =~ /^Date Received:\s*(\S+)/) {
+				$date_received = $1;
+				# Dots in the date are turned into slashes.
+				$date_received =~ s#\.#/#g;
+				$cur_field = undef;
+			}
+			$line =~ s/^\s+//;
+			if (defined $cur_field) {
+				$$cur_field .= " " if $$cur_field ne "";
+				$$cur_field .= $line;
+			}
+			last if $line =~ /Map Reference:/;
+			last unless defined ($line = <$fh>);
+		}
+		# First thing in the address that looks like a UK postcode.
+		my $postcode = "None";
+		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
+			$postcode = $1;
+		}
+
+		$writer->startTag("application");
+		$writer->dataElement("council_reference", $refno);
+		$writer->dataElement("address", $address);
+		$writer->dataElement("postcode", $postcode);
+		$writer->dataElement("description", $proposal);
+		$writer->dataElement("info_url", $info_url);
+		$writer->dataElement("comment_url", "planning.representations\@drs.glasgow.gov.uk");
+		$writer->dataElement("date_received", $date_received);
+		$writer->endTag;
+	}
+}
+
+# close() on a pipe waits for pdftotext and returns false if it failed.
+my $pdftotext_ok = close($fh);
+my $pdftotext_status = $?;
+
+$writer->endTag;
+$writer->endTag;
+$writer->end;
+
+# The document above is always completed; a pdftotext failure is reported
+# through the exit status and stderr.
+$pdftotext_ok or die "pdftotext failed on $file (status $pdftotext_status)\n";
diff --git a/python_scrapers/Highland.cgi b/python_scrapers/Highland.cgi
new file mode 100644
index 0000000..f7f77d8
--- /dev/null
+++ b/python_scrapers/Highland.cgi
@@ -0,0 +1,104 @@
+#!/usr/bin/perl -w
+
+# CGI wrapper for the Highland Council planning scraper.
+#
+# Takes numeric "year", "month" and "day" query parameters, finds the
+# weekly list PDF for that date on the council's index page, downloads it to
+# a temporary file and runs ./Highland.pl on it to print the XML.
+
+use strict;
+use HTML::TreeBuilder;
+use File::Temp qw(tempfile);
+use LWP::Simple;
+use POSIX;
+use Encode;
+use CGI;
+use CGI::Carp;
+
+# Returns 1 if the argument is defined and consists of digits only, else 0.
+sub sanity_check {
+	my ($var) = @_;
+	defined $var or return 0;
+	$var =~ /^[0-9]+$/ or return 0;
+	return 1;
+}
+
+# Prints an empty result document as the response, then dies with the given
+# reason (never returns).
+sub no_results {
+	my ($y, $m, $d, $reason) = @_;
+	print <<NIL;
+Content-type: text/xml
+
+<?xml version="1.0" encoding="UTF-8"?>
+<planning>
+  <authority_name>Highland Council</authority_name>
+  <authority_short_name>Highland</authority_short_name>
+  <applications>
+  </applications>
+</planning>
+NIL
+	die "$y/$m/$d failed: $reason\n";
+}
+
+my $cgi = CGI->new;
+
+my $year = $cgi->param("year");
+my $month = $cgi->param("month");
+my $day = $cgi->param("day");
+
+unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
+	print <<ERROR;
+Content-type: text/plain
+
+Need year, month, day parameters
+ERROR
+	exit 0;
+}
+
+my $tree = HTML::TreeBuilder->new;
+# $tree->parse_file('weekly-planning-bw-lists.htm');
+$tree->parse(decode_utf8(get('http://www.highland.gov.uk/yourenvironment/planning/planningapplications/weekly-planning-bw-lists.htm') or die "couldn't fetch index page"));
+$tree->eof;
+
+# Matches the heading of the requested month; month name and year may be
+# separated by a plain or a non-breaking space.
+my $monthyear_re = strftime('%B[ \xa0]%Y', 0, 0, 0, 1, $month-1, $year-1900);
+
+my ($month_h2) = $tree->look_down(
+	"_tag", "h2",
+	sub {  $_[0]->as_text =~ /$monthyear_re/ }
+);
+$month_h2 or no_results($year, $month, $day, "Cannot find month header");
+
+# The heading's right sibling is searched for the link of the requested day.
+my $month_list = $month_h2->right;
+
+# Matches link text naming the requested day, optionally as the end of a
+# "... to <month> <day>" range.
+my $day_re = strftime('Planning Applications (?:[A-Za-z0-9 ]*?to )?%b[a-z]* ?%e[a-z]', 0, 0, 0, $day, $month-1, $year-1900);
+
+my ($day_link) = $month_list->look_down(
+	"_tag", "a",
+	sub {  $_[0]->as_text =~ /$day_re/ }
+);
+$day_link or no_results($year, $month, $day, "Cannot find day link");
+
+my $day_absurl = "http://www.highland.gov.uk".$day_link->attr('href');
+
+# get() returns undef on failure; do not hand an empty file to the parser.
+my $pdf = get($day_absurl);
+defined $pdf or no_results($year, $month, $day, "Cannot fetch $day_absurl");
+
+my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
+binmode $fh;
+print $fh $pdf;
+close($fh);
+
+print "Content-type: text/xml\n\n";
+my $status = system("./Highland.pl", $filename, $day_absurl);
+
+# Remove the temporary file before reporting a parser failure, so that a
+# failure does not leave the PDF behind.
+unlink $filename or die "cannot unlink temporary file $filename: $!";
+$status and die "system failed: $status";
diff --git a/python_scrapers/Highland.pl b/python_scrapers/Highland.pl
new file mode 100644
index 0000000..c70edc4
--- /dev/null
+++ b/python_scrapers/Highland.pl
@@ -0,0 +1,93 @@
+#!/usr/bin/perl -w
+
+# Converts a Highland weekly planning list PDF into planning XML on stdout.
+#
+# Usage: Highland.pl <pdf file> <info url>
+#
+# The PDF is run through "pdftotext -layout"; every line starting with
+# "Ref Number" opens a record that ends at the first blank line after its
+# "Case Officer" line.  The info url is copied into every application.
+
+use strict;
+use XML::Writer;
+
+my $file = $ARGV[0];
+my $info_url = $ARGV[1];
+
+my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);
+
+$writer->xmlDecl("UTF-8");
+
+$writer->startTag("planning");
+$writer->dataElement("authority_name", "Highland Council");
+$writer->dataElement("authority_short_name", "Highland");
+$writer->startTag("applications");
+
+open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
+while (my $line = <$fh>) {
+	if ($line =~ /^\s*Ref Number\s*(\S+)/) {
+		my $refno = $1;
+		my $address = ""; my $proposal = ""; my $case_officer = ""; my $date_received;
+		my $cur_field;	# reference to the field being appended to, if any
+		my $near_end;	# set once the "Case Officer" line has been read
+		while (1) {
+			chomp $line;
+			$line =~ s/^\s+//; $line =~ s/\s+$//;
+			# A label at the start of the line selects which field (if
+			# any) this and the following unlabelled lines are added to.
+			if ($line =~ s/^Location of Works//) {
+				$cur_field = \$address;
+			} elsif ($line =~ s/^Description of Works//) {
+				$cur_field = \$proposal;
+			} elsif ($line =~ s/^Case Officer//) {
+				$cur_field = \$case_officer;
+			} elsif (($line =~ s/^Community Council//) || ($line =~ s/^Applicant Name//) || ($line =~ s/^Applicant Address//)) {
+				$cur_field = undef;
+			} elsif ($line =~ /^Validation Date\s*(\S+)/) {
+				$date_received = $1;
+				$cur_field = undef;
+			}
+			$line =~ s/^\s+//;
+			if (defined $cur_field) {
+				$$cur_field .= " " if $$cur_field ne "";
+				$$cur_field .= $line;
+			}
+			last unless defined ($line = <$fh>);
+			# $line still has its newline here, so length 1 is a blank line.
+			last if $near_end && length $line == 1;
+			$near_end = 1 if $line =~ /^\s*Case Officer/;
+		}
+		# First thing in the address that looks like a UK postcode.
+		my $postcode = "None";
+		if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
+			$postcode = $1;
+		}
+		# The case officer text is searched for an e-mail address.
+		my $comment_url = "None";
+		if ($case_officer =~ /([A-Za-z0-9\.]+\@[A-Za-z0-9\.]+)/) {
+			$comment_url = $1;
+		}
+
+		$writer->startTag("application");
+		$writer->dataElement("council_reference", $refno);
+		$writer->dataElement("address", $address);
+		$writer->dataElement("postcode", $postcode);
+		$writer->dataElement("description", $proposal);
+		$writer->dataElement("info_url", $info_url);
+		$writer->dataElement("comment_url", $comment_url);
+		$writer->dataElement("date_received", $date_received);
+		$writer->endTag;
+	}
+}
+
+# close() on a pipe waits for pdftotext and returns false if it failed.
+my $pdftotext_ok = close($fh);
+my $pdftotext_status = $?;
+
+$writer->endTag;
+$writer->endTag;
+$writer->end;
+
+# The document above is always completed; a pdftotext failure is reported
+# through the exit status and stderr.
+$pdftotext_ok or die "pdftotext failed on $file (status $pdftotext_status)\n";
diff --git a/python_scrapers/Ocella.py b/python_scrapers/Ocella.py
index eef26b6..924a349 100644
--- a/python_scrapers/Ocella.py
+++ b/python_scrapers/Ocella.py
@@ -208,7 +208,7 @@ if __name__ == '__main__':
 #    parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
     parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 #    parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
-#    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")
+    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")
 
 
     print parser.getResults(21,5,2008)
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index c4994cd..55b8b0a 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -31,3 +31,9 @@
 "Maldon.py", "420"
 "Medway.py", "420"
 "Shropshire.py", "420"
+"Brentwood.pl", "493"
+"Brentwood.cgi", "493"
+"Glasgow.pl", "493"
+"Glasgow.cgi", "493"
+"Highland.pl", "493"
+"Highland.cgi", "493"