Преглед на файлове

North Ayrshire, Redbridge: added

import/raw
pcc03@doc.ic.ac.uk преди 16 години
родител
ревизия
e28b89840c
променени са 9 файла, в които са добавени 365 реда и са изтрити 6 реда
  1. +85
    -0
      trunk/python_scrapers/NorthAyrshire.cgi
  2. +66
    -0
      trunk/python_scrapers/NorthAyrshire.pl
  3. +4
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  4. +85
    -0
      trunk/python_scrapers/Redbridge.cgi
  5. +115
    -0
      trunk/python_scrapers/Redbridge.pl
  6. +3
    -1
      wiki/ExistingScrapers.wiki
  7. +2
    -2
      wiki/PlanningAuthoritiesN.wiki
  8. +2
    -2
      wiki/PlanningAuthoritiesR.wiki
  9. +3
    -1
      wiki/ScraperDevelopment.wiki

+ 85
- 0
trunk/python_scrapers/NorthAyrshire.cgi Целия файл

@@ -0,0 +1,85 @@
#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use File::Temp qw(tempfile);
use LWP::Simple;
use POSIX;
use Encode;
use CGI;
use CGI::Carp;

# Validate one CGI date component (year, month or day).
#
# Returns 1 when $var is defined and consists solely of ASCII digits,
# and 0 otherwise.  The pattern is anchored with \A and \z rather than
# ^ and $, because $ also matches just before a trailing newline and
# would therefore accept a value such as "7\n".
sub sanity_check {
    my ($var) = @_;
    return 0 unless defined $var;
    return $var =~ /\A[0-9]+\z/ ? 1 : 0;
}

# Send an empty North Ayrshire feed to STDOUT and then abort.
#
# Arguments: the requested year, month and day, followed by a short
# explanation of why no applications could be returned.  The CGI header
# and an empty <applications> list are printed first; the sub then dies
# with "<year>/<month>/<day> failed: <reason>", so it never returns.
sub no_results {
    my ($year, $month, $day, $why) = @_;

    my $empty_feed = <<EMPTY_FEED;
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>North Ayrshire Council</authority_name>
<authority_short_name>North Ayrshire</authority_short_name>
<applications>
</applications>
</planning>
EMPTY_FEED
    print $empty_feed;

    die "$year/$month/$day failed: $why\n";
}

# Main CGI flow.
#
# 1. Validate the year/month/day query parameters.
# 2. Fetch the council's planning index page and find the link whose text
#    is "Planning Applications Received week ending <dd Month yyyy>".
# 3. Fetch that page and locate the PDF through the image whose alt text
#    ends in ".pdf"; the image's parent element is taken as the link.
# 4. Download the PDF to a temporary file and let ./NorthAyrshire.pl write
#    the XML feed to STDOUT.
my $cgi = CGI->new;

my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $tree = HTML::TreeBuilder->new;
$tree->parse(decode_utf8(get('http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2||') or die "couldn't fetch index page"));
$tree->eof;

# Link text expected for the requested date.
my $re = strftime('Planning Applications Received week ending %d %B %Y', 0, 0, 0, $day, $month-1, $year-1900);

# The title is plain text, so it is quoted before being used as a pattern.
my ($day_link) = $tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /\Q$re\E/i }
);
$day_link or no_results($year, $month, $day, "Cannot find day link");

my $day_absurl = 'http://www.north-ayrshire.gov.uk'.$day_link->attr('href');

my $day_tree = HTML::TreeBuilder->new;
$day_tree->parse(decode_utf8(get($day_absurl) or die "couldn't fetch day page"));
$day_tree->eof;

my ($pdf_img) = $day_tree->look_down(
    "_tag", "img",
    "alt", qr/\.pdf$/i
);
$pdf_img or die "couldn't find pdf image on day page";
my $pdf_link = $pdf_img->parent;
$pdf_link or die "couldn't find pdf link on day page";

my $pdf_absurl = 'http://www.north-ayrshire.gov.uk'.$pdf_link->attr('href');

# Download before creating the temporary file, so that a failed fetch
# leaves nothing behind and still yields a well-formed (empty) feed.
my $pdf_data = get($pdf_absurl);
defined $pdf_data or no_results($year, $month, $day, "Cannot fetch pdf");

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # PDF is binary data
unless ((print $fh $pdf_data) && close($fh)) {
    my $write_err = $!;
    unlink $filename;
    die "cannot write temporary file $filename: $write_err";
}

print "Content-type: text/xml\n\n";
my $status  = system "./NorthAyrshire.pl", $filename, $pdf_absurl;
my $sys_err = $!;

# Remove the temporary file before reporting any failure of the child,
# otherwise a failed conversion would leak the file.
my $unlinked   = unlink $filename;
my $unlink_err = $!;

$status == 0
    or die "system failed: " . ($status == -1 ? $sys_err : "wait status $status");
$unlinked or die "cannot unlink temporary file $filename: $unlink_err";

+ 66
- 0
trunk/python_scrapers/NorthAyrshire.pl Целия файл

@@ -0,0 +1,66 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;
use Date::Parse;
use POSIX;

# Usage: NorthAyrshire.pl PDF_FILE INFO_URL
#
# Runs "pdftotext -layout" on PDF_FILE and writes a <planning> XML feed to
# STDOUT, one <application> per "Application No:" entry found in the text.
# INFO_URL is copied into every entry's <info_url> element.
@ARGV >= 2 or die "usage: $0 PDF_FILE INFO_URL\n";
my $file = $ARGV[0];
my $info_url = $ARGV[1];

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "North Ayrshire Council");
$writer->dataElement("authority_short_name", "North Ayrshire");
$writer->startTag("applications");

open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    if ($line =~ /^\s*Application No:\s*(\S+)/) {
        my $refno = $1;
        my $address = ""; my $proposal = ""; my $date_received = "";

        # Reference to the field currently being accumulated; undef while
        # inside a section that is not exported.
        my $cur_field;
        while (1) {
            chomp $line;
            $line =~ s/^\s+//; $line =~ s/\s+$//;

            # A recognised label switches the target field and is stripped,
            # so only the value remains on the line.
            if ($line =~ s/^Location://) {
                $cur_field = \$address;
            } elsif ($line =~ s/^Description://) {
                $cur_field = \$proposal;
            } elsif ($line =~ s/^Date Registered://) {
                $cur_field = \$date_received;
            } elsif (($line =~ s/^Applicant://) || ($line =~ s/^Agent://) || ($line =~ s/^Ward://)) {
                $cur_field = undef;
            }
            $line =~ s/^\s+//;

            # Continuation lines are joined to the field with single spaces.
            if (defined $cur_field) {
                $$cur_field .= " " if $$cur_field ne "" and $line ne "";
                $$cur_field .= $line;
            }
            last unless defined ($line = <$fh>);
            # NOTE(review): this pattern is lower-case and case-sensitive, so
            # it does not match the "Application No:" heading tested above.
            # Confirm against real pdftotext output which line ends an entry.
            last if $line =~ /^\s*application:/;
        }

        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        # strptime returns an empty list when it cannot parse the string;
        # passing that straight to strftime would abort the script in the
        # middle of the XML document, so such an entry is skipped instead.
        my @date_parts = strptime($date_received);
        unless (@date_parts) {
            warn "skipping application $refno: cannot parse date '$date_received'\n";
            next;
        }
        # Only sec, min, hour, mday, mon and year are handed to strftime;
        # components strptime could not extract default to 0.
        my $norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @date_parts[0..5]);

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", 'dcontrol@north-ayrshire.gov.uk');
        $writer->dataElement("date_received", $norm_date_received);
        $writer->endTag;
    }
}

# Closing the pipe reaps pdftotext; a failure is only reported so that the
# document below is still closed properly.
close($fh)
    or warn($! ? "error closing pdftotext pipe: $!\n" : "pdftotext exited with wait status $?\n");

$writer->endTag;
$writer->endTag;
$writer->end;

+ 4
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Целия файл

@@ -38,3 +38,7 @@
"Highland.cgi", "493"
"Newport.pl", "493"
"Newport.cgi", "493"
"NorthAyrshire.pl", "493"
"NorthAyrshire.cgi", "493"
"Redbridge.pl", "493"
"Redbridge.cgi", "493"

+ 85
- 0
trunk/python_scrapers/Redbridge.cgi Целия файл

@@ -0,0 +1,85 @@
#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use File::Temp qw(tempfile);
use LWP::Simple;
use POSIX;
use Encode;
use CGI;
use CGI::Carp;

# Validate one CGI date component (year, month or day).
#
# Returns 1 when $var is defined and made up only of ASCII digits, and 0
# otherwise.  \A and \z are used instead of ^ and $ because $ would also
# match before a trailing newline, letting "7\n" through.
sub sanity_check {
    my ($var) = @_;
    return 0 unless defined $var;
    return $var =~ /\A[0-9]+\z/ ? 1 : 0;
}

# Send an empty Redbridge feed to STDOUT and then abort.
#
# Arguments: the requested year, month and day, followed by a short
# explanation of why no applications could be returned.  The CGI header
# and an empty <applications> list are printed first; the sub then dies
# with "<year>/<month>/<day> failed: <reason>", so it never returns.
sub no_results {
    my ($year, $month, $day, $why) = @_;

    my $empty_feed = <<EMPTY_FEED;
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>London Borough of Redbridge</authority_name>
<authority_short_name>Redbridge</authority_short_name>
<applications>
</applications>
</planning>
EMPTY_FEED
    print $empty_feed;

    die "$year/$month/$day failed: $why\n";
}

# Main CGI flow.
#
# 1. Validate the year/month/day query parameters.
# 2. Fetch the council's planning page and find the link whose text
#    contains "Planning Applications Received <year>".
# 3. On that year page, find the link whose text matches
#    "Received <day><optional letters> <Month> <year>"; its href is the PDF.
# 4. Download the PDF to a temporary file and let ./Redbridge.pl write the
#    XML feed to STDOUT.
my $cgi = CGI->new;

my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $tree = HTML::TreeBuilder->new;
$tree->parse(decode_utf8(get('http://www.redbridge.gov.uk/cms/environment__planning/planning_and_regeneration/planning_dc.aspx') or die "couldn't fetch index page"));
$tree->eof;

my $re = sprintf('Planning Applications Received %d', $year);

my ($year_link) = $tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /$re/i }
);
$year_link or no_results($year, $month, $day, "Cannot find year link");

my $year_absurl = 'http://www.redbridge.gov.uk'.$year_link->attr('href');

my $year_tree = HTML::TreeBuilder->new;
$year_tree->parse(decode_utf8(get($year_absurl) or die "couldn't fetch year page"));
$year_tree->eof;

# %e yields the day without a leading zero (space padded); "[a-z]*" allows
# letters directly after the day number.  Every run of spaces in the
# template is then turned into \s+ so the amount of whitespace in the
# link text does not matter.
my $day_re = strftime('Received %e[a-z]* %B %Y', 0, 0, 0, $day, $month-1, $year-1900);
$day_re =~ s/ +/\\s+/g;
my ($pdf_link) = $year_tree->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /$day_re/i }
);
$pdf_link or no_results($year, $month, $day, "Cannot find day link");

my $pdf_absurl = 'http://www.redbridge.gov.uk/cms/'.$pdf_link->attr('href');

# Download before creating the temporary file, so that a failed fetch
# leaves nothing behind and still yields a well-formed (empty) feed.
my $pdf_data = get($pdf_absurl);
defined $pdf_data or no_results($year, $month, $day, "Cannot fetch pdf");

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # PDF is binary data
unless ((print $fh $pdf_data) && close($fh)) {
    my $write_err = $!;
    unlink $filename;
    die "cannot write temporary file $filename: $write_err";
}

print "Content-type: text/xml\n\n";
my $status  = system "./Redbridge.pl", $filename, $pdf_absurl;
my $sys_err = $!;

# Remove the temporary file before reporting any failure of the child,
# otherwise a failed conversion would leak the file.
my $unlinked   = unlink $filename;
my $unlink_err = $!;

$status == 0
    or die "system failed: " . ($status == -1 ? $sys_err : "wait status $status");
$unlinked or die "cannot unlink temporary file $filename: $unlink_err";

+ 115
- 0
trunk/python_scrapers/Redbridge.pl Целия файл

@@ -0,0 +1,115 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;
use Date::Parse;
use POSIX;

# Usage: Redbridge.pl PDF_FILE INFO_URL
#
# Runs "pdftotext -layout" on PDF_FILE and writes a <planning> XML feed to
# STDOUT.  Each entry starts at a header line "App.No: Location: Agent";
# the positions of those headings give the column offsets used to cut the
# following lines into a reference column and an address/proposal column.
# INFO_URL is copied into every entry's <info_url> element.
#
# NOTE(review): the page-break marker is written below as \f (form feed).
# In the reviewed copy that character was rendered as a plain space, which
# would strip every space and stop the header pattern from ever matching.
# A form feed is assumed because the inner-loop comment refers to a new
# page; confirm against the file in the repository.
@ARGV >= 2 or die "usage: $0 PDF_FILE INFO_URL\n";
my $file = $ARGV[0];
my $info_url = $ARGV[1];

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "London Borough of Redbridge");
$writer->dataElement("authority_short_name", "Redbridge");
$writer->startTag("applications");

open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    $line =~ s/\f//g;
    chomp $line;
    if ($line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/) {
        # Start offsets of the three column headings on the header line.
        my $ofs_col1 = $-[1];
        my $ofs_col2 = $-[2];
        my $ofs_col3 = $-[3];
        my $col1_full = ""; # sometimes col1 headings break up onto 2 lines so grab the whole thing and match on it
        my $address = ""; my $proposal = "";

        # Column 2 holds the address first and, after an empty column-2
        # line, the proposal; a second empty line ends collection.
        my $cur_field = \$address;
        my $near_end = 0; my $redo_outer = 0;
        while ($line = <$fh>) {
            if ($line =~ s/\f//g) { # alignment may have changed for new page, take care of adjustments of at most 2
                # we may lose a few characters but luckily a page break during an entry doesn't
                # happen very often (only 4 times during 2005-July 2008)
                $ofs_col2 -= 2;
                $ofs_col3 -= 2;
            }

            # The next entry's header ends this one; the outer loop body is
            # re-run on this same line further down.
            if ($line =~ /^\s*App\.No:/) {
                $redo_outer = 1;
                last;
            }

            chomp $line;
            if ($line =~ /^\s{10,}[0-9]+\s*$/) { # a line with a page number... ignore unless near end
                if ($near_end) {
                    last;
                } else {
                    next;
                }
            }
            my $col1; my $col2;
            if (length $line > $ofs_col1) {
                $col1 = substr $line, $ofs_col1, $ofs_col2-$ofs_col1;
            } else {
                $col1 = "";
            }
            if (length $line > $ofs_col2) {
                $col2 = substr $line, $ofs_col2, $ofs_col3-$ofs_col2;
            } else {
                $col2 = "";
            }
            $col2 =~ s/\s{10,}\S.*//g; # remove any obvious spillover text (only needed for a page break during an entry)

            $col1 =~ s/^\s+//; $col1 =~ s/\s+$//;
            $col2 =~ s/^\s+//; $col2 =~ s/\s+$//;

            $col1_full .= " " if $col1_full ne "" and $col1 ne "";
            $col1_full .= $col1;

            if ($col2 eq "") {
                if ($proposal eq "") {
                    $cur_field = \$proposal;
                } else {
                    $cur_field = undef;
                }
            } elsif (defined $cur_field) {
                $$cur_field .= " " if $$cur_field ne "" and $col2 ne "";
                $$cur_field .= $col2;
            }
            last if $near_end and $line =~ /^\s*$/;
            $near_end = 1 if $col1_full =~ /Case Officer/;
        }
        my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/);
        $refno ||= ""; $date_received ||= "";
        $refno =~ s/-? //g;

        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        # An entry whose "Deposit Date:" could not be extracted, or whose
        # date strptime rejects (it then returns an empty list), is skipped:
        # strftime would otherwise produce a meaningless date or abort the
        # script in the middle of the XML document.
        my @date_parts = $date_received ne "" ? strptime($date_received) : ();
        if (@date_parts) {
            # Only sec, min, hour, mday, mon and year are handed to
            # strftime; components strptime could not extract default to 0.
            my $norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } @date_parts[0..5]);

            $writer->startTag("application");
            $writer->dataElement("council_reference", $refno);
            $writer->dataElement("address", $address);
            $writer->dataElement("postcode", $postcode);
            $writer->dataElement("description", $proposal);
            $writer->dataElement("info_url", $info_url);
            $writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk');
            $writer->dataElement("date_received", $norm_date_received);
            $writer->endTag;
        } else {
            warn "skipping entry '$col1_full': no usable deposit date\n";
        }

        # $line already holds the next header, so re-run the outer loop
        # body on it instead of reading a fresh line.
        redo if $redo_outer;
    }
}

# Closing the pipe reaps pdftotext; a failure is only reported so that the
# document below is still closed properly.
close($fh)
    or warn($! ? "error closing pdftotext pipe: $!\n" : "pdftotext exited with wait status $?\n");

$writer->endTag;
$writer->endTag;
$writer->end;

+ 3
- 1
wiki/ExistingScrapers.wiki Целия файл

@@ -32,6 +32,8 @@ Others:
* Maldon (and Pendle)
* Medway
* Newport (from Peter)
* North Ayrshire (from Peter)
* Redbridge (from Peter)
* Shetland Islands
* Shropshire (and North Yorkshire, South Northamptonshire)
* South Oxfordshire
@@ -40,4 +42,4 @@ Others:

Provided directly by the council:

* Wansbeck
* Wansbeck

+ 2
- 2
wiki/PlanningAuthoritiesN.wiki Целия файл

@@ -7,7 +7,7 @@
|| Done || Newham Borough Council || East Ham || PublicAccess || ||
|| Done || Newport City Council || Newport || weekly lists as pdf - Perl scraper from Peter || http://www.newport.gov.uk/_dc/index.cfm?fuseaction=planapps.applist ||
|| || *Norfolk County Council* || Norwich || *(no postcodes)* Minerals and waste only. All current apps in html || http://ptweb.norfolk.gov.uk/planning_applications/read/show.asp?Decision=CURRENT&Type=W&Submit=Submit ||
|| || *North Ayrshire Council* || Irvine || weekly lists as pdf || http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2|| ||
|| Done || North Ayrshire Council || Irvine || weekly lists as pdf - Perl scraper from Peter || http://www.north-ayrshire.gov.uk/na/Home.nsf/OtherMenuPage?ReadForm&MenuType=Environment-Planning&DocDisplay=NoDoc&CatLevel=2|| ||
|| || *North Cornwall District Council* || Bodmin, Wadebridge || searchable by date || http://onlineplanning.ncdc.gov.uk/eaccess/Planning/Planning-application-and-property-Search/Application-Search/Application-Search.asp ||
|| || *North Devon District Council* || Barnstaple || searchable by date || http://planning.northdevon.gov.uk/search.asp ||
|| || *North Dorset District Council* || Blandford Forum || weekly lists as pdf || http://www.north-dorset.gov.uk/index/living/building_planning/development_control/planning_applications-3/view_application.htm ||
@@ -33,4 +33,4 @@
|| Done || Norwich City Council || Norwich || PublicAccess|| ||
|| Done || Nottingham City Council || Nottingham || [WAM] || http://plan4.nottinghamcity.gov.uk/WAM/search/pas/index.html||
|| || *Nottinghamshire County Council* || Nottingham, West Bridgford || No date search, but there is an apps open for consultation page. || http://www.nottinghamcity.gov.uk/planning/Report/weeksrch.html ||
|| || *Nuneaton & Bedworth Borough Council* || Nuneaton || searchable by date || http://www.nuneatonandbedworth.gov.uk/environment-planning/planning/planning-applications/search-planning-applications ||
|| || *Nuneaton & Bedworth Borough Council* || Nuneaton || searchable by date || http://www.nuneatonandbedworth.gov.uk/environment-planning/planning/planning-applications/search-planning-applications ||

+ 2
- 2
wiki/PlanningAuthoritiesR.wiki Целия файл

@@ -1,5 +1,5 @@
|| Done || Reading || Reading || PublicAccess || http://planning.reading.gov.uk/publicaccess/default.aspx ||
|| || *Redbridge* || Ilford || Weekly lists as pdf || http://www.redbridge.gov.uk/planning/Planappsrec.cfm ||
|| Done || Redbridge || Ilford || Weekly lists as pdf - Perl scraper from Peter || http://www.redbridge.gov.uk/cms/environment__planning/planning_and_regeneration/planning_dc.aspx ||
|| || *Redcar & Cleveland* || Redcar || list of all current apps as html (*No postcodes* in location, only in applicant details, which may be a different address) || http://www.redcar-cleveland.gov.uk/Planning.nsf/ViewWebApplicationsByDate?readform ||
|| Done || Redditch || Redditch || PublicAccess || http://access.redditchbc.gov.uk/publicaccess/default.aspx ||
|| || *Reigate & Banstead* || Reigate || Planit - searchable by date || http://www.reigate-banstead.gov.uk/planit03.asp ||
@@ -21,4 +21,4 @@
|| Done || Rutland || Oakham || RutlandLike || http://www.rutland.gov.uk/wellandplanning/searchparam.asp ||
|| Done || Ryedale || Malton || PlanetApplicationEnquiry || http://www.ryedale.gov.uk/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry ||

+ 3
- 1
wiki/ScraperDevelopment.wiki Целия файл

@@ -3,6 +3,8 @@
* Conwy
* Merton
* Newport
* North Ayrshire
* Redbridge

= Scrapers being worked on =

@@ -33,4 +35,4 @@
* Newcastle (not timing out today) - forbidden.
* Stratford - connection refused
* Watford - site is down

Зареждане…
Отказ
Запис