Procházet zdrojové kódy

Add Peter's scrapers for Highland, Glasgow, and Brentwood.

master
duncan.parkes před 16 roky
rodič
revize
9a3b38c325
8 změnil soubory, kde provedl 405 přidání a 1 odebrání
  1. +55
    -0
      python_scrapers/Brentwood.cgi
  2. +72
    -0
      python_scrapers/Brentwood.pl
  3. +54
    -0
      python_scrapers/Glasgow.cgi
  4. +63
    -0
      python_scrapers/Glasgow.pl
  5. +82
    -0
      python_scrapers/Highland.cgi
  6. +72
    -0
      python_scrapers/Highland.pl
  7. +1
    -1
      python_scrapers/Ocella.py
  8. +6
    -0
      python_scrapers/OtherFilesToCopy.csv

+ 55
- 0
python_scrapers/Brentwood.cgi Zobrazit soubor

@@ -0,0 +1,55 @@
#!/usr/bin/perl -w

use strict;

use LWP::Simple;
use File::Temp qw(tempfile);
use POSIX;
use CGI;

# CGI entry point for the Brentwood scraper: finds the PDF that the council
# index page links next to the requested date, saves it to a temporary file
# and runs ./Brentwood.pl on it, which prints the XML body to stdout.
my $cgi = CGI->new;

my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

# The three values go straight into strftime/sprintf below, so require plain
# digits rather than merely "defined"; anything else gets the usage message.
if (grep { !defined $_ || $_ !~ /\A[0-9]+\z/ } $year, $month, $day) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

# Prints the "no applications" document used whenever nothing can be scraped.
my $print_empty_result = sub {
    print <<NIL;
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>Brentwood Borough Council</authority_name>
<authority_short_name>Brentwood</authority_short_name>
<applications>
</applications>
</planning>
NIL
};

# LWP::Simple::get returns undef when the fetch fails.
my $html = get('http://www.brentwood-council.gov.uk/index.php?cid=573');

# Date as it is matched against the index page, e.g. "21 May 2008".
my $date = strftime("%d %B %Y", 0, 0, 0, $day, $month-1, $year-1900);

# quick and dirty: first PDF link that is followed (across any tags) by the
# formatted date. \Q...\E keeps the date text literal inside the pattern.
my ($url) = defined $html
    ? ($html =~ /(http:\/\/[^"]*\.pdf)[^<]*(<[^>]*>)*[^<]*\Q$date\E/)
    : ();

# No index page, no matching link, or an unfetchable PDF all mean there is
# nothing to report for this date.
my $pdf = defined $url ? get($url) : undef;
unless (defined $pdf) {
    $print_empty_result->();
    exit 0;
}

# Date in the form passed on to Brentwood.pl as its third argument.
my $dmy = sprintf("%02d/%02d/%04d", $day, $month, $year);

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # the PDF is binary data
print $fh $pdf;
close($fh);

print "Content-type: text/xml\n\n";
# List-form system: no shell is involved, arguments are passed verbatim.
system("./Brentwood.pl", $filename, $url, $dmy) == 0
    or warn "Brentwood.pl failed: $?\n";

unlink $filename;

+ 72
- 0
python_scrapers/Brentwood.pl Zobrazit soubor

@@ -0,0 +1,72 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;

# Converts a Brentwood planning-list PDF into planning XML on stdout.
#   $ARGV[0]  path of the PDF to read
#   $ARGV[1]  URL written to every application's info_url
#   $ARGV[2]  date written to every application's date_received
my $file = $ARGV[0];
my $info_url = $ARGV[1];
my $date = $ARGV[2];

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "Brentwood Borough Council");
$writer->dataElement("authority_short_name", "Brentwood");
$writer->startTag("applications");

# List-form pipe open: pdftotext is exec'd directly, so the file name is
# never interpreted by a shell, and a failure to start it is reported.
open(my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    chomp $line;
    # NOTE(review): as it appears here this pattern strips every space, which
    # would defeat the column-offset logic below. It may originally have held
    # a literal control character (e.g. a form feed) -- confirm against the
    # repository copy.
    $line =~ s/ //g;
    if ($line =~ /Address:/) {
        # Offset at which "Address:" starts; everything left of it on this
        # line is the reference number, everything from it on is column two.
        my $ofs_col2 = $-[0];
        my $refno = substr $line, 0, $ofs_col2;
        $refno =~ s/ +$//g;
        my $address = ""; my $proposal = "";
        # Reference to the field currently being accumulated; undef means
        # the text of this section is discarded.
        my $cur_field;
        while (1) {
            if (length($line) > $ofs_col2) {
                my $col2 = substr $line, $ofs_col2;
                $col2 =~ s/^ +//;
                # The optional leading letters let a label still match when
                # up to three of its first characters are missing from the
                # column-two slice.
                if ($col2 =~ s/^((A?d)?d)?ress://) {
                    $cur_field = \$address;
                } elsif ($col2 =~ s/^((P?r)?o)?posal://) {
                    $cur_field = \$proposal;
                } elsif ($col2 =~ s/^((A?p)?p)?licant://) {
                    $cur_field = undef;
                } elsif ($col2 =~ s/^((A?g)?e)?nt://) {
                    $cur_field = undef;
                }
                $col2 =~ s/^ +//; $col2 =~ s/ +$//;
                if (defined $cur_field) {
                    # Continuation lines are joined with a single space.
                    $$cur_field .= " " if $$cur_field ne "";
                    $$cur_field .= $col2;
                }
            }
            # The record ends at end of input or at the first empty line.
            last unless defined ($line = <$fh>);
            chomp $line;
            # Same cleanup as on the record's first line (see note above).
            $line =~ s/ //g;
            last if length $line == 0;
        }
        # First postcode-shaped token in the address, or the string "None".
        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", "planning\@brentwood.gov.uk");
        $writer->dataElement("date_received", $date);
        $writer->endTag;
    }
}

# Close <applications> and <planning>, then finish the document.
$writer->endTag;
$writer->endTag;
$writer->end;

+ 54
- 0
python_scrapers/Glasgow.cgi Zobrazit soubor

@@ -0,0 +1,54 @@
#!/usr/bin/perl -w

use strict;

use LWP::Simple;
use File::Temp qw(tempfile);
use POSIX;
use CGI;

# CGI entry point for the Glasgow scraper: finds the PDF whose link text on
# the council index page ends with the requested date, saves it to a
# temporary file and runs ./Glasgow.pl on it, which prints the XML body.
my $cgi = CGI->new;

my $year  = $cgi->param("year");
my $month = $cgi->param("month");
my $day   = $cgi->param("day");

# The three values are formatted with %02d below, so require plain digits
# rather than merely "defined"; anything else gets the usage message.
if (grep { !defined $_ || $_ !~ /\A[0-9]+\z/ } $year, $month, $day) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

# Prints the "no applications" document used whenever nothing can be scraped.
my $print_empty_result = sub {
    print <<NIL;
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>Glasgow City Council</authority_name>
<authority_short_name>Glasgow</authority_short_name>
<applications>
</applications>
</planning>
NIL
};

# LWP::Simple::get returns undef when the fetch fails.
my $html = get('http://www.glasgow.gov.uk/en/Business/Planning_Development/DevelopmentControl/Sitehistorysearches/');

# Two-digit day/month/year, the form matched in the link text below.
my $date = sprintf("%02d/%02d/%02d", $day, $month, $year % 100);

# quick and dirty: a site-relative PDF link followed by "dd/mm/yy - <date>".
# \Q...\E keeps the date text literal inside the pattern.
my ($url) = defined $html
    ? ($html =~ /href="(\/[^"]*\.pdf)[^<]*[0-9]{2}\/[0-9]{2}\/[0-9]{2} - \Q$date\E/)
    : ();
my $absurl = defined $url ? "http://www.glasgow.gov.uk$url" : undef;

# No index page, no matching link, or an unfetchable PDF all mean there is
# nothing to report for this date.
my $pdf = defined $absurl ? get($absurl) : undef;
unless (defined $pdf) {
    $print_empty_result->();
    exit 0;
}

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # the PDF is binary data
print $fh $pdf;
close($fh);

print "Content-type: text/xml\n\n";
# List-form system: no shell is involved, arguments are passed verbatim.
system("./Glasgow.pl", $filename, $absurl) == 0
    or warn "Glasgow.pl failed: $?\n";

unlink $filename;

+ 63
- 0
python_scrapers/Glasgow.pl Zobrazit soubor

@@ -0,0 +1,63 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;

# Converts a Glasgow planning-list PDF into planning XML on stdout.
#   $ARGV[0]  path of the PDF to read
#   $ARGV[1]  URL written to every application's info_url
my $file = $ARGV[0];
my $info_url = $ARGV[1];

my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$writer->xmlDecl("UTF-8");

$writer->startTag("planning");
$writer->dataElement("authority_name", "Glasgow City Council");
$writer->dataElement("authority_short_name", "Glasgow");
$writer->startTag("applications");

# List-form pipe open: pdftotext is exec'd directly, so the file name is
# never interpreted by a shell, and a failure to start it is reported.
open(my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
while (my $line = <$fh>) {
    # A record starts at a "Reference:" line; the first non-blank token
    # after the label is the council reference.
    if ($line =~ /^\s*Reference:\s*(\S+)/) {
        my $refno = $1;
        my $address = ""; my $proposal = ""; my $date_received;
        # Reference to the field currently being accumulated; undef means
        # the line's text is discarded.
        my $cur_field;
        while (1) {
            chomp $line;
            $line =~ s/^\s+//; $line =~ s/\s+$//;
            if ($line =~ s/^Address://) {
                $cur_field = \$address;
            } elsif ($line =~ s/^Proposal://) {
                $cur_field = \$proposal;
            } elsif ($line =~ /^Date Received:\s*(\S+)/) {
                $date_received = $1;
                # Dots in the date are rewritten to slashes.
                $date_received =~ s#\.#/#g;
                $cur_field = undef;
            }
            $line =~ s/^\s+//;
            if (defined $cur_field) {
                # Continuation lines are joined with a single space.
                $$cur_field .= " " if $$cur_field ne "";
                $$cur_field .= $line;
            }
            # The record ends at the "Map Reference:" line or at end of input.
            last if $line =~ /Map Reference:/;
            last unless defined ($line = <$fh>);
        }
        # First postcode-shaped token in the address, or the string "None".
        my $postcode = "None";
        if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
            $postcode = $1;
        }

        $writer->startTag("application");
        $writer->dataElement("council_reference", $refno);
        $writer->dataElement("address", $address);
        $writer->dataElement("postcode", $postcode);
        $writer->dataElement("description", $proposal);
        $writer->dataElement("info_url", $info_url);
        $writer->dataElement("comment_url", "planning.representations\@drs.glasgow.gov.uk");
        $writer->dataElement("date_received", $date_received);
        $writer->endTag;
    }
}

# Close <applications> and <planning>, then finish the document.
$writer->endTag;
$writer->endTag;
$writer->end;

+ 82
- 0
python_scrapers/Highland.cgi Zobrazit soubor

@@ -0,0 +1,82 @@
#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use File::Temp qw(tempfile);
use LWP::Simple;
use POSIX;
use Encode;
use CGI;
use CGI::Carp;

# Returns 1 when the argument is defined and consists solely of one or more
# ASCII digits, 0 otherwise.
sub sanity_check {
    my ($var) = @_;
    defined $var or return 0;
    # \A and \z anchor to the true string ends; with ^...$ a value carrying a
    # trailing newline (e.g. "12\n") would be accepted as well.
    $var =~ /\A[0-9]+\z/ or return 0;
    return 1;
}

# Emits an empty Highland result document (with its CGI header) on stdout,
# then dies with "<year>/<month>/<day> failed: <reason>".
# Arguments: year, month, day, reason text.
sub no_results {
    my ($yyyy, $mm, $dd, $why) = @_;

    # The document contains nothing to interpolate, so quote it literally.
    print <<'NIL';
Content-type: text/xml

<?xml version="1.0" encoding="UTF-8"?>
<planning>
<authority_name>Highland Council</authority_name>
<authority_short_name>Highland</authority_short_name>
<applications>
</applications>
</planning>
NIL

    # Trailing newline keeps Perl from appending file/line information.
    die join("/", $yyyy, $mm, $dd) . " failed: " . $why . "\n";
}

# Main CGI flow: validate the date parameters, locate the weekly list for
# that date on the Highland index page, download its PDF and run
# ./Highland.pl on it, which prints the XML body to stdout.
my $cgi = CGI->new;

my $year = $cgi->param("year");
my $month = $cgi->param("month");
my $day = $cgi->param("day");

unless (sanity_check($year) && sanity_check($month) && sanity_check($day)) {
    print <<ERROR;
Content-type: text/plain

Need year, month, day parameters
ERROR
    exit 0;
}

my $tree = HTML::TreeBuilder->new;
# $tree->parse_file('weekly-planning-bw-lists.htm');
$tree->parse(decode_utf8(get('http://www.highland.gov.uk/yourenvironment/planning/planningapplications/weekly-planning-bw-lists.htm') or die "couldn't fetch index page"));
$tree->eof;

# Pattern for the month heading: month name and year, separated by either a
# plain space or a non-breaking space (\xa0).
my $monthyear_re = strftime('%B[ \xa0]%Y', 0, 0, 0, 1, $month-1, $year-1900);

my ($month_h2) = $tree->look_down(
    "_tag", "h2",
    sub { $_[0]->as_text =~ /$monthyear_re/ }
);
$month_h2 or no_results($year, $month, $day, "Cannot find month header");

# The node immediately to the right of the heading is searched for the day
# link. It can be missing or a bare text node, neither of which can be
# searched with look_down.
my $month_list = $month_h2->right;
ref($month_list) or no_results($year, $month, $day, "Cannot find month list");

# Pattern for the link text of the requested day, optionally preceded by a
# "... to " range start.
my $day_re = strftime('Planning Applications (?:[A-Za-z0-9 ]*?to )?%b[a-z]* ?%e[a-z]', 0, 0, 0, $day, $month-1, $year-1900);

my ($day_link) = $month_list->look_down(
    "_tag", "a",
    sub { $_[0]->as_text =~ /$day_re/ }
);
$day_link or no_results($year, $month, $day, "Cannot find day link");

my $day_absurl = "http://www.highland.gov.uk".$day_link->attr('href');

# LWP::Simple::get returns undef on failure; report that like the other
# lookup failures instead of writing an empty temporary file.
my $pdf = get($day_absurl);
defined $pdf or no_results($year, $month, $day, "Cannot fetch $day_absurl");

my ($fh, $filename) = tempfile(SUFFIX => ".pdf");
binmode $fh;    # the PDF is binary data
print $fh $pdf;
close($fh);

print "Content-type: text/xml\n\n";
# system returns the child's wait status (the same value as $?), or -1 if
# the program could not be started.
my $status = system "./Highland.pl", $filename, $day_absurl;

# Remove the temporary file before reporting any failure so it never leaks.
my $removed = unlink $filename;
my $unlink_error = $!;

$status == 0 or die "system failed: $status";
$removed or die "cannot unlink temporary file $filename: $unlink_error";

+ 72
- 0
python_scrapers/Highland.pl Zobrazit soubor

@@ -0,0 +1,72 @@
#!/usr/bin/perl -w

use strict;
use XML::Writer;

# Converts a Highland planning-list PDF into planning XML on stdout.
#   $ARGV[0]  path of the PDF to read
#   $ARGV[1]  URL written to every application's info_url
my ($pdf_path, $source_url) = @ARGV[0, 1];

my $xml = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

$xml->xmlDecl("UTF-8");

$xml->startTag("planning");
$xml->dataElement("authority_name", "Highland Council");
$xml->dataElement("authority_short_name", "Highland");
$xml->startTag("applications");

open(my $pdftotext, '-|', "pdftotext", "-layout", $pdf_path, "-") or die "open failed: $!";
RECORD: while (my $row = <$pdftotext>) {
    # Everything up to the next "Ref Number" line is ignored.
    next RECORD unless $row =~ /^\s*Ref Number\s*(\S+)/;
    my $reference = $1;

    # Text collected per labelled section of the record.
    my %text = (address => "", proposal => "", case_officer => "");
    my $validated;            # token following "Validation Date"
    my $target;               # key of %text being appended to; undef = discard
    my $seen_case_officer;    # set once a "Case Officer" line has been read

    LINE: while (1) {
        chomp $row;
        $row =~ s/^\s+//;
        $row =~ s/\s+$//;

        # A leading label switches the section; the label itself is removed
        # (except for "Validation Date", whose line is discarded anyway).
        if ($row =~ s/^Location of Works//) {
            $target = "address";
        }
        elsif ($row =~ s/^Description of Works//) {
            $target = "proposal";
        }
        elsif ($row =~ s/^Case Officer//) {
            $target = "case_officer";
        }
        elsif (($row =~ s/^Community Council//) || ($row =~ s/^Applicant Name//) || ($row =~ s/^Applicant Address//)) {
            $target = undef;
        }
        elsif ($row =~ /^Validation Date\s*(\S+)/) {
            $validated = $1;
            $target = undef;
        }

        $row =~ s/^\s+//;
        if (defined $target) {
            # Continuation lines are joined with a single space.
            $text{$target} .= " " if $text{$target} ne "";
            $text{$target} .= $row;
        }

        # The record ends at end of input, or at the first line of length
        # one (a bare newline) that follows a "Case Officer" line.
        $row = <$pdftotext>;
        last LINE unless defined $row;
        last LINE if $seen_case_officer && length($row) == 1;
        $seen_case_officer = 1 if $row =~ /^\s*Case Officer/;
    }

    # First postcode-shaped token in the address, or the string "None".
    my $postcode = $text{address} =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/
        ? $1
        : "None";

    # First e-mail-shaped token in the case officer text, or "None".
    my $comment_url = $text{case_officer} =~ /([A-Za-z0-9\.]+\@[A-Za-z0-9\.]+)/
        ? "$1"
        : "None";

    my @elements = (
        council_reference => $reference,
        address           => $text{address},
        postcode          => $postcode,
        description       => $text{proposal},
        info_url          => $source_url,
        comment_url       => $comment_url,
        date_received     => $validated,
    );

    $xml->startTag("application");
    while (my ($tag, $value) = splice @elements, 0, 2) {
        $xml->dataElement($tag, $value);
    }
    $xml->endTag;
}

# Close <applications> and <planning>, then finish the document.
$xml->endTag;
$xml->endTag;
$xml->end;

+ 1
- 1
python_scrapers/Ocella.py Zobrazit soubor

@@ -208,7 +208,7 @@ if __name__ == '__main__':
# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")
parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")


print parser.getResults(21,5,2008)


+ 6
- 0
python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -31,3 +31,9 @@
"Maldon.py", "420"
"Medway.py", "420"
"Shropshire.py", "420"
"Brentwood.pl", "493"
"Brentwood.cgi", "493"
"Glasgow.pl", "493"
"Glasgow.cgi", "493"
"Highland.pl", "493"
"Highland.cgi", "493"

Načítá se…
Zrušit
Uložit