|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- #!/usr/bin/perl -w
-
- use strict;
- use XML::Writer;
- use Date::Parse;
- use POSIX;
-
- my $file = $ARGV[0];
- my $info_url = $ARGV[1];
-
- my $writer = new XML::Writer(DATA_MODE => 1, DATA_INDENT => 2);
-
- $writer->xmlDecl("UTF-8");
-
- $writer->startTag("planning");
- $writer->dataElement("authority_name", "London Borough of Redbridge");
- $writer->dataElement("authority_short_name", "Redbridge");
- $writer->startTag("applications");
-
- open (my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";
- while (my $line = <$fh>) {
- $line =~ s///g;
- chomp $line;
- if ($line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/) {
- my $ofs_col1 = $-[1];
- my $ofs_col2 = $-[2];
- my $ofs_col3 = $-[3];
- my $col1_full = ""; # sometimes col1 headings break up onto 2 lines so grab the whole thing and match on it
- my $address = ""; my $proposal = "";
- my $cur_field = \$address;
- my $near_end = 0; my $redo_outer = 0;
- while ($line = <$fh>) {
- if ($line =~ s///g) { # alignment may have changed for new page, take care of adjustments of at most 2
- # we may lose a few characters but luckily a page break during an entry doesn't
- # happen very often (only 4 times during 2005-July 2008)
- $ofs_col2 -= 2;
- $ofs_col3 -= 2;
- }
-
- if ($line =~ /^\s*App\.No:/) {
- $redo_outer = 1;
- last;
- }
-
- chomp $line;
- if ($line =~ /^\s{10,}[0-9]+\s*$/) { # a line with a page number... ignore unless near end
- if ($near_end) {
- last;
- } else {
- next;
- }
- }
- my $col1; my $col2;
- if (length $line > $ofs_col1) {
- $col1 = substr $line, $ofs_col1, $ofs_col2-$ofs_col1;
- } else {
- $col1 = "";
- }
- if (length $line > $ofs_col2) {
- $col2 = substr $line, $ofs_col2, $ofs_col3-$ofs_col2;
- } else {
- $col2 = "";
- }
-
- $col2 =~ s/\s{10,}\S.*//g; # remove any obvious spillover text (only needed for a page break during an entry)
-
- $col1 =~ s/^\s+//; $col1 =~ s/\s+$//;
- $col2 =~ s/^\s+//; $col2 =~ s/\s+$//;
-
- $col1_full .= " " if $col1_full ne "" and $col1 ne "";
- $col1_full .= $col1;
-
- if ($col2 eq "") {
- if ($proposal eq "") {
- $cur_field = \$proposal;
- } else {
- $cur_field = undef;
- }
- } elsif (defined $cur_field) {
- $$cur_field .= " " if $$cur_field ne "" and $col2 ne "";
- $$cur_field .= $col2;
- }
- last if $near_end and $line =~ /^\s*$/;
- $near_end = 1 if $col1_full =~ /Case Officer/;
- }
- my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/);
- $refno ||= ""; $date_received ||= "";
- $refno =~ s/-? //g;
-
- my $postcode = "None";
- if ($address =~ /([A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z])/) {
- $postcode = $1;
- }
- my $norm_date_received = strftime("%d/%m/%Y", map { defined $_ ? $_ : 0 } strptime($date_received));
-
- $writer->startTag("application");
- $writer->dataElement("council_reference", $refno);
- $writer->dataElement("address", $address);
- $writer->dataElement("postcode", $postcode);
- $writer->dataElement("description", $proposal);
- $writer->dataElement("info_url", $info_url);
- $writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk');
- $writer->dataElement("date_received", $norm_date_received);
- $writer->endTag;
-
- if ($redo_outer) {
- $redo_outer = 0;
- redo;
- }
- }
- }
-
- $writer->endTag;
- $writer->endTag;
- $writer->end;
|