Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

116 lines
3.2 KiB

  #!/usr/bin/perl -w
  #
  # Convert a London Borough of Redbridge planning-applications PDF into the
  # planningalerts XML format, written to STDOUT.
  #
  # Usage: <script> <pdf-file> <info-url>
  #
  #   <pdf-file>  PDF to read; it is converted to text with the external
  #               command "pdftotext -layout".
  #   <info-url>  Value copied verbatim into every <info_url> element.
  #
  # The text is a three-column table introduced by a heading line of the form
  # "App.No:  Location:  Agent".  The character offsets of those three headings
  # are used to slice the following lines into columns:
  #   column 1 - reference number, "Deposit Date: ..." and "Case Officer ..."
  #   column 2 - the address, then (after a blank column-2 line) the proposal
  #   column 3 - ignored
  use strict;
  use warnings;
  use XML::Writer;
  use Date::Parse;
  use POSIX;

  my $file     = $ARGV[0];
  my $info_url = $ARGV[1];
  die "usage: $0 <pdf-file> <info-url>\n" unless defined $file;
  $info_url = "" unless defined $info_url;

  my $writer = XML::Writer->new(DATA_MODE => 1, DATA_INDENT => 2);

  $writer->xmlDecl("UTF-8");
  $writer->startTag("planning");
  $writer->dataElement("authority_name", "London Borough of Redbridge");
  $writer->dataElement("authority_short_name", "Redbridge");
  $writer->startTag("applications");

  # List-form pipe open: the file name is never seen by a shell.
  open(my $fh, '-|', "pdftotext", "-layout", $file, "-") or die "open failed: $!";

  PDF_LINE:
  while (my $line = <$fh>) {
      # pdftotext marks each page break with a form feed; drop it so it cannot
      # shift the column offsets measured below.
      $line =~ s/\f//g;
      chomp $line;

      # Only a table heading line starts a new application entry.
      next PDF_LINE unless $line =~ /^\s*(App\.No:)\s*(Location:)\s+(Agent)/;

      # Start offsets of the three headings (captures 1..3 of the match above).
      my ($ofs_col1, $ofs_col2, $ofs_col3) = @-[1 .. 3];

      # Column 1 headings sometimes break onto two lines, so the whole column
      # is accumulated here and matched as one string afterwards.
      my $col1_full  = "";
      my $address    = "";
      my $proposal   = "";
      my $cur_field  = \$address;    # where column 2 text is currently appended
      my $near_end   = 0;            # set once "Case Officer" has been seen
      my $redo_outer = 0;            # set when the next heading was already read

      ENTRY_LINE:
      while ($line = <$fh>) {
          if ($line =~ s/\f//g) {
              # Alignment may have changed for the new page; take care of
              # adjustments of at most 2.  We may lose a few characters, but
              # luckily a page break during an entry doesn't happen very often
              # (only 4 times during 2005-July 2008).
              $ofs_col2 -= 2;
              $ofs_col3 -= 2;
          }

          if ($line =~ /^\s*App\.No:/) {
              # The next entry's heading: finish this entry, then let the outer
              # loop re-process this very line instead of reading a new one.
              $redo_outer = 1;
              last ENTRY_LINE;
          }
          chomp $line;

          # A line holding only a page number: ignore it unless near the end
          # of the entry, where it terminates the entry.
          if ($line =~ /^\s{10,}[0-9]+\s*$/) {
              last ENTRY_LINE if $near_end;
              next ENTRY_LINE;
          }

          my $col1 = length($line) > $ofs_col1
                   ? substr($line, $ofs_col1, $ofs_col2 - $ofs_col1)
                   : "";
          my $col2 = length($line) > $ofs_col2
                   ? substr($line, $ofs_col2, $ofs_col3 - $ofs_col2)
                   : "";

          # Remove any obvious spillover text (only needed for a page break
          # during an entry).
          $col2 =~ s/\s{10,}\S.*//g;

          $col1 =~ s/^\s+//;
          $col1 =~ s/\s+$//;
          $col2 =~ s/^\s+//;
          $col2 =~ s/\s+$//;

          $col1_full .= " " if $col1_full ne "" and $col1 ne "";
          $col1_full .= $col1;

          if ($col2 eq "") {
              # A blank column 2 ends the current field: the first one switches
              # to the proposal; once a proposal has been collected, any further
              # column 2 text is discarded.
              $cur_field = $proposal eq "" ? \$proposal : undef;
          }
          elsif (defined $cur_field) {
              $$cur_field .= " " if $$cur_field ne "";
              $$cur_field .= $col2;
          }

          last ENTRY_LINE if $near_end and $line =~ /^\s*$/;
          $near_end = 1 if $col1_full =~ /Case Officer/;
      }

      my ($refno, $date_received) = ($col1_full =~ /^(.+) Deposit Date: (\S+)/);
      $refno         = "" unless defined $refno;
      $date_received = "" unless defined $date_received;

      # The reference may have been joined from two lines; drop the joining
      # spaces together with any hyphen directly in front of them.
      $refno =~ s/-? //g;

      my $postcode = "None";
      if ($address =~ /([A-Z][A-Z]?\d(?:\d|[A-Z])? ?\d[A-Z][A-Z])/) {
          $postcode = $1;
      }

      # strptime() returns an empty list for text it cannot parse, and
      # strftime() croaks when called with a format only, so guard the call.
      # Only sec, min, hour, mday, mon and year are handed over; fields that
      # strptime() left undefined default to 0.
      my $norm_date_received = "";
      if ($date_received ne "") {
          my @date_parts = strptime($date_received);
          if (@date_parts) {
              $norm_date_received = strftime("%d/%m/%Y",
                  map { defined $_ ? $_ : 0 } @date_parts[0 .. 5]);
          }
      }

      $writer->startTag("application");
      $writer->dataElement("council_reference", $refno);
      $writer->dataElement("address", $address);
      $writer->dataElement("postcode", $postcode);
      $writer->dataElement("description", $proposal);
      $writer->dataElement("info_url", $info_url);
      $writer->dataElement("comment_url", 'planning.enquiry@redbridge.gov.uk');
      $writer->dataElement("date_received", $norm_date_received);
      $writer->endTag("application");

      # The inner loop stopped on the next heading line, which is still in
      # $line: run the outer body again on it without reading another line.
      redo PDF_LINE if $redo_outer;
  }

  # Closing the pipe reaps pdftotext; report a failure but still finish the
  # document so the output stays well-formed.
  close $fh
      or warn($! ? "error closing pdftotext pipe: $!\n"
                 : "pdftotext exited with status " . ($? >> 8) . "\n");

  $writer->endTag("applications");
  $writer->endTag("planning");
  $writer->end;