Browse Source

add some Acolnet sites:

Babergh
Basingstoke
Bassetlaw
Bury
Derby
master
duncan.parkes 17 years ago
parent
commit
43dceae0c6
100 changed files with 1006 additions and 514 deletions
  1. +364
    -0
      cgi-bin/AcolnetParser.py
  2. +0
    -0
      cgi-bin/Allerdale.cgi
  3. +0
    -0
      cgi-bin/Alnwick.cgi
  4. +0
    -0
      cgi-bin/Angus.cgi
  5. +0
    -0
      cgi-bin/Aylesbury Vale.cgi
  6. +29
    -0
      cgi-bin/Babergh.cgi
  7. +0
    -0
      cgi-bin/Barrow.cgi
  8. +0
    -0
      cgi-bin/Basildon.cgi
  9. +29
    -0
      cgi-bin/Basingstoke and Deane.cgi
  10. +29
    -0
      cgi-bin/Bassetlaw.cgi
  11. +0
    -0
      cgi-bin/Bath.cgi
  12. +0
    -0
      cgi-bin/Bexley.cgi
  13. +0
    -0
      cgi-bin/Blaby.cgi
  14. +0
    -0
      cgi-bin/Bolsover.cgi
  15. +0
    -0
      cgi-bin/Bristol.cgi
  16. +0
    -0
      cgi-bin/Buckinghamshire.cgi
  17. +29
    -0
      cgi-bin/Bury.cgi
  18. +0
    -0
      cgi-bin/Chelmsford.cgi
  19. +0
    -0
      cgi-bin/Cherwell.cgi
  20. +0
    -0
      cgi-bin/Chorley.cgi
  21. +0
    -0
      cgi-bin/City of London.cgi
  22. +0
    -0
      cgi-bin/Cornwall.cgi
  23. +0
    -0
      cgi-bin/Coventry.cgi
  24. +0
    -108
      cgi-bin/Dacorum.cgi
  25. +0
    -0
      cgi-bin/Denbighshire.cgi
  26. +29
    -0
      cgi-bin/Derby.cgi
  27. +0
    -0
      cgi-bin/Doncaster.cgi
  28. +0
    -0
      cgi-bin/Dundee.cgi
  29. +0
    -0
      cgi-bin/Durham.cgi
  30. +0
    -0
      cgi-bin/Ealing.cgi
  31. +0
    -0
      cgi-bin/Easington.cgi
  32. +0
    -0
      cgi-bin/East Devon.cgi
  33. +0
    -0
      cgi-bin/East Dorset.cgi
  34. +0
    -122
      cgi-bin/EastHerts.cgi
  35. +0
    -0
      cgi-bin/Edinburgh.cgi
  36. +0
    -121
      cgi-bin/Enfield.cgi
  37. +0
    -0
      cgi-bin/Epsom and Ewell.cgi
  38. +0
    -0
      cgi-bin/Fenland.cgi
  39. +0
    -0
      cgi-bin/Gateshead.cgi
  40. +0
    -0
      cgi-bin/Gedling.cgi
  41. +0
    -0
      cgi-bin/Gloucestershire.cgi
  42. +0
    -0
      cgi-bin/Gravesham.cgi
  43. +0
    -0
      cgi-bin/Hammersmith and Fulham.cgi
  44. +0
    -0
      cgi-bin/Haringey.cgi
  45. +0
    -0
      cgi-bin/Harrogate.cgi
  46. +0
    -0
      cgi-bin/Hart.cgi
  47. +0
    -0
      cgi-bin/Hartlepool.cgi
  48. +0
    -0
      cgi-bin/High Peak.cgi
  49. +0
    -0
      cgi-bin/Huntingdonshire.cgi
  50. +0
    -0
      cgi-bin/Kerrier.cgi
  51. +0
    -0
      cgi-bin/Knowsley.cgi
  52. +0
    -0
      cgi-bin/Lancaster.cgi
  53. +0
    -0
      cgi-bin/Luton.cgi
  54. +0
    -0
      cgi-bin/Malvern Hills.cgi
  55. +0
    -0
      cgi-bin/Mid Devon.cgi
  56. +0
    -0
      cgi-bin/Milton Keynes.cgi
  57. +133
    -0
      cgi-bin/MultipartPostHandler.py
  58. +0
    -0
      cgi-bin/NW Leicestershire.cgi
  59. +0
    -0
      cgi-bin/Newcastle-under-Lyme.cgi
  60. +0
    -0
      cgi-bin/Newham.cgi
  61. +0
    -0
      cgi-bin/North Tyneside.cgi
  62. +0
    -0
      cgi-bin/North Warwickshire.cgi
  63. +0
    -0
      cgi-bin/Northumberland.cgi
  64. +0
    -0
      cgi-bin/Oadby and Wigston.cgi
  65. +0
    -0
      cgi-bin/Oswestry.cgi
  66. +0
    -0
      cgi-bin/Peterborough.cgi
  67. +0
    -0
      cgi-bin/Portsmouth.cgi
  68. +0
    -0
      cgi-bin/Redditch.cgi
  69. +0
    -0
      cgi-bin/Rushmoor.cgi
  70. +0
    -0
      cgi-bin/Scarborough.cgi
  71. +0
    -0
      cgi-bin/Sevenoaks.cgi
  72. +0
    -0
      cgi-bin/South Bucks.cgi
  73. +0
    -0
      cgi-bin/South Ribble.cgi
  74. +0
    -0
      cgi-bin/South Staffordshire.cgi
  75. +0
    -0
      cgi-bin/SouthOxfordshire.cgi
  76. +0
    -0
      cgi-bin/Southampton.cgi
  77. +0
    -0
      cgi-bin/Spelthorne.cgi
  78. +0
    -0
      cgi-bin/St Helens.cgi
  79. +0
    -0
      cgi-bin/Stevenage.cgi
  80. +0
    -0
      cgi-bin/Stirling.cgi
  81. +0
    -0
      cgi-bin/Stockton-On-Tees.cgi
  82. +0
    -0
      cgi-bin/Stratford.cgi
  83. +0
    -0
      cgi-bin/Sunderland.cgi
  84. +0
    -0
      cgi-bin/Teignbridge.cgi
  85. +0
    -0
      cgi-bin/Test Valley.cgi
  86. +0
    -0
      cgi-bin/Tonbridge.cgi
  87. +0
    -0
      cgi-bin/Torbay.cgi
  88. +0
    -0
      cgi-bin/Vale Royal.cgi
  89. +0
    -0
      cgi-bin/Waveney.cgi
  90. +0
    -0
      cgi-bin/Wear Valley.cgi
  91. +0
    -0
      cgi-bin/Wellingborough.cgi
  92. +0
    -0
      cgi-bin/West Berkshire.cgi
  93. +0
    -0
      cgi-bin/West Lancashire.cgi
  94. +0
    -0
      cgi-bin/West Norfolk.cgi
  95. +0
    -0
      cgi-bin/Winchester.cgi
  96. +0
    -0
      cgi-bin/Woking.cgi
  97. +0
    -0
      cgi-bin/Wolverhampton.cgi
  98. +0
    -0
      cgi-bin/York.cgi
  99. +0
    -163
      cgi-bin/broxbourne.cgi
  100. +364
    -0
      python_scrapers/AcolnetParser.py

+ 364
- 0
cgi-bin/AcolnetParser.py View File

@@ -0,0 +1,364 @@
#!/usr/local/bin/python

import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup

import urlparse

import re

end_head_regex = re.compile("</head", re.IGNORECASE)

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

from datetime import date
from time import strptime


# Acolnet sites write registration dates as e.g. 25/04/2007.
date_format = "%d/%m/%Y"
# NOTE(review): looks like a leftover test value — nothing visible reads it
# (getResultsByDayMonthYear builds its own local our_date); confirm before removing.
our_date = date(2007,4,25)


class AcolnetParser(HTMLParser.HTMLParser):
    """Scraper for planning applications on Acolnet-based council sites.

    Each application on a results page sits in its own <table> with
    class "results-table".  Subclasses locate the data by setting the
    *_tr class attributes to the 1-based <tr> index (within that table)
    holding each datum, and supply an action_regex that extracts the
    search form's action URL from the search page HTML.
    """

    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # 1-based count of <tr>s seen so far in the current results-table.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        self._in_td = False

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1
        elif self._current_application is not None:
            # Only count rows of the outer results-table, not of subtables.
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value

    def handle_data(self, data):
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])

                #print received_date

                self._current_application.date_received = received_date

            elif self._tr_number == self.location_tr:
                location = data.strip()

                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()

    def handle_endtag(self, tag):
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse all applications registered on the given date.

        Returns the PlanningAuthorityResults collector (self._results).
        """
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()

        action = groups[0]
        #print action

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        # Search a one-day window: both bounds set to the requested date.
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }

        # The search form is posted as multipart/form-data, hence the handler.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: scrape the given date and return the XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class BaberghParser(AcolnetParser):
    """Site-specific settings for Babergh District Council."""
    #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Babergh District Council"
    #authority_short_name = "Babergh"

    # It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

class BasingstokeParser(AcolnetParser):
    """Site-specific settings for Basingstoke and Deane Borough Council."""
    #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    #authority_name = "Basingstoke and Deane Borough Council"
    #authority_short_name = "Basingstoke and Deane"

    # It would be nice to scrape this...
    comments_email_address = "development.control@basingstoke.gov.uk"

    # NB: this site emits doubled quotes around the onSubmit attribute,
    # so the pattern deliberately matches "" ... "".
    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class BassetlawParser(AcolnetParser):
    """Site-specific settings for Bassetlaw District Council."""
    #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Bassetlaw District Council"
    #authority_short_name = "Bassetlaw"

    comments_email_address = "planning@bassetlaw.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
        return div_regex.sub('', html)


class BridgenorthParser(AcolnetParser):
    """Site-specific settings for Bridgnorth District Council.

    NOTE(review): the class name spells it "Bridgenorth" but the council
    (and its URLs) spell it "Bridgnorth" — kept for compatibility.
    """
    #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bridgenorth District Council"
    #authority_short_name = "Bridgenorth"

    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

class BuryParser(AcolnetParser):
    """Site-specific settings for Bury Metropolitan Borough Council."""
    #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bury Metropolitan Borough Council"
    #authority_short_name = "Bury"

    comments_email_address = "development.control@bury.gov.uk"
    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

## class CanterburyParser(AcolnetParser):
## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

## case_number_tr = 1 # this one can be got by the td class attribute
## reg_date_tr = 2
## location_tr = 4
## proposal_tr = 5

## authority_name = "Canterbury City Council"
## authority_short_name = "Canterbury"

## comments_email_address = ""
## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class CarlisleParser(AcolnetParser):
    """Site-specific settings for Carlisle City Council."""
    #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Carlisle City Council"
    #authority_short_name = "Carlisle"

    comments_email_address = "dc@carlisle.gov.uk"
    # Same doubled-quote onSubmit quirk as Basingstoke.
    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")


class DerbyParser(AcolnetParser):
    """Site-specific settings for Derby City Council."""
    #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Derby City Council"
    #authority_short_name = "Derby"

    comments_email_address = "developmentcontrol@derby.gov.uk"
    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")


if __name__ == '__main__':
day = 15
month = 3
year = 2007

# working
# parser = BasingstokeParser()
parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# works with the divs stripped out
#parser = BassetlawParser()

# returns error 400 - bad request
#parser = BridgenorthParser()

# working
#parser = BuryParser()

# cambridgeshire is a bit different...
# no advanced search page

# canterbury
# results as columns of one table

# returns error 400 - bad request
#parser = CarlisleParser()

# working
#parser = DerbyParser()
print parser.getResults(day, month, year)

+ 0
- 0
cgi-bin/Allerdale.cgi View File


+ 0
- 0
cgi-bin/Alnwick.cgi View File


+ 0
- 0
cgi-bin/Angus.cgi View File


+ 0
- 0
cgi-bin/Aylesbury Vale.cgi View File


+ 29
- 0
cgi-bin/Babergh.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Babergh District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Babergh District Council"
authority_short_name = "Babergh"
base_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BaberghParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Barrow.cgi View File


+ 0
- 0
cgi-bin/Basildon.cgi View File


+ 29
- 0
cgi-bin/Basingstoke and Deane.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Basingstoke and Deane Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Basingstoke and Deane Borough Council"
authority_short_name = "Basingstoke and Deane"
base_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BasingstokeParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Bassetlaw.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Bassetlaw District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Bassetlaw District Council"
authority_short_name = "Bassetlaw"
base_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BassetlawParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Bath.cgi View File


+ 0
- 0
cgi-bin/Bexley.cgi View File


+ 0
- 0
cgi-bin/Blaby.cgi View File


+ 0
- 0
cgi-bin/Bolsover.cgi View File


+ 0
- 0
cgi-bin/Bristol.cgi View File


+ 0
- 0
cgi-bin/Buckinghamshire.cgi View File


+ 29
- 0
cgi-bin/Bury.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Bury Metropolitan Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Bury Metropolitan Borough Council"
authority_short_name = "Bury"
base_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

import AcolnetParser

parser = AcolnetParser.BuryParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Chelmsford.cgi View File


+ 0
- 0
cgi-bin/Cherwell.cgi View File


+ 0
- 0
cgi-bin/Chorley.cgi View File


+ 0
- 0
cgi-bin/City of London.cgi View File


+ 0
- 0
cgi-bin/Cornwall.cgi View File


+ 0
- 0
cgi-bin/Coventry.cgi View File


+ 0
- 108
cgi-bin/Dacorum.cgi View File

@@ -1,108 +0,0 @@
#!/usr/bin/perl

# Scraper for Dacorum Borough Council planning applications.
# Fetches the ASP.NET search form, replays its __VIEWSTATE to search a
# single registration date, and re-emits the result table as XML.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Dacorum planning search
our $SearchURL = "http://www.dacorum.gov.uk/default.aspx?page=1495";
our $InfoURL = "http://www.dacorum.gov.uk/Default.aspx?page=1497&ID=";
our $CommentURL = "http://www.dacorum.gov.uk/Default.aspx?page=2847&ID=";

# We're a CGI script...
my $query = CGI->new();

# Construct an LWP user agent
# (cookie jar and redirectable POSTs are needed for the ASP.NET session)
our $UA = LWP::UserAgent->new(env_proxy => 1,
cookie_jar => {},
requests_redirectable => [ 'GET', 'HEAD', 'POST' ]);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Do the search: From and To both set to the requested day, so the
# result set covers exactly one registration date.
my $page = do_post({"__VIEWSTATE" => $state,
"Template:_ctl10:_ctl0:btnSearch" => "Search",
"Template:_ctl10:_ctl0:tbRegistrationFromDay" => $query->param("day"),
"Template:_ctl10:_ctl0:tbRegistrationFromMon" => $query->param("month"),
"Template:_ctl10:_ctl0:tbRegistrationFromYear" => $query->param("year"),
"Template:_ctl10:_ctl0:tbRegistrationToDay" => $query->param("day"),
"Template:_ctl10:_ctl0:tbRegistrationToMon" => $query->param("month"),
"Template:_ctl10:_ctl0:tbRegistrationToYear" => $query->param("year")});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Dacorum Borough Council");
$Writer->dataElement("authority_short_name", "Dacorum");
$Writer->startTag("applications");

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

# Data rows carry one of these two classes; header rows are skipped.
if ($cells[0]->attr("class") eq "FormGridDataItem" ||
$cells[0]->attr("class") eq "FormGridAlternatingDataItem")
{
my $reference = $cells[0]->as_trimmed_text;
my $address = $cells[1]->as_trimmed_text;
my $description = $cells[2]->as_trimmed_text;
my $date = $cells[3]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address
# ("XX99 9XX" shape) — NOTE(review): confirm against edge cases.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("comment_url", $CommentURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the state from a page so we can repost it.
# Takes a parsed HTML::TreeBuilder tree; returns the value of the hidden
# ASP.NET __VIEWSTATE input. NOTE(review): no check that the input exists —
# a missing field would die on the undefined $viewstate.
sub get_state
{
my $page = shift;
my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");

return $viewstate->attr("value");
}

# Post to the planning search page.
# Extra arguments are passed straight through to LWP's post() (e.g. a
# hashref of form fields). Dies on HTTP failure; otherwise returns the
# response body parsed into an HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post($SearchURL, @_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

+ 0
- 0
cgi-bin/Denbighshire.cgi View File


+ 29
- 0
cgi-bin/Derby.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Derby City Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Derby City Council"
authority_short_name = "Derby"
base_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.DerbyParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Doncaster.cgi View File


+ 0
- 0
cgi-bin/Dundee.cgi View File


+ 0
- 0
cgi-bin/Durham.cgi View File


+ 0
- 0
cgi-bin/Ealing.cgi View File


+ 0
- 0
cgi-bin/Easington.cgi View File


+ 0
- 0
cgi-bin/East Devon.cgi View File


+ 0
- 0
cgi-bin/East Dorset.cgi View File


+ 0
- 122
cgi-bin/EastHerts.cgi View File

@@ -1,122 +0,0 @@
#!/usr/bin/perl

# Scraper for East Herts Council planning applications (SwiftLG site).
# Searches one registration date, follows any additional results pages,
# and emits the applications as XML.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the East Herts planning search
our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";
our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the SwiftLG form expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: From and To both set to the same day.
my $page = do_post($SearchURL,
{"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
"REGTODATE.MAINBODY.WPACIS.1." => $date,
"SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "East Herts Council");
$Writer->dataElement("authority_short_name", "East Herts");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
# Fetch this page...
$page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request.
# Arguments are passed through to LWP's get(). Dies on HTTP failure;
# otherwise returns the body parsed into an HTML::TreeBuilder tree.
sub do_get
{
my $response = $UA->get(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request.
# Arguments are passed through to LWP's post() (URL plus form hashref).
# Dies on HTTP failure; otherwise returns a parsed HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page.
# Relies on the file-scoped $Writer, $date, $InfoURL and $CommentURL.
sub output_applications
{
my $page = shift;

# Find the result table (identified only by its cell spacing/padding —
# fragile if the site's markup changes).
my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

if (@cells >= 3)
{
my $reference = $cells[0]->as_trimmed_text;
my $description = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("comment_url", $CommentURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

+ 0
- 0
cgi-bin/Edinburgh.cgi View File


+ 0
- 121
cgi-bin/Enfield.cgi View File

@@ -1,121 +0,0 @@
#!/usr/bin/perl

# Scraper for Enfield Council planning applications (SwiftLG site).
# Same flow as the East Herts scraper, but with no comment URL.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;


# The master URLs for the Enfield planning search
our $SearchURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the SwiftLG form expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: From and To both set to the same day.
my $page = do_post($SearchURL,
{"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
"REGTODATE.MAINBODY.WPACIS.1." => $date,
"SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Enfield Council");
$Writer->dataElement("authority_short_name", "Enfield");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
# Fetch this page...
$page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request.
# Arguments are passed through to LWP's get(). Dies on HTTP failure;
# otherwise returns the body parsed into an HTML::TreeBuilder tree.
sub do_get
{
my $response = $UA->get(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request.
# Arguments are passed through to LWP's post() (URL plus form hashref).
# Dies on HTTP failure; otherwise returns a parsed HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page.
# Relies on the file-scoped $Writer, $date and $InfoURL.
sub output_applications
{
my $page = shift;

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "apas_tbl");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

if (@cells >= 3)
{
my $reference = $cells[0]->as_trimmed_text;
my $description = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

+ 0
- 0
cgi-bin/Epsom and Ewell.cgi View File


+ 0
- 0
cgi-bin/Fenland.cgi View File


+ 0
- 0
cgi-bin/Gateshead.cgi View File


+ 0
- 0
cgi-bin/Gedling.cgi View File


+ 0
- 0
cgi-bin/Gloucestershire.cgi View File


+ 0
- 0
cgi-bin/Gravesham.cgi View File


+ 0
- 0
cgi-bin/Hammersmith and Fulham.cgi View File


+ 0
- 0
cgi-bin/Haringey.cgi View File


+ 0
- 0
cgi-bin/Harrogate.cgi View File


+ 0
- 0
cgi-bin/Hart.cgi View File


+ 0
- 0
cgi-bin/Hartlepool.cgi View File


+ 0
- 0
cgi-bin/High Peak.cgi View File


+ 0
- 0
cgi-bin/Huntingdonshire.cgi View File


+ 0
- 0
cgi-bin/Kerrier.cgi View File


+ 0
- 0
cgi-bin/Knowsley.cgi View File


+ 0
- 0
cgi-bin/Lancaster.cgi View File


+ 0
- 0
cgi-bin/Luton.cgi View File


+ 0
- 0
cgi-bin/Malvern Hills.cgi View File


+ 0
- 0
cgi-bin/Mid Devon.cgi View File


+ 0
- 0
cgi-bin/Milton Keynes.cgi View File


+ 133
- 0
cgi-bin/MultipartPostHandler.py View File

@@ -0,0 +1,133 @@
####
# 02/2006 Will Holcomb <wholcomb@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#

# I have edited out a bit in the middle of this which reverts to a normal
# post with "application/x-www-form-urlencoded" content-type when there are
# no files.
# Duncan 5/5/2007

"""
Usage:
Enables the use of multipart/form-data for posting forms

Inspirations:
Upload files in python:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306
urllib2_file:
Fabien Seisen: <fabien@seisen.org>

Example:
import MultipartPostHandler, urllib2, cookielib

cookies = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies),
MultipartPostHandler.MultipartPostHandler)
params = { "username" : "bob", "password" : "riviera",
"file" : open("filename", "rb") }
opener.open("http://wwww.bobsite.com/upload/", params)

Further Example:
The main function of this file is a sample which downloads a page and
then uploads it to the W3C validator.
"""

import urllib
import urllib2
import mimetools, mimetypes
import os, stat

class Callable:
    # Python 2 helper: wraps a plain function so it can be stored as a
    # class attribute and called without becoming a bound method
    # (a "static method" substitute, used for multipart_encode below).
    # Relies on old-style class __call__ lookup on the instance.
    def __init__(self, anycallable):
        self.__call__ = anycallable

# Controls how sequences are encoded. If true, elements may be given multiple
# values by assigning a sequence.
# NOTE(review): not referenced anywhere in this file - presumably intended
# for urllib.urlencode's doseq argument; confirm before removing.
doseq = 1

class MultipartPostHandler(urllib2.BaseHandler):
handler_order = urllib2.HTTPHandler.handler_order - 10 # needs to run first

def http_request(self, request):
data = request.get_data()
if data is not None and type(data) != str:
v_files = []
v_vars = []
try:
for(key, value) in data.items():
if type(value) == file:
v_files.append((key, value))
else:
v_vars.append((key, value))
except TypeError:
systype, value, traceback = sys.exc_info()
raise TypeError, "not a valid non-string sequence or mapping object", traceback

boundary, data = self.multipart_encode(v_vars, v_files)
contenttype = 'multipart/form-data; boundary=%s' % boundary
if(request.has_header('Content-Type')
and request.get_header('Content-Type').find('multipart/form-data') != 0):
print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data')
request.add_unredirected_header('Content-Type', contenttype)

request.add_data(data)
return request

def multipart_encode(vars, files, boundary = None, buffer = None):
if boundary is None:
boundary = mimetools.choose_boundary()
if buffer is None:
buffer = ''
for(key, value) in vars:
buffer += '--%s\r\n' % boundary
buffer += 'Content-Disposition: form-data; name="%s"' % key
buffer += '\r\n\r\n' + value + '\r\n'
for(key, fd) in files:
file_size = os.fstat(fd.fileno())[stat.ST_SIZE]
filename = fd.name.split('/')[-1]
contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
buffer += '--%s\r\n' % boundary
buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename)
buffer += 'Content-Type: %s\r\n' % contenttype
# buffer += 'Content-Length: %s\r\n' % file_size
fd.seek(0)
buffer += '\r\n' + fd.read() + '\r\n'
buffer += '--%s--\r\n\r\n' % boundary
return boundary, buffer
multipart_encode = Callable(multipart_encode)

https_request = http_request

## def main():
## import tempfile, sys

## validatorURL = "http://validator.w3.org/check"
## opener = urllib2.build_opener(MultipartPostHandler)

## def validateFile(url):
## temp = tempfile.mkstemp(suffix=".html")
## os.write(temp[0], opener.open(url).read())
## params = { "ss" : "0", # show source
## "doctype" : "Inline",
## "uploaded_file" : open(temp[1], "rb") }
## print opener.open(validatorURL, params).read()
## os.remove(temp[1])

## if len(sys.argv[1:]) > 0:
## for arg in sys.argv[1:]:
## validateFile(arg)
## else:
## validateFile("http://www.google.com")

## if __name__=="__main__":
## main()

+ 0
- 0
cgi-bin/NW Leicestershire.cgi View File


+ 0
- 0
cgi-bin/Newcastle-under-Lyme.cgi View File


+ 0
- 0
cgi-bin/Newham.cgi View File


+ 0
- 0
cgi-bin/North Tyneside.cgi View File


+ 0
- 0
cgi-bin/North Warwickshire.cgi View File


+ 0
- 0
cgi-bin/Northumberland.cgi View File


+ 0
- 0
cgi-bin/Oadby and Wigston.cgi View File


+ 0
- 0
cgi-bin/Oswestry.cgi View File


+ 0
- 0
cgi-bin/Peterborough.cgi View File


+ 0
- 0
cgi-bin/Portsmouth.cgi View File


+ 0
- 0
cgi-bin/Redditch.cgi View File


+ 0
- 0
cgi-bin/Rushmoor.cgi View File


+ 0
- 0
cgi-bin/Scarborough.cgi View File


+ 0
- 0
cgi-bin/Sevenoaks.cgi View File


+ 0
- 0
cgi-bin/South Bucks.cgi View File


+ 0
- 0
cgi-bin/South Ribble.cgi View File


+ 0
- 0
cgi-bin/South Staffordshire.cgi View File


+ 0
- 0
cgi-bin/SouthOxfordshire.cgi View File


+ 0
- 0
cgi-bin/Southampton.cgi View File


+ 0
- 0
cgi-bin/Spelthorne.cgi View File


+ 0
- 0
cgi-bin/St Helens.cgi View File


+ 0
- 0
cgi-bin/Stevenage.cgi View File


+ 0
- 0
cgi-bin/Stirling.cgi View File


+ 0
- 0
cgi-bin/Stockton-On-Tees.cgi View File


+ 0
- 0
cgi-bin/Stratford.cgi View File


+ 0
- 0
cgi-bin/Sunderland.cgi View File


+ 0
- 0
cgi-bin/Teignbridge.cgi View File


+ 0
- 0
cgi-bin/Test Valley.cgi View File


+ 0
- 0
cgi-bin/Tonbridge.cgi View File


+ 0
- 0
cgi-bin/Torbay.cgi View File


+ 0
- 0
cgi-bin/Vale Royal.cgi View File


+ 0
- 0
cgi-bin/Waveney.cgi View File


+ 0
- 0
cgi-bin/Wear Valley.cgi View File


+ 0
- 0
cgi-bin/Wellingborough.cgi View File


+ 0
- 0
cgi-bin/West Berkshire.cgi View File


+ 0
- 0
cgi-bin/West Lancashire.cgi View File


+ 0
- 0
cgi-bin/West Norfolk.cgi View File


+ 0
- 0
cgi-bin/Winchester.cgi View File


+ 0
- 0
cgi-bin/Woking.cgi View File


+ 0
- 0
cgi-bin/Wolverhampton.cgi View File


+ 0
- 0
cgi-bin/York.cgi View File


+ 0
- 163
cgi-bin/broxbourne.cgi View File

@@ -1,163 +0,0 @@
#!/usr/bin/perl

# Scraper for the Borough of Broxbourne planning search. Takes
# year/month/day CGI parameters and emits that day's planning
# applications as XML on stdout.

use strict;
use warnings;

use CGI qw(:cgi);
use DateTime;
#use DateTime::Format::DateParse;
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URL for the Broxbourne planning search
our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx";

# We're a CGI script...
my $query = CGI->new();

# Get the date as an offset from 2000-01-01
# (the site's calendar control identifies a date by its day count from
# that epoch, so that is the value we must post back)
my $epoch = DateTime->new(year => 2000, month => 1, day => 1);
my $querydate = DateTime->new(year => $query->param("year"),
                              month => $query->param("month"),
                              day => $query->param("day"));
$querydate = $querydate->delta_days($epoch)->delta_days;

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Post each date in turn to build up the state - you can thank
# Microsoft and ASP.NET for the horrible way we have to do this
# by posting each argument in turn to build up the state
$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate));
$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate));

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Borough of Broxbourne");
$Writer->dataElement("authority_short_name", "Broxbourne");
$Writer->startTag("applications");

# Get the arguments for the search...
my $args = {
    "Srch" => "rb1",
    "__VIEWSTATE" => $state,
    "btnSearch" => "Search",
    "tbReference" => "",
    "tbRef2" => ""
};

# ...and then (at last) we can do the search!
my $page = do_post($args);

# Loop processing pages of results
while ($page)
{
    # Results are rendered into an ASP.NET DataGrid table
    my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1");

    # Remember the state
    $state = get_state($page);

    # Clear the page for now - this will be reinitialised if we
    # find another page of results to make us go round the loop
    # all over again
    undef $page;

    # Check that we found a table - searches that find no results
    # produce a page with no table in it
    if ($table)
    {
        # Process each row of the results
        foreach my $row ($table->look_down("_tag" => "tr"))
        {
            my @cells = $row->look_down("_tag" => "td");

            # Data rows carry a checkbox input in their first cell
            if ($cells[0]->look_down("_tag" => "input"))
            {
                my $reference = $cells[1]->as_trimmed_text;
                my $date = $cells[2]->as_trimmed_text;
                my $address = $cells[3]->as_trimmed_text;
                my $description = $cells[4]->as_trimmed_text;
                my $postcode;

                # Pull a trailing UK-style postcode out of the address
                if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
                {
                    $postcode = $1;
                }

                $Writer->startTag("application");
                $Writer->dataElement("council_reference", $reference);
                $Writer->dataElement("address", $address);
                $Writer->dataElement("postcode", $postcode);
                $Writer->dataElement("description", $description);
                $Writer->dataElement("date_received", $date);
                $Writer->endTag("application");
            }
            elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5")
            {
                # A full-width row holds the pager links; the ">" link
                # posts back for the next page, which re-enters the loop
                foreach my $link ($cells[0]->look_down("_tag" => "a"))
                {
                    if ($link->as_trimmed_text eq ">" &&
                        $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/)
                    {
                        $page = do_post_back($state, $1, $2);
                    }
                }
            }
        }
    }
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the ASP.NET __VIEWSTATE value from a page so it can be
# reposted with the next request.
sub get_state
{
    my ($page) = @_;

    my $state_input = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");

    return $state_input->attr("value");
}

# Fake up what the doPostBack javascript function in the page does:
# post the viewstate plus an event target/argument pair.
sub do_post_back
{
    my ($state, $target, $argument) = @_;

    # ASP.NET control ids use '$' in the javascript call but ':' in the
    # posted form field name
    $target =~ s/\$/:/g;

    return do_post({
        "__EVENTTARGET"   => $target,
        "__EVENTARGUMENT" => $argument,
        "__VIEWSTATE"     => $state,
    });
}

# POST to the planning search page and parse the response into an
# HTML::TreeBuilder tree. Dies with the HTTP status line on any error.
sub do_post
{
    my $http_response = $UA->post($SearchURL, @_);

    die $http_response->status_line
        unless $http_response->is_success;

    return HTML::TreeBuilder->new_from_content($http_response->content);
}

+ 364
- 0
python_scrapers/AcolnetParser.py View File

@@ -0,0 +1,364 @@
#!/usr/local/bin/python

import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup

import urlparse

import re

# Matches the end of the <head> element; used to chop off the document head
# (whose javascript upsets HTMLParser) before feeding HTML to the parser.
end_head_regex = re.compile("</head", re.IGNORECASE)

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

from datetime import date
from time import strptime


# Date format the Acolnet search forms expect (and the result pages display).
date_format = "%d/%m/%Y"
# NOTE(review): appears unused at module level - getResultsByDayMonthYear
# builds its own local our_date; presumably left over from testing.
our_date = date(2007,4,25)


class AcolnetParser(HTMLParser.HTMLParser):
    """Scraper base class for Acolnet-based council planning-search sites.

    Subclasses configure, per site, which row of each results table holds
    which field (case_number_tr etc.), a comments email address, and an
    action_regex that extracts the search form's action URL.
    Results accumulate in a PlanningAuthorityResults instance.
    """
    # 1-based row number within each results-table for each field;
    # set by subclasses.
    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Store the authority identity and search URL and reset parser state."""

        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # Counts <tr> starts within the current results-table.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        # True while inside a <td> of a results-table row.
        self._in_td = False

        # This in where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        """Track entry into results-tables, rows, cells and info links."""
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1

        elif self._current_application is not None:
            # Only count rows belonging directly to the results-table,
            # not rows of any nested subtable.
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value
    def handle_data(self, data):
        """Route cell text into the current application's fields by row number."""
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])

                #print received_date

                self._current_application.date_received = received_date

            elif self._tr_number == self.location_tr:
                location = data.strip()

                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()


    def handle_endtag(self, tag):
        """On leaving a results-table, commit the finished application."""
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Search one registration date and return the populated
        PlanningAuthorityResults."""
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...

        # NOTE(review): action_regex is defined only on subclasses; the
        # base class cannot run a search on its own.
        groups = self.action_regex.search(search_form_contents).groups()

        action = groups[0]
        #print action

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        # Search a single day by using it as both ends of the date range.
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }
        # The form wants multipart/form-data, hence MultipartPostHandler.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results



    def getResults(self, day, month, year):
        """Convenience wrapper: search one date and return the XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class BaberghParser(AcolnetParser):
    """Site configuration for Babergh District Council's Acolnet search.

    Search URL (for reference):
    http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    # (It would be nice to scrape this address rather than hard-code it.)
    comments_email_address = "planning.reception@babergh.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

class BasingstokeParser(AcolnetParser):
    """Site configuration for Basingstoke and Deane Borough Council.

    Search URL (for reference):
    http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    # No online comment facility; comments go by email.
    # (It would be nice to scrape this address rather than hard-code it.)
    comments_email_address = "development.control@basingstoke.gov.uk"

    # Matches the search <form> tag (the doubled quotes around the
    # onSubmit value are exactly what the site emits) and captures its
    # action URL.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)"'
        r' enctype="multipart/form-data">')

class BassetlawParser(AcolnetParser):
    """Site configuration for Bassetlaw District Council.

    Search URL (for reference):
    http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility; comments go by email.
    comments_email_address = "planning@bassetlaw.gov.uk"

    # Matches the search <FORM> tag, case-insensitively, and captures
    # its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">',
        re.IGNORECASE)

    def _cleanupHTML(self, html):
        """Strip out every div tag: one of them on this site is broken,
        and we don't need any of them anyway."""
        div_tag = re.compile(r'</?div[^>]*>', re.IGNORECASE)
        return div_tag.sub('', html)


class BridgenorthParser(AcolnetParser):
    """Site configuration for Bridgnorth District Council.

    Search URL (for reference):
    http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

class BuryParser(AcolnetParser):
    """Site configuration for Bury Metropolitan Borough Council.

    Search URL (for reference):
    http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "development.control@bury.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

## class CanterburyParser(AcolnetParser):
## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

## case_number_tr = 1 # this one can be got by the td class attribute
## reg_date_tr = 2
## location_tr = 4
## proposal_tr = 5

## authority_name = "Canterbury City Council"
## authority_short_name = "Canterbury"

## comments_email_address = ""
## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class CarlisleParser(AcolnetParser):
    """Site configuration for Carlisle City Council.

    Search URL (for reference):
    http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility; comments go by email.
    comments_email_address = "dc@carlisle.gov.uk"

    # Matches the search <form> tag (doubled quotes around the onSubmit
    # value are exactly what the site emits) and captures its action URL.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)"'
        r' enctype="multipart/form-data">')


class DerbyParser(AcolnetParser):
    """Site configuration for Derby City Council.

    Search URL (for reference):
    http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "developmentcontrol@derby.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')


if __name__ == '__main__':
    # Ad-hoc test harness: fetch one day's applications from one site
    # and print the resulting XML. Swap the commented lines to try a
    # different council.
    day = 15
    month = 3
    year = 2007

    # working
    # parser = BasingstokeParser()
    parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # works with the divs stripped out
    #parser = BassetlawParser()

    # returns error 400 - bad request
    #parser = BridgenorthParser()

    # working
    #parser = BuryParser()

    # cambridgeshire is a bit different...
    # no advanced search page

    # canterbury
    # results as columns of one table

    # returns error 400 - bad request
    #parser = CarlisleParser()

    # working
    #parser = DerbyParser()
    print parser.getResults(day, month, year)

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save