| @@ -0,0 +1,364 @@ | |||||
| #!/usr/local/bin/python | |||||
| import urllib, urllib2 | |||||
| import HTMLParser | |||||
| #from BeautifulSoup import BeautifulSoup | |||||
| import urlparse | |||||
| import re | |||||
| end_head_regex = re.compile("</head", re.IGNORECASE) | |||||
| import MultipartPostHandler | |||||
| # this is not mine, or part of standard python (though it should be!) | |||||
| # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py | |||||
| from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication | |||||
| from datetime import date | |||||
| from time import strptime | |||||
| date_format = "%d/%m/%Y" | |||||
| our_date = date(2007,4,25) | |||||
class AcolnetParser(HTMLParser.HTMLParser):
    """Base screen-scraper for Acolnet-style council planning-search sites.

    Subclasses configure it by setting the ``*_tr`` class attributes (which
    1-based row of each "results-table" holds which datum), a
    ``comments_email_address``, and an ``action_regex`` used to pull the
    search form's action URL out of the search page.
    """
    # Row numbers (1-based, counted only at subtable depth 0) within each
    # results-table; set by subclasses.
    case_number_tr = None  # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # Counts <tr>s seen so far in the current results-table.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        # True while we are inside a <td> of a results-table.
        self._in_td = False

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1
        elif self._current_application is not None:
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        # NOTE(review): the href is stored as-is; it looks
                        # like a relative URL and should presumably be
                        # urljoined with base_url — TODO confirm.
                        self._current_application.info_url = value

    def handle_data(self, data):
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])
                #print received_date
                self._current_application.date_received = received_date
            elif self._tr_number == self.location_tr:
                location = data.strip()
                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()

    def handle_endtag(self, tag):
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the applications registered on the given date and return a
        PlanningAuthorityResults object."""
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action
        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        # Search the same date in both the from and to fields.
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }

        # The search form must be posted as multipart/form-data.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]
        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: accepts string or int date parts and returns
        the results serialised as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BaberghParser(AcolnetParser):
    """Acolnet parser configured for Babergh District Council.

    Search page: http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility on this site, so we hand out an email
    # address instead.  It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"

    # Extracts the search form's action URL from the search page.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')
class BasingstokeParser(AcolnetParser):
    """Acolnet parser configured for Basingstoke and Deane Borough Council.

    Search page: http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    # No online comment facility on this site, so we hand out an email
    # address instead.  It would be nice to scrape this...
    comments_email_address = "development.control@basingstoke.gov.uk"

    # Extracts the search form's action URL from the search page.  The
    # doubled quotes around the onSubmit value match the site's own HTML.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)" enctype="multipart/form-data">')
class BassetlawParser(AcolnetParser):
    """Acolnet parser configured for Bassetlaw District Council.

    Search page: http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility on this site, so we hand out an email
    # address instead.
    comments_email_address = "planning@bassetlaw.gov.uk"

    # Extracts the search form's action URL from the search page.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">',
        re.IGNORECASE)

    # Compiled once at class-creation time instead of on every
    # _cleanupHTML() call (the previous code recompiled it per page).
    _div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        return self._div_regex.sub('', html)
class BridgenorthParser(AcolnetParser):
    """Acolnet parser configured for Bridgenorth District Council.

    Search page: http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility on this site, so we hand out an email
    # address instead.
    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    # Extracts the search form's action URL from the search page.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')
class BuryParser(AcolnetParser):
    """Acolnet parser configured for Bury Metropolitan Borough Council.

    Search page: http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility on this site, so we hand out an email
    # address instead.
    comments_email_address = "development.control@bury.gov.uk"

    # Extracts the search form's action URL from the search page.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')
| ## class CanterburyParser(AcolnetParser): | |||||
| ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| ## case_number_tr = 1 # this one can be got by the td class attribute | |||||
| ## reg_date_tr = 2 | |||||
| ## location_tr = 4 | |||||
| ## proposal_tr = 5 | |||||
| ## authority_name = "Canterbury City Council" | |||||
| ## authority_short_name = "Canterbury" | |||||
| ## comments_email_address = "" | |||||
| ## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">") | |||||
class CarlisleParser(AcolnetParser):
    """Acolnet parser configured for Carlisle City Council.

    Search page: http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility on this site, so we hand out an email
    # address instead.
    comments_email_address = "dc@carlisle.gov.uk"

    # Extracts the search form's action URL from the search page.  The
    # doubled quotes around the onSubmit value match the site's own HTML.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)" enctype="multipart/form-data">')
class DerbyParser(AcolnetParser):
    """Acolnet parser configured for Derby City Council.

    Search page: http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which <tr> of each results-table holds which field (the case-number
    # row could also be located via its td class attribute).
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    # No online comment facility on this site, so we hand out an email
    # address instead.
    comments_email_address = "developmentcontrol@derby.gov.uk"

    # Extracts the search form's action URL from the search page.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')
if __name__ == '__main__':
    # Ad-hoc smoke test: fetch one day's applications from a single
    # council and print the resulting XML.  Uncomment a different parser
    # line to try another site; the status comments record how each site
    # behaved when last tried.
    day = 15
    month = 3
    year = 2007

    # working
    # parser = BasingstokeParser()
    parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # works with the divs stripped out
    #parser = BassetlawParser()

    # returns error 400 - bad request
    #parser = BridgenorthParser()

    # working
    #parser = BuryParser()

    # cambridgeshire is a bit different...
    # no advanced search page

    # canterbury
    # results as columns of one table

    # returns error 400 - bad request
    #parser = CarlisleParser()

    # working
    #parser = DerbyParser()

    print parser.getResults(day, month, year)
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Babergh District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
| form = cgi.FieldStorage() | |||||
| day = form.getfirst('day') | |||||
| month = form.getfirst('month') | |||||
| year = form.getfirst('year') | |||||
| authority_name = "Babergh District Council" | |||||
| authority_short_name = "Babergh" | |||||
| base_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| import AcolnetParser | |||||
| parser = AcolnetParser.BaberghParser(authority_name, authority_short_name, base_url) | |||||
| xml = parser.getResults(day, month, year) | |||||
| print "Content-Type: text/xml" # XML is following | |||||
| print xml # print the xml | |||||
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Basingstoke and Deane Borough Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
| form = cgi.FieldStorage() | |||||
| day = form.getfirst('day') | |||||
| month = form.getfirst('month') | |||||
| year = form.getfirst('year') | |||||
| authority_name = "Basingstoke and Deane Borough Council" | |||||
| authority_short_name = "Basingstoke and Deane" | |||||
| base_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| import AcolnetParser | |||||
| parser = AcolnetParser.BasingstokeParser(authority_name, authority_short_name, base_url) | |||||
| xml = parser.getResults(day, month, year) | |||||
| print "Content-Type: text/xml" # XML is following | |||||
| print xml # print the xml | |||||
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Bassetlaw District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
| form = cgi.FieldStorage() | |||||
| day = form.getfirst('day') | |||||
| month = form.getfirst('month') | |||||
| year = form.getfirst('year') | |||||
| authority_name = "Bassetlaw District Council" | |||||
| authority_short_name = "Bassetlaw" | |||||
| base_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| import AcolnetParser | |||||
| parser = AcolnetParser.BassetlawParser(authority_name, authority_short_name, base_url) | |||||
| xml = parser.getResults(day, month, year) | |||||
| print "Content-Type: text/xml" # XML is following | |||||
| print xml # print the xml | |||||
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Bury Metropolitan Borough Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
| form = cgi.FieldStorage() | |||||
| day = form.getfirst('day') | |||||
| month = form.getfirst('month') | |||||
| year = form.getfirst('year') | |||||
| authority_name = "Bury Metropolitan Borough Council" | |||||
| authority_short_name = "Bury" | |||||
| base_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" | |||||
| import AcolnetParser | |||||
| parser = AcolnetParser.BuryParser(authority_name, authority_short_name, base_url) | |||||
| xml = parser.getResults(day, month, year) | |||||
| print "Content-Type: text/xml" # XML is following | |||||
| print xml # print the xml | |||||
| @@ -1,108 +0,0 @@ | |||||
#!/usr/bin/perl
#
# Screen scraper for the Dacorum Borough Council planning search (an
# ASP.NET postback form).  CGI parameters: day, month, year.  Emits the
# applications registered on that date as XML.

use strict;
use warnings;
use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Dacorum planning search
our $SearchURL = "http://www.dacorum.gov.uk/default.aspx?page=1495";
our $InfoURL = "http://www.dacorum.gov.uk/Default.aspx?page=1497&ID=";
our $CommentURL = "http://www.dacorum.gov.uk/Default.aspx?page=2847&ID=";

# We're a CGI script...
my $query = CGI->new();

# Construct an LWP user agent.  Cookies and POST redirects are needed
# for the ASP.NET session to work.
our $UA = LWP::UserAgent->new(env_proxy => 1,
                              cookie_jar => {},
                              requests_redirectable => [ 'GET', 'HEAD', 'POST' ]);

# Post the URL to get an initial blank form, then pull its __VIEWSTATE
# token so we can post it back with the search.
my $state = get_state(do_post());

# Do the search: the same date goes in both the from and to fields.
my $page = do_post({"__VIEWSTATE" => $state,
                    "Template:_ctl10:_ctl0:btnSearch" => "Search",
                    "Template:_ctl10:_ctl0:tbRegistrationFromDay" => $query->param("day"),
                    "Template:_ctl10:_ctl0:tbRegistrationFromMon" => $query->param("month"),
                    "Template:_ctl10:_ctl0:tbRegistrationFromYear" => $query->param("year"),
                    "Template:_ctl10:_ctl0:tbRegistrationToDay" => $query->param("day"),
                    "Template:_ctl10:_ctl0:tbRegistrationToMon" => $query->param("month"),
                    "Template:_ctl10:_ctl0:tbRegistrationToYear" => $query->param("year")});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Dacorum Borough Council");
$Writer->dataElement("authority_short_name", "Dacorum");
$Writer->startTag("applications");

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
    my @cells = $row->look_down("_tag" => "td");

    # Data rows carry one of these two classes; anything else (e.g. the
    # header row) is skipped.
    if ($cells[0]->attr("class") eq "FormGridDataItem" ||
        $cells[0]->attr("class") eq "FormGridAlternatingDataItem")
    {
        my $reference = $cells[0]->as_trimmed_text;
        my $address = $cells[1]->as_trimmed_text;
        my $description = $cells[2]->as_trimmed_text;
        my $date = $cells[3]->as_trimmed_text;

        # Pull a UK-style postcode off the end of the address, if present.
        my $postcode;
        if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
        {
            $postcode = $1;
        }

        $Writer->startTag("application");
        $Writer->dataElement("council_reference", $reference);
        $Writer->dataElement("address", $address);
        $Writer->dataElement("postcode", $postcode);
        $Writer->dataElement("description", $description);
        $Writer->dataElement("info_url", $InfoURL . $reference);
        $Writer->dataElement("comment_url", $CommentURL . $reference);
        $Writer->dataElement("date_received", $date);
        $Writer->endTag("application");
    }
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the state from a page so we can repost it
sub get_state
{
    my $page = shift;
    my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");
    return $viewstate->attr("value");
}

# Post to the planning search page; any arguments are passed through to
# LWP::UserAgent::post as the form content.  Dies on HTTP failure.
sub do_post
{
    my $response = $UA->post($SearchURL, @_);
    die $response->status_line unless $response->is_success;
    return HTML::TreeBuilder->new_from_content($response->content);
}
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Derby City Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
| form = cgi.FieldStorage() | |||||
| day = form.getfirst('day') | |||||
| month = form.getfirst('month') | |||||
| year = form.getfirst('year') | |||||
| authority_name = "Derby City Council" | |||||
| authority_short_name = "Derby" | |||||
| base_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| import AcolnetParser | |||||
| parser = AcolnetParser.DerbyParser(authority_name, authority_short_name, base_url) | |||||
| xml = parser.getResults(day, month, year) | |||||
| print "Content-Type: text/xml" # XML is following | |||||
| print xml # print the xml | |||||
| @@ -1,122 +0,0 @@ | |||||
#!/usr/bin/perl
#
# Screen scraper for the East Herts Council SwiftLG planning search.
# CGI parameters: day, month, year.  Emits the applications registered
# on that date as XML, following any additional results pages.

use strict;
use warnings;
use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the East Herts planning search
our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";
our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the site expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: the same date goes in both the from and to fields.
my $page = do_post($SearchURL,
                   {"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
                    "REGTODATE.MAINBODY.WPACIS.1." => $date,
                    "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "East Herts Council");
$Writer->dataElement("authority_short_name", "East Herts");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
# (URI is pulled in by LWP; used here to absolutise the page links.)
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
    # Fetch this page...
    $page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

    # ...and output the applications from it
    output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request; dies on HTTP failure.
sub do_get
{
    my $response = $UA->get(@_);
    die $response->status_line unless $response->is_success;
    return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request; dies on HTTP failure.
sub do_post
{
    my $response = $UA->post(@_);
    die $response->status_line unless $response->is_success;
    return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page
sub output_applications
{
    my $page = shift;

    # Find the result table (identified by its cell spacing/padding,
    # as the site gives it no class or id).
    my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2");

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr"))
    {
        my @cells = $row->look_down("_tag" => "td");

        if (@cells >= 3)
        {
            my $reference = $cells[0]->as_trimmed_text;
            my $description = $cells[1]->as_trimmed_text;
            my $address = $cells[2]->as_trimmed_text;

            # Pull a UK-style postcode off the end of the address, if present.
            my $postcode;
            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
            {
                $postcode = $1;
            }

            $Writer->startTag("application");
            $Writer->dataElement("council_reference", $reference);
            $Writer->dataElement("address", $address);
            $Writer->dataElement("postcode", $postcode);
            $Writer->dataElement("description", $description);
            $Writer->dataElement("info_url", $InfoURL . $reference);
            $Writer->dataElement("comment_url", $CommentURL . $reference);
            $Writer->dataElement("date_received", $date);
            $Writer->endTag("application");
        }
    }

    return;
}
| @@ -1,121 +0,0 @@ | |||||
#!/usr/bin/perl
#
# Screen scraper for the Enfield Council SwiftLG planning search.
# CGI parameters: day, month, year.  Emits the applications registered
# on that date as XML, following any additional results pages.

use strict;
use warnings;
use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Enfield planning search
our $SearchURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the site expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: the same date goes in both the from and to fields.
my $page = do_post($SearchURL,
                   {"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
                    "REGTODATE.MAINBODY.WPACIS.1." => $date,
                    "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Enfield Council");
$Writer->dataElement("authority_short_name", "Enfield");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
# (URI is pulled in by LWP; used here to absolutise the page links.)
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
    # Fetch this page...
    $page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

    # ...and output the applications from it
    output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request; dies on HTTP failure.
sub do_get
{
    my $response = $UA->get(@_);
    die $response->status_line unless $response->is_success;
    return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request; dies on HTTP failure.
sub do_post
{
    my $response = $UA->post(@_);
    die $response->status_line unless $response->is_success;
    return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page
sub output_applications
{
    my $page = shift;

    # Find the result table
    my $table = $page->look_down("_tag" => "table", "class" => "apas_tbl");

    # Process each row of the results
    foreach my $row ($table->look_down("_tag" => "tr"))
    {
        my @cells = $row->look_down("_tag" => "td");

        if (@cells >= 3)
        {
            my $reference = $cells[0]->as_trimmed_text;
            my $description = $cells[1]->as_trimmed_text;
            my $address = $cells[2]->as_trimmed_text;

            # Pull a UK-style postcode off the end of the address, if present.
            my $postcode;
            if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
            {
                $postcode = $1;
            }

            # Note: no comment_url element here — Enfield has no online
            # commenting URL configured.
            $Writer->startTag("application");
            $Writer->dataElement("council_reference", $reference);
            $Writer->dataElement("address", $address);
            $Writer->dataElement("postcode", $postcode);
            $Writer->dataElement("description", $description);
            $Writer->dataElement("info_url", $InfoURL . $reference);
            $Writer->dataElement("date_received", $date);
            $Writer->endTag("application");
        }
    }

    return;
}
| @@ -0,0 +1,133 @@ | |||||
| #### | |||||
| # 02/2006 Will Holcomb <wholcomb@gmail.com> | |||||
| # | |||||
| # This library is free software; you can redistribute it and/or | |||||
| # modify it under the terms of the GNU Lesser General Public | |||||
| # License as published by the Free Software Foundation; either | |||||
| # version 2.1 of the License, or (at your option) any later version. | |||||
| # | |||||
| # This library is distributed in the hope that it will be useful, | |||||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| # Lesser General Public License for more details. | |||||
| # | |||||
| # I have edited out a bit in the middle of this which reverts to a normal | |||||
| # post with "application/x-www-form-urlencoded" content-type when there are | |||||
| # no files. | |||||
| # Duncan 5/5/2007 | |||||
| """ | |||||
| Usage: | |||||
| Enables the use of multipart/form-data for posting forms | |||||
| Inspirations: | |||||
| Upload files in python: | |||||
| http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 | |||||
| urllib2_file: | |||||
| Fabien Seisen: <fabien@seisen.org> | |||||
| Example: | |||||
| import MultipartPostHandler, urllib2, cookielib | |||||
| cookies = cookielib.CookieJar() | |||||
| opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies), | |||||
| MultipartPostHandler.MultipartPostHandler) | |||||
| params = { "username" : "bob", "password" : "riviera", | |||||
| "file" : open("filename", "rb") } | |||||
| opener.open("http://wwww.bobsite.com/upload/", params) | |||||
| Further Example: | |||||
| The main function of this file is a sample which downloads a page and | |||||
| then uploads it to the W3C validator. | |||||
| """ | |||||
| import urllib | |||||
| import urllib2 | |||||
| import mimetools, mimetypes | |||||
| import os, stat | |||||
class Callable:
    """Pre-Python-2.2 staticmethod workaround: wraps a plain function so it
    can be stored as a class attribute and invoked through __call__."""
    def __init__(self, anycallable):
        # Bound per-instance; calling instance.__call__(...) invokes the wrapped function.
        self.__call__ = anycallable

# Controls how sequences are uncoded. If true, elements may be given multiple values by
# assigning a sequence.
doseq = 1
| class MultipartPostHandler(urllib2.BaseHandler): | |||||
| handler_order = urllib2.HTTPHandler.handler_order - 10 # needs to run first | |||||
| def http_request(self, request): | |||||
| data = request.get_data() | |||||
| if data is not None and type(data) != str: | |||||
| v_files = [] | |||||
| v_vars = [] | |||||
| try: | |||||
| for(key, value) in data.items(): | |||||
| if type(value) == file: | |||||
| v_files.append((key, value)) | |||||
| else: | |||||
| v_vars.append((key, value)) | |||||
| except TypeError: | |||||
| systype, value, traceback = sys.exc_info() | |||||
| raise TypeError, "not a valid non-string sequence or mapping object", traceback | |||||
| boundary, data = self.multipart_encode(v_vars, v_files) | |||||
| contenttype = 'multipart/form-data; boundary=%s' % boundary | |||||
| if(request.has_header('Content-Type') | |||||
| and request.get_header('Content-Type').find('multipart/form-data') != 0): | |||||
| print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data') | |||||
| request.add_unredirected_header('Content-Type', contenttype) | |||||
| request.add_data(data) | |||||
| return request | |||||
| def multipart_encode(vars, files, boundary = None, buffer = None): | |||||
| if boundary is None: | |||||
| boundary = mimetools.choose_boundary() | |||||
| if buffer is None: | |||||
| buffer = '' | |||||
| for(key, value) in vars: | |||||
| buffer += '--%s\r\n' % boundary | |||||
| buffer += 'Content-Disposition: form-data; name="%s"' % key | |||||
| buffer += '\r\n\r\n' + value + '\r\n' | |||||
| for(key, fd) in files: | |||||
| file_size = os.fstat(fd.fileno())[stat.ST_SIZE] | |||||
| filename = fd.name.split('/')[-1] | |||||
| contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' | |||||
| buffer += '--%s\r\n' % boundary | |||||
| buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename) | |||||
| buffer += 'Content-Type: %s\r\n' % contenttype | |||||
| # buffer += 'Content-Length: %s\r\n' % file_size | |||||
| fd.seek(0) | |||||
| buffer += '\r\n' + fd.read() + '\r\n' | |||||
| buffer += '--%s--\r\n\r\n' % boundary | |||||
| return boundary, buffer | |||||
| multipart_encode = Callable(multipart_encode) | |||||
| https_request = http_request | |||||
| ## def main(): | |||||
| ## import tempfile, sys | |||||
| ## validatorURL = "http://validator.w3.org/check" | |||||
| ## opener = urllib2.build_opener(MultipartPostHandler) | |||||
| ## def validateFile(url): | |||||
| ## temp = tempfile.mkstemp(suffix=".html") | |||||
| ## os.write(temp[0], opener.open(url).read()) | |||||
| ## params = { "ss" : "0", # show source | |||||
| ## "doctype" : "Inline", | |||||
| ## "uploaded_file" : open(temp[1], "rb") } | |||||
| ## print opener.open(validatorURL, params).read() | |||||
| ## os.remove(temp[1]) | |||||
| ## if len(sys.argv[1:]) > 0: | |||||
| ## for arg in sys.argv[1:]: | |||||
| ## validateFile(arg) | |||||
| ## else: | |||||
| ## validateFile("http://www.google.com") | |||||
| ## if __name__=="__main__": | |||||
| ## main() | |||||
| @@ -1,163 +0,0 @@ | |||||
#!/usr/bin/perl

# CGI scraper for Borough of Broxbourne planning applications.  Reads
# day/month/year CGI parameters, drives the ASP.NET search form at
# $SearchURL, and writes the matching applications to stdout as XML.

use strict;
use warnings;

use CGI qw(:cgi);
use DateTime;
#use DateTime::Format::DateParse;
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URL for the Broxbourne planning search
our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx";

# We're a CGI script...
my $query = CGI->new();

# Get the date as an offset from 2000-01-01
# (the form's calendar control posts dates as a day count from that epoch)
my $epoch = DateTime->new(year => 2000, month => 1, day => 1);
my $querydate = DateTime->new(year => $query->param("year"),
                              month => $query->param("month"),
                              day => $query->param("day"));
$querydate = $querydate->delta_days($epoch)->delta_days;

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Post each date in turn to build up the state - you can thank
# Microsoft and ASP.NET for the horrible way we have to do this
# by posting each argument in turn to build up the state
$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate));
$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate));

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Borough of Broxbourne");
$Writer->dataElement("authority_short_name", "Broxbourne");
$Writer->startTag("applications");

# Get the arguments for the search...
my $args = {
    "Srch" => "rb1",
    "__VIEWSTATE" => $state,
    "btnSearch" => "Search",
    "tbReference" => "",
    "tbRef2" => ""
};

# ...and then (at last) we can do the search!
my $page = do_post($args);

# Loop processing pages of results
while ($page)
{
    # The results grid, when present, is the table with id "DataGrid1"
    my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1");

    # Remember the state
    $state = get_state($page);

    # Clear the page for now - this will be reinitialised if we
    # find another page of results to make us go round the loop
    # all over again
    undef $page;

    # Check that we found a table - searches that find no results
    # produce a page with no table in it
    if ($table)
    {
        # Process each row of the results
        foreach my $row ($table->look_down("_tag" => "tr"))
        {
            my @cells = $row->look_down("_tag" => "td");

            # Data rows start with a checkbox input; pager/header rows do not
            if ($cells[0]->look_down("_tag" => "input"))
            {
                my $reference = $cells[1]->as_trimmed_text;
                my $date = $cells[2]->as_trimmed_text;
                my $address = $cells[3]->as_trimmed_text;
                my $description = $cells[4]->as_trimmed_text;

                # Pull a trailing UK-style postcode out of the address, if any
                my $postcode;
                if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
                {
                    $postcode = $1;
                }

                $Writer->startTag("application");
                $Writer->dataElement("council_reference", $reference);
                $Writer->dataElement("address", $address);
                $Writer->dataElement("postcode", $postcode);
                $Writer->dataElement("description", $description);
                $Writer->dataElement("date_received", $date);
                $Writer->endTag("application");
            }
            elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5")
            {
                # A row spanning all 5 columns is the pager: follow the ">"
                # (next page) link by faking its javascript postback, which
                # re-enters the while loop with a fresh page
                foreach my $link ($cells[0]->look_down("_tag" => "a"))
                {
                    if ($link->as_trimmed_text eq ">" &&
                        $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/)
                    {
                        $page = do_post_back($state, $1, $2);
                    }
                }
            }
        }
    }
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;
# Pull the ASP.NET __VIEWSTATE value out of a parsed page so that it can
# be posted back with the next request.
sub get_state
{
    my ($page) = @_;

    my $state_input = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");
    return $state_input->attr("value");
}
# Emulate the page's javascript __doPostBack function: post the event
# target/argument pair together with the saved viewstate.
sub do_post_back
{
    my ($state, $target, $argument) = @_;

    # The posted event target uses ':' where the javascript caller had '$'
    $target =~ s/\$/:/g;

    return do_post({
        "__EVENTTARGET" => $target,
        "__EVENTARGUMENT" => $argument,
        "__VIEWSTATE" => $state
    });
}
# POST to the planning search page and return the response body parsed
# into an HTML::TreeBuilder tree; dies with the status line on HTTP error.
sub do_post
{
    my $resp = $UA->post($SearchURL, @_);
    if (!$resp->is_success)
    {
        die $resp->status_line;
    }
    return HTML::TreeBuilder->new_from_content($resp->content);
}
| @@ -0,0 +1,364 @@ | |||||
| #!/usr/local/bin/python | |||||
| import urllib, urllib2 | |||||
| import HTMLParser | |||||
| #from BeautifulSoup import BeautifulSoup | |||||
| import urlparse | |||||
| import re | |||||
| end_head_regex = re.compile("</head", re.IGNORECASE) | |||||
| import MultipartPostHandler | |||||
| # this is not mine, or part of standard python (though it should be!) | |||||
| # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py | |||||
| from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication | |||||
| from datetime import date | |||||
| from time import strptime | |||||
# Date format used by the Acolnet search forms, e.g. "25/04/2007".
date_format = "%d/%m/%Y"
# Sample date left over from development; not referenced by the classes below.
our_date = date(2007,4,25)
class AcolnetParser(HTMLParser.HTMLParser):
    """Base scraper for Acolnet-based council planning search sites.

    Each application in the results page sits in its own table with class
    "results-table"; subclasses say which row of that table carries each
    field (the *_tr class attributes), supply a comments email address,
    and an action_regex that extracts the search form's POST action URL.
    """
    # 1-based row numbers, within each results-table, of the rows holding
    # each field; overridden per-authority in the subclasses below.
    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        HTMLParser.HTMLParser.__init__(self)
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # Current <tr> number within the results-table being parsed.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        # True while inside a <td> of a results-table row.
        self._in_td = False

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        # No online comment form, so point commenters at email.
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1
        elif self._current_application is not None:
            # Only count rows of the outer results-table, not of any subtable.
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value

    def handle_data(self, data):
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])
                #print received_date
                self._current_application.date_received = received_date
            elif self._tr_number == self.location_tr:
                location = data.strip()
                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()

    def handle_endtag(self, tag):
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                # Closing a nested table inside the results-table.
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    # Reset state ready for the next results-table.
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications registered on the given date.

        Posts the search form with regdate1 == regdate2 == the date and
        returns the accumulated PlanningAuthorityResults.
        """
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        our_date = date(year, month, day)

        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }

        # The form must be posted as multipart/form-data - see MultipartPostHandler.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)

        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: coerce string arguments to int and return XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class BaberghParser(AcolnetParser):
    """Babergh District Council (Acolnet site): per-authority row layout."""
    #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Babergh District Council"
    #authority_short_name = "Babergh"

    # It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
class BasingstokeParser(AcolnetParser):
    """Basingstoke and Deane Borough Council (Acolnet site): row layout."""
    #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    #authority_name = "Basingstoke and Deane Borough Council"
    #authority_short_name = "Basingstoke and Deane"

    # It would be nice to scrape this...
    comments_email_address = "development.control@basingstoke.gov.uk"

    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
class BassetlawParser(AcolnetParser):
    """Bassetlaw District Council (Acolnet site): row layout plus div stripping."""
    #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Bassetlaw District Council"
    #authority_short_name = "Bassetlaw"

    comments_email_address = "planning@bassetlaw.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
        return div_regex.sub('', html)
class BridgenorthParser(AcolnetParser):
    """Bridgenorth District Council (Acolnet site): per-authority row layout."""
    #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bridgenorth District Council"
    #authority_short_name = "Bridgenorth"

    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
class BuryParser(AcolnetParser):
    """Bury Metropolitan Borough Council (Acolnet site): row layout."""
    #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bury Metropolitan Borough Council"
    #authority_short_name = "Bury"

    comments_email_address = "development.control@bury.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
| ## class CanterburyParser(AcolnetParser): | |||||
| ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" | |||||
| ## case_number_tr = 1 # this one can be got by the td class attribute | |||||
| ## reg_date_tr = 2 | |||||
| ## location_tr = 4 | |||||
| ## proposal_tr = 5 | |||||
| ## authority_name = "Canterbury City Council" | |||||
| ## authority_short_name = "Canterbury" | |||||
| ## comments_email_address = "" | |||||
| ## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">") | |||||
class CarlisleParser(AcolnetParser):
    """Carlisle City Council (Acolnet site): per-authority row layout."""
    #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Carlisle City Council"
    #authority_short_name = "Carlisle"

    comments_email_address = "dc@carlisle.gov.uk"

    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
class DerbyParser(AcolnetParser):
    """Derby City Council (Acolnet site): per-authority row layout."""
    #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Derby City Council"
    #authority_short_name = "Derby"

    comments_email_address = "developmentcontrol@derby.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
# Ad-hoc smoke test: fetch one day's results for a chosen authority and
# print the XML.  The commented-out lines record which parsers were
# working at the time of writing.
if __name__ == '__main__':
    day = 15
    month = 3
    year = 2007

    # working
    # parser = BasingstokeParser()
    parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # works with the divs stripped out
    #parser = BassetlawParser()

    # returns error 400 - bad request
    #parser = BridgenorthParser()

    # working
    #parser = BuryParser()

    # cambridgeshire is a bit different...
    # no advanced search page

    # canterbury
    # results as columns of one table

    # returns error 400 - bad request
    #parser = CarlisleParser()

    # working
    #parser = DerbyParser()

    print parser.getResults(day, month, year)