diff --git a/CGI/Angus.cgi b/CGI/Angus.cgi new file mode 100755 index 0000000..0e9d3c0 --- /dev/null +++ b/CGI/Angus.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Angus Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Angus Council" +authority_short_name = "Angus" +base_url = "http://planning.angus.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Basildon.cgi b/CGI/Basildon.cgi new file mode 100755 index 0000000..0443765 --- /dev/null +++ b/CGI/Basildon.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Basildon District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Basildon District Council" +authority_short_name = "Basildon" +base_url = "http://planning.basildon.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Bath.cgi b/CGI/Bath.cgi new file mode 100755 index 0000000..ab482af --- /dev/null +++ b/CGI/Bath.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Bath and North East Somerset. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bath and North East Somerset" +authority_short_name = "Bath" +base_url = "http://planning.bathnes.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Bexley.cgi b/CGI/Bexley.cgi new file mode 100755 index 0000000..6524777 --- /dev/null +++ b/CGI/Bexley.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Bexley Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bexley Council" +authority_short_name = "Bexley" +base_url = "http://publicaccess.bexley.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Blaby.cgi b/CGI/Blaby.cgi new file mode 100755 index 0000000..8c3aba4 --- /dev/null +++ b/CGI/Blaby.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Blaby District Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Blaby District Council" +authority_short_name = "Blaby" +base_url = "http://www.blaby.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Bristol.cgi b/CGI/Bristol.cgi new file mode 100755 index 0000000..fae3e76 --- /dev/null +++ b/CGI/Bristol.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Bristol City Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bristol City Council" +authority_short_name = "Bristol" +base_url = "http://e2eweb.bristol-city.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Buckinghamshire.cgi b/CGI/Buckinghamshire.cgi new file mode 100755 index 0000000..9c431c0 --- /dev/null +++ b/CGI/Buckinghamshire.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Buckinghamshire County Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Buckinghamshire County Council" +authority_short_name = "Buckinghamshire" +base_url = "http://www.bucksplanning.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Cherwell.cgi b/CGI/Cherwell.cgi new file mode 100755 index 0000000..116d63d --- /dev/null +++ b/CGI/Cherwell.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Cherwell District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Cherwell District Council" +authority_short_name = "Cherwell" +base_url = "http://cherweb.cherwell-dc.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/East Devon.cgi b/CGI/East Devon.cgi new file mode 100755 index 0000000..581d1e8 --- /dev/null +++ b/CGI/East Devon.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for East Devon District Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "East Devon District Council" +authority_short_name = "East Devon" +base_url = "http://planning.eastdevon.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Hart.cgi b/CGI/Hart.cgi new file mode 100755 index 0000000..0c744eb --- /dev/null +++ b/CGI/Hart.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Hart District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Hart District Council" +authority_short_name = "Hart" +base_url = "http://publicaccess.hart.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Huntingdonshire.cgi b/CGI/Huntingdonshire.cgi new file mode 100755 index 0000000..40035a8 --- /dev/null +++ b/CGI/Huntingdonshire.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Huntingdonshire District Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Huntingdonshire District Council" +authority_short_name = "Huntingdonshire" +base_url = "http://planning.huntsdc.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Lancaster.cgi b/CGI/Lancaster.cgi new file mode 100755 index 0000000..95e3357 --- /dev/null +++ b/CGI/Lancaster.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Lancaster City Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Lancaster City Council" +authority_short_name = "Lancaster" +base_url = "http://planapps.lancaster.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Luton.cgi b/CGI/Luton.cgi new file mode 100755 index 0000000..894e99e --- /dev/null +++ b/CGI/Luton.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Luton Borough Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Luton Borough Council" +authority_short_name = "Luton" +base_url = "http://www.eplan.luton.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Mid Devon.cgi b/CGI/Mid Devon.cgi new file mode 100755 index 0000000..a49bbb4 --- /dev/null +++ b/CGI/Mid Devon.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Mid Devon District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Mid Devon District Council" +authority_short_name = "Mid Devon" +base_url = "http://planning.middevon.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Oadby and Wigston.cgi b/CGI/Oadby and Wigston.cgi new file mode 100755 index 0000000..196c047 --- /dev/null +++ b/CGI/Oadby and Wigston.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for The Borough of Oadby and Wigston. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "The Borough of Oadby and Wigston" +authority_short_name = "Oadby and Wigston" +base_url = "http://web.owbc.net/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/PlanningUtils.py b/CGI/PlanningUtils.py new file mode 100644 index 0000000..3430576 --- /dev/null +++ b/CGI/PlanningUtils.py @@ -0,0 +1,101 @@ +__auth__ = None + +import re + +date_format = "%d/%m/%Y" + + +def xmlQuote(text): + # Change &s to &s + # I suspect there is probably some standard python + # function I should be using for this... + return text.replace('&', '&') + +def fixNewlines(text): + # This can be used to sort out windows newlines + return text.replace("\r\n","\n") + +# So what can a postcode look like then? +# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm +#AN NAA M1 1AA +#ANN NAA M60 1NW +#AAN NAA CR2 6XH +#AANN NAA DN55 1PT +#ANA NAA W1A 1HP +#AANA NAA EC1A 1BB + +postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]") + +def getPostcodeFromText(text): + """This function takes a piece of text and returns the first + bit of it that looks like a postcode.""" + + postcode_match = postcode_regex.search(text) + + if postcode_match is not None: + return postcode_match.group() + + +class PlanningAuthorityResults: + """This class represents a set of results of a planning search. + + This should probably be separated out so that it can be used for + authorities other than Cherwell. 
+ """ + + def __init__(self, authority_name, authority_short_name): + self.authority_name = authority_name + self.authority_short_name = authority_short_name + + # this will be a list of PlanningApplication objects + self.planning_applications = [] + + + def addApplication(self, application): + self.planning_applications.append(application) + + def __repr__(self): + return self.displayXML() + + def displayXML(self): + """This should display the contents of this object in the planningalerts format. + i.e. in the same format as this one: + http://www.planningalerts.com/lambeth.xml + """ + + applications_bit = "".join([x.displayXML() for x in self.planning_applications]) + + return "\n" +\ + "%s\n" %self.authority_name +\ + "%s\n" %self.authority_short_name +\ + "\n" + applications_bit +\ + "\n" +\ + "\n" + + + +class PlanningApplication: + def __init__(self, no_postcode_default='No postcode'): + self.council_reference = None + self.address = None + self.postcode = no_postcode_default + self.description = None + self.info_url = None + self.comment_url = None + + # expecting this as a datetime.date object + self.date_received = None + + def __repr__(self): + return self.displayXML() + + def displayXML(self): + return "\n" +\ + "%s\n" %xmlQuote(self.council_reference) +\ + "
%s
\n" %xmlQuote(self.address) +\ + "%s\n" %self.postcode +\ + "%s\n" %xmlQuote(self.description) +\ + "%s\n" %xmlQuote(self.info_url) +\ + "%s\n" %xmlQuote(self.comment_url) +\ + "%s\n" %self.date_received.strftime(date_format) +\ + "
\n" diff --git a/CGI/Portsmouth.cgi b/CGI/Portsmouth.cgi new file mode 100755 index 0000000..1fe91cc --- /dev/null +++ b/CGI/Portsmouth.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Portsmouth City Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Portsmouth City Council" +authority_short_name = "Portsmouth" +base_url = "http://planning.portsmouth.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/PublicAccess.py b/CGI/PublicAccess.py new file mode 100644 index 0000000..db635be --- /dev/null +++ b/CGI/PublicAccess.py @@ -0,0 +1,341 @@ +#!/usr/bin/python + +import urllib, urllib2 +import HTMLParser +import urlparse +import datetime, time + +import cookielib + +cookie_jar = cookielib.CookieJar() + + +from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication + + +search_form_url_end = "tdc/DcApplication/application_searchform.aspx" +search_results_url_end = "tdc/DcApplication/application_searchresults.aspx" +comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx" + +class PublicAccessParser(HTMLParser.HTMLParser): + """This is the class which parses the PublicAccess search results page. 
+ """ + + def __init__(self, + authority_name, + authority_short_name, + base_url, + debug=False): + + HTMLParser.HTMLParser.__init__(self) + + self.authority_name = authority_name + self.authority_short_name = authority_short_name + self.base_url = base_url + + self.debug = debug + + # this will change to True when we enter the table of results + self._in_results_table = False + + # this will be set to True when we have passed the header row + # in the results table + self._past_header_row = False + + # this will be true when we are in a in the results table + self._in_td = False + + # For each row, this will say how many tds we have seen so far + self._td_count = 0 + + # The object which stores our set of planning application results + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + # This will store the planning application we are currently working on. + self._current_application = None + + def handle_starttag(self, tag, attrs): + if tag == "table": + self.handle_start_table(attrs) + # we are only interested in tr tags if we are in the results table + elif self._in_results_table and tag == "tr": + self.handle_start_tr(attrs) + # we are only interested in td tags if we are in the results table + elif self._in_results_table and tag == "td": + self.handle_start_td(attrs) + # we are only interested in tags if we are in the 6th td in + # the results table. 
+ elif self._in_td and self._td_count == 6 and tag == "a": + self.handle_start_a(attrs) + # If the tag is not one of these then we aren't interested + + def handle_endtag(self, tag): + # we only need to consider end tags if we are in the results table + if self._in_results_table: + if tag == "table": + self.handle_end_table() + if tag == "tr": + self.handle_end_tr() + if tag == "td": + self.handle_end_td() + + def handle_start_table(self, attrs): + for attr,value in attrs: + if attr == "class": + if value == "cResultsForm": + self._in_results_table = True + break + + def handle_end_table(self): + # If we see an end table tag, then note that we have left the + # results table. This method is only called if we are in that table. + self._in_results_table = False + + + def handle_start_tr(self, attrs): + # The first tr we meet in the results table is just headers + # We will set a flag at the end of that tr to avoid creating + # a blank PlanningApplication + if self._past_header_row: + # Create a candidate result object + self._current_application = PlanningApplication() + self._td_count = 0 + + def handle_end_tr(self): + # If we are in the results table, and not finishing the header row + # append the current result to the results list. + if self._past_header_row: + self._results.addApplication(self._current_application) + else: + # The first row of the results table is headers + # We want to do nothing until after it + self._past_header_row = True + + def handle_start_td(self, attrs): + # increase the td count by one + self._td_count += 1 + + # note that we are now in a td + self._in_td = True + + def handle_end_td(self): + # note that we are now not in a td + self._in_td = False + + def handle_start_a(self, attrs): + # this method is only getting called if we are in the + # 6th td of a non-header row of the results table. 
+ + # go through the attributes of the looking for one + # named 'href' + for attr,value in attrs: + if attr == "href": + # the value of this tag is a relative url. + # parse it so we can get the query string from it + parsed_info_url = urlparse.urlparse(value) + + # the 4th part of the tuple is the query string + query_string = parsed_info_url[4] + + # join this query string to the search URL, and store this as + # the info URL of the current planning application + self._current_application.info_url = urlparse.urljoin(self.base_url, value) + + # Join this query string to the comments URL, and store this as + # the comments URL of the current planning application + comments_url = urlparse.urljoin(self.base_url, comments_url_end) + self._current_application.comment_url = urlparse.urljoin(comments_url, query_string) + + # while we're here, let's follow some links to find the postcode... + # the postcode is in an input tag in the property page. This page + # can be found by following the info url. + # The newlines in the info page need fixing. + info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read()) + + info_file_parser = PublicAccessInfoPageParser() + info_file_parser.feed(info_file_contents) + + property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url) + + # the newlines in this page need fixing + property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read()) + + property_file_parser = PublicAccessPropertyPageParser() + property_file_parser.feed(property_file_contents) + + # Set the postcode on the current planning application from the + # one found on the property page + if property_file_parser.postcode is not None: + self._current_application.postcode = property_file_parser.postcode + + # There is no need for us to look at any more attributes. 
+ break + + + def handle_data(self, data): + if self._in_td: + # The first td contains the reference + if self._td_count == 1: + self._current_application.council_reference = data + + # The second td contains the date the application was received + elif self._td_count == 2: + year, month, day = time.strptime(data, "%d/%m/%Y")[:3] + received_date = datetime.date(year, month, day) + + self._current_application.date_received = received_date + + # The third td contains the address + elif self._td_count == 3: + #data = data.replace("^M","\n") + self._current_application.address = data + + # The fourth td contains the description + elif self._td_count == 4: + self._current_application.description = data + # 5 is status - we don't need it. + # 6 is a button - this is where we will get our postcode, + # comment_url, and info_url from (when handling the <a> tag). + + + def getResultsByDayMonthYear(self, day, month, year): + # First download the search form (in order to get a session cookie) + search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end)) + search_form_response = urllib2.urlopen(search_form_request) + + cookie_jar.extract_cookies(search_form_response, search_form_request) + + + # We are only doing this first search in order to get a cookie + # The paging on the site doesn't work with cookies turned off. 
+ + search_data1 = urllib.urlencode({"searchType":"ADV", + "caseNo":"", + "PPReference":"", + "AltReference":"", + "srchtype":"", + "srchstatus":"", + "srchdecision":"", + "srchapstatus":"", + "srchappealdecision":"", + "srchwardcode":"", + "srchparishcode":"", + "srchagentdetails":"", + "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year}, + "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} }) + + if self.debug: + print search_data1 + + + search_url = urlparse.urljoin(self.base_url, search_results_url_end) + request1 = urllib2.Request(search_url, search_data1) + cookie_jar.add_cookie_header(request1) + response1 = urllib2.urlopen(request1) + + # This search is the one we will actually use. + # a maximum of 100 results are returned on this site, + # hence setting "pagesize" to 100. I doubt there will ever + # be more than 100 in one day in PublicAccess... + # "currentpage" = 1 gets us to the first page of results + # (there will only be one anyway, as we are asking for 100 results...) + +#http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3 + + search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3"))) + + if self.debug: + print search_data2 + + # This time we want to do a get request, so add the search data into the url + request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" 
+ search_data2) + + request2 = urllib2.Request(request2_url) + + # add the cookie we stored from our first search + cookie_jar.add_cookie_header(request2) + + response2 = urllib2.urlopen(request2) + + contents = fixNewlines(response2.read()) + + if self.debug: + print contents + + self.feed(contents) + + return self._results + + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + + + + +class PublicAccessInfoPageParser(HTMLParser.HTMLParser): + """A parser to get the URL for the property details page out of the + info page (this url is needed in order to get the postcode of the + application). + """ + + def __init__(self): + HTMLParser.HTMLParser.__init__(self) + + self.property_page_url = None + + def handle_starttag(self, tag, attrs): + """The URL of the property details page is contained in the 'href' + attribute of an <a> tag whose 'id' is 'A_btnPropertyDetails'. There is some garbage on + either side of it which we will have to clear up before storing it... + + We go through the <a> tags looking for one with an attribute with + key 'id' and value 'A_btnPropertyDetails'. When we find it we go through + its attributes looking for one with key 'href' - the value of this attribute + contains the URL we want, after a bit of tidying up. + + Once we have got the URL, there is no need for us to look at any more <a> tags. + """ + if tag == "a" and self.property_page_url is None: + if attrs.count(("id","A_btnPropertyDetails")) > 0: + for attr,value in attrs: + if attr == "href": + the_link = value + + # this has some garbage on either side of it... + # let's strip that off + + # the garbage on the left is separated by whitespace. + # the garbage on the right is separated by a "'". 
+ + self.property_page_url = the_link.split()[1].split("'")[0] + + + + +class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): + """A parser to get the postcode out of the property details page.""" + def __init__(self): + HTMLParser.HTMLParser.__init__(self) + + self.postcode = None + + def handle_starttag(self, tag, attrs): + """The postcode is contained in an tag. + This tag has an attribute 'name' with value postcode. + It also has an attribute 'value' with value the postcode of this application. + + We go through the input tags looking for one with an attribute with + key 'name' and value 'postcode'. When we find one, + we look through its attributes for one with key 'value' - we store the value of this + attribute as self.postcode. + + Once we have the postcode, there is no need to look at any more input tags. + """ + + if tag == "input" and self.postcode is None: + if attrs.count(("name","postcode")) > 0: + for attr,value in attrs: + if attr == "value": + self.postcode = value + diff --git a/CGI/README b/CGI/README new file mode 100644 index 0000000..6261806 --- /dev/null +++ b/CGI/README @@ -0,0 +1,5 @@ + +WARNING - this directory is only for generated files +and files which are automatically copied in. +Anything manually added here will be svn deleted. + diff --git a/CGI/Rushmoor.cgi b/CGI/Rushmoor.cgi new file mode 100755 index 0000000..f834ff0 --- /dev/null +++ b/CGI/Rushmoor.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Rushmoor Borough Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Rushmoor Borough Council" +authority_short_name = "Rushmoor" +base_url = "http://pa-dc.rushmoor.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Sevenoaks.cgi b/CGI/Sevenoaks.cgi new file mode 100755 index 0000000..f44f9d1 --- /dev/null +++ b/CGI/Sevenoaks.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Sevenoaks District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Sevenoaks District Council" +authority_short_name = "Sevenoaks" +base_url = "http://publicaccess.sevenoaks.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/South Staffs.cgi b/CGI/South Staffs.cgi new file mode 100755 index 0000000..a5b6aa8 --- /dev/null +++ b/CGI/South Staffs.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for South Staffordshire Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "South Staffordshire Council" +authority_short_name = "South Staffs" +base_url = "https://services.sstaffs.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/SouthOxfordshire.cgi b/CGI/SouthOxfordshire.cgi new file mode 100755 index 0000000..6e0322a --- /dev/null +++ b/CGI/SouthOxfordshire.cgi @@ -0,0 +1,20 @@ +#!/usr/bin/python + +import cgi +import cgitb; cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + +from SouthOxfordshireParser import SouthOxfordshireParser + +parser = SouthOxfordshireParser() + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/SouthOxfordshireParser.py b/CGI/SouthOxfordshireParser.py new file mode 100644 index 0000000..0097ee5 --- /dev/null +++ b/CGI/SouthOxfordshireParser.py @@ -0,0 +1,248 @@ + +import urllib, urllib2 + +import HTMLParser +import urlparse +import datetime, time + +# This needs a page number inserting +search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d" + +# This needs the council reference +comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s" + +authority_name = "South Oxfordshire District Council" +authority_short_name = "South Oxfordshire" + + +from PlanningUtils import fixNewlines, \ + getPostcodeFromText, \ + PlanningAuthorityResults, 
def _read_url(url):
    """Fetch *url* and return the response body.

    The response is closed even when reading fails, so a scrape that
    follows many info links does not leave one open connection per link.
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Scrape the South Oxfordshire application list for a single day.

    The date of interest is not passed to the constructor; it is supplied to
    getResultsByDayMonthYear() / getResults() and stored as a datetime.date.
    Knowing the date lets us avoid downloading the info page of every
    application on the list page that was received on some other day.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Set by getResultsByDayMonthYear(); a datetime.date.
        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3.
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # <td>s we have seen in the current row.  What is in which
        # numbered <td> is detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the
        # reference from the next bit of data.
        self._get_reference = False

        # Text of the 5th <td> (the received date), collected piece by piece.
        self._data = ''

        # This will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(authority_name,
                                                 authority_short_name)

    def handle_starttag(self, tag, attrs):
        """Track the table / row / cell position and pick up the info link."""
        # If we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # We are only interested in other tags if we are in table 3.
        if self._table_count != 3:
            return

        if tag == 'tr':
            # Each <tr> is one application, so start a fresh
            # PlanningApplication.  The per-row state is restarted as well:
            # without this, a single row with fewer than five cells would
            # shift the column numbering of every row after it.
            self._current_application = PlanningApplication()
            self._td_count = 0
            self._get_reference = False
            self._data = ''

        elif tag == 'td':
            self._td_count += 1

        elif (tag == 'a' and self._td_count == 1
              and self._current_application is not None):
            # A link in the first <td> points to the info page for this
            # application.
            for key, value in attrs:
                # href is the only attribute we are interested in.
                if key == 'href':
                    self._current_application.info_url = \
                        urlparse.urljoin(search_url, value)

                    # The next bit of data is the council reference.
                    self._get_reference = True
                    break

    def handle_endtag(self, tag):
        """Finish the date cell (</td>) or the whole application (</tr>)."""
        # There is no need to do anything unless we are in table 3.
        if self._table_count != 3:
            return

        application = self._current_application
        if application is None:
            # No <tr> has been opened in table 3 yet: nothing to finish.
            return

        # The end </tr> indicates that the current application is finished.
        # Now we can fetch the info page to get the address, postcode and
        # description.
        # If we don't have a reference, then we are in the header row,
        # which we don't want.  (Presumably PlanningApplication initialises
        # council_reference and date_received to None -- verify in
        # PlanningUtils.)
        # There is no point in fetching anything if the date is not the
        # requested one.
        if (tag == 'tr'
                and application.council_reference is not None
                and application.date_received == self._requested_date):
            info_page_parser = SouthOxfordshireInfoURLParser()
            info_page_parser.feed(_read_url(application.info_url))

            application.address = info_page_parser.address
            application.postcode = \
                getPostcodeFromText(info_page_parser.address)
            application.description = info_page_parser.description

            # Add the current application to the results set.
            self._results.addApplication(application)

        # At the end of the 5th <td>, self._data should contain the
        # received date of the application.
        if tag == 'td' and self._td_count == 5:
            # handle_entityref() turns every &nbsp; into a space, so the
            # collected text can carry padding around the date; strptime
            # refuses such stray whitespace, hence the strip().
            parsed = time.strptime(self._data.strip(), "%d %B %Y")
            application.date_received = datetime.date(*parsed[:3])
            self._data = ''
            self._td_count = 0

    def handle_data(self, data):
        """Capture the council reference (cell 1) and the date text (cell 5)."""
        # There is no need to do anything if we aren't in table 3.
        if self._table_count != 3:
            return

        # If we are in the first <td> and the get_reference flag is set,
        # then this data is the reference.
        if self._td_count == 1 and self._get_reference:
            self._current_application.council_reference = data

            # The comment url can now be made, as it depends only on the
            # reference.
            # On this site, the link to the comment page is only displayed
            # once the planning authority has decided who is handling this
            # application and has opened consultations.  The link below
            # works straight away, and also works for apps for which the
            # consultation period is over.
            # I have no idea if anything is actually done with these
            # comments if it is followed too early...
            self._current_application.comment_url = \
                comment_url % {'reference': data}

            # Set the get_reference flag back to False.
            self._get_reference = False

        # If we are in the 5th <td>, then we need to collect all the data
        # together before we can use it.  It is processed in handle_endtag.
        if self._td_count == 5:
            self._data += data

    def handle_entityref(self, ref):
        """Keep &nbsp; inside the date cell as an ordinary space."""
        # There is no need to bother with this if we aren't in the date
        # cell of the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '

    def _page_number(self, today):
        """Return the number of the list page to fetch, counting from 1.

        The page is (whole weeks between self._requested_date and *today*)
        + 1.  Floor division is spelled out so that the result stays an
        integer regardless of the interpreter's division rules.
        """
        delta = today - self._requested_date
        return delta.days // 7 + 1

    def getResultsByDayMonthYear(self, day, month, year):
        """Return the PlanningAuthorityResults for the given date.

        day, month and year are integers.  The matching list page is
        downloaded and parsed; only applications received on that date are
        added to the results.
        """
        self._requested_date = datetime.date(year, month, day)
        page_number = self._page_number(datetime.date.today())

        self.feed(_read_url(search_url % page_number))

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as XML.

        day, month and year may be strings (as they come from the CGI
        form); they are converted with int().
        """
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year))
        return results.displayXML()


class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """Get the description and address out of the info page for a
    South Oxfordshire application.

    After feed(), the values are available as the `address` and
    `description` attributes (None if the page did not contain them).
    The value is taken from the <td> that follows a <td> whose text is
    'Location' or 'Description' respectively.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # These two states will be set to:
        # 0 - if we haven't yet got that bit
        # 1 - if we are currently working on it
        # 2 - if we have finished
        self._address_state = 0
        self._description_state = 0

        # We will need to know whether or not we are in a <td>.
        self._in_td = False

        # This is used for collecting together data which comes in
        # several bits.
        self._data = ''

    def _still_looking(self):
        """Return True while the address or the description is missing."""
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        """Start collecting text at every <td> while data is still wanted."""
        # If we see the start of a <td> and we are still interested in some
        # data, then set the td flag to true, and blank the data.
        if tag == 'td' and self._still_looking():
            self._in_td = True
            self._data = ''

    def handle_endtag(self, tag):
        """At </td>, store a value or recognise the label of the next one."""
        if tag == 'td' and self._still_looking():
            # If we are working on the description, set description from
            # _data and note that we need to work on it no more.
            if self._description_state == 1:
                self.description = self._data
                self._description_state = 2

            # If we are working on the address, set address from _data and
            # note that we need to work on it no more.
            elif self._address_state == 1:
                self.address = self._data
                self._address_state = 2

            # If we see data which says 'Description', then the next <td>
            # holds the description.
            elif self._data.strip() == 'Description':
                self._description_state = 1

            # If we see data which says 'Location', then the next <td>
            # holds the address.
            elif self._data.strip() == 'Location':
                self._address_state = 1

            # Note that we are leaving the <td>.
            self._in_td = False

    def handle_data(self, data):
        """Append text to the current cell while data is still wanted."""
        # If we are in a <td>, and we are still interested in the data for
        # something, append the current bit to self._data.
        if self._in_td and self._still_looking():
            self._data += data


# TODO

# Find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.

# Can we check that scraped email address really is
# an email address?
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Southampton City Council" +authority_short_name = "Southampton" +base_url = "http://publicaccess.southampton.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Spelthorne.cgi b/CGI/Spelthorne.cgi new file mode 100755 index 0000000..f507ad7 --- /dev/null +++ b/CGI/Spelthorne.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Spelthorne Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Spelthorne Borough Council" +authority_short_name = "Spelthorne" +base_url = "http://phoenix.spelthorne.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Stevenage.cgi b/CGI/Stevenage.cgi new file mode 100755 index 0000000..25bf3ca --- /dev/null +++ b/CGI/Stevenage.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Stevenage Borough Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Stevenage Borough Council" +authority_short_name = "Stevenage" +base_url = "http://publicaccess.stevenage.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Sunderland.cgi b/CGI/Sunderland.cgi new file mode 100755 index 0000000..bf8bc15 --- /dev/null +++ b/CGI/Sunderland.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Sunderland City Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Sunderland City Council" +authority_short_name = "Sunderland" +base_url = "http://www.sunderland.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Test Valley.cgi b/CGI/Test Valley.cgi new file mode 100755 index 0000000..f782eaf --- /dev/null +++ b/CGI/Test Valley.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Test Valley Borough Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Test Valley Borough Council" +authority_short_name = "Test Valley" +base_url = "http://publicaccess.testvalley.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Tonbridge.cgi b/CGI/Tonbridge.cgi new file mode 100755 index 0000000..a9256f9 --- /dev/null +++ b/CGI/Tonbridge.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Tonbridge and Malling Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Tonbridge and Malling Borough Council" +authority_short_name = "Tonbridge" +base_url = "http://publicaccess.tmbc.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/West Norfolk.cgi b/CGI/West Norfolk.cgi new file mode 100755 index 0000000..47e3dce --- /dev/null +++ b/CGI/West Norfolk.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Kings Lynn and West Norfolk Borough Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Kings Lynn and West Norfolk Borough Council" +authority_short_name = "West Norfolk" +base_url = "http://online.west-norfolk.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/Woking.cgi b/CGI/Woking.cgi new file mode 100755 index 0000000..6c8f51f --- /dev/null +++ b/CGI/Woking.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Woking Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Woking Borough Council" +authority_short_name = "Woking" +base_url = "http://caps.woking.gov.uk/publicaccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/York.cgi b/CGI/York.cgi new file mode 100755 index 0000000..ed59990 --- /dev/null +++ b/CGI/York.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for City of York Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "City of York Council" +authority_short_name = "York" +base_url = "http://planning.york.gov.uk/PublicAccess/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml