diff --git a/CGI/Oswestry.cgi b/CGI/Oswestry.cgi new file mode 100755 index 0000000..dc1629b --- /dev/null +++ b/CGI/Oswestry.cgi @@ -0,0 +1,31 @@ +#!/usr/bin/python + +# This is the parser for Oswestry Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Oswestry Borough Council" +authority_short_name = "Oswestry" +base_url = "http://193.114.205.78/PublicAccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, + authority_short_name, + base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/CGI/PlanningUtils.py b/CGI/PlanningUtils.py deleted file mode 100644 index 3430576..0000000 --- a/CGI/PlanningUtils.py +++ /dev/null @@ -1,101 +0,0 @@ -__auth__ = None - -import re - -date_format = "%d/%m/%Y" - - -def xmlQuote(text): - # Change &s to &s - # I suspect there is probably some standard python - # function I should be using for this... - return text.replace('&', '&') - -def fixNewlines(text): - # This can be used to sort out windows newlines - return text.replace("\r\n","\n") - -# So what can a postcode look like then? -# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm -#AN NAA M1 1AA -#ANN NAA M60 1NW -#AAN NAA CR2 6XH -#AANN NAA DN55 1PT -#ANA NAA W1A 1HP -#AANA NAA EC1A 1BB - -postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]") - -def getPostcodeFromText(text): - """This function takes a piece of text and returns the first - bit of it that looks like a postcode.""" - - postcode_match = postcode_regex.search(text) - - if postcode_match is not None: - return postcode_match.group() - - -class PlanningAuthorityResults: - """This class represents a set of results of a planning search. - - This should probably be separated out so that it can be used for - authorities other than Cherwell. - """ - - def __init__(self, authority_name, authority_short_name): - self.authority_name = authority_name - self.authority_short_name = authority_short_name - - # this will be a list of PlanningApplication objects - self.planning_applications = [] - - - def addApplication(self, application): - self.planning_applications.append(application) - - def __repr__(self): - return self.displayXML() - - def displayXML(self): - """This should display the contents of this object in the planningalerts format. - i.e. in the same format as this one: - http://www.planningalerts.com/lambeth.xml - """ - - applications_bit = "".join([x.displayXML() for x in self.planning_applications]) - - return "\n" +\ - "%s\n" %self.authority_name +\ - "%s\n" %self.authority_short_name +\ - "\n" + applications_bit +\ - "\n" +\ - "\n" - - - -class PlanningApplication: - def __init__(self, no_postcode_default='No postcode'): - self.council_reference = None - self.address = None - self.postcode = no_postcode_default - self.description = None - self.info_url = None - self.comment_url = None - - # expecting this as a datetime.date object - self.date_received = None - - def __repr__(self): - return self.displayXML() - - def displayXML(self): - return "\n" +\ - "%s\n" %xmlQuote(self.council_reference) +\ - "
%s
\n" %xmlQuote(self.address) +\ - "%s\n" %self.postcode +\ - "%s\n" %xmlQuote(self.description) +\ - "%s\n" %xmlQuote(self.info_url) +\ - "%s\n" %xmlQuote(self.comment_url) +\ - "%s\n" %self.date_received.strftime(date_format) +\ - "
\n" diff --git a/CGI/PublicAccess.py b/CGI/PublicAccess.py deleted file mode 100644 index b7873ac..0000000 --- a/CGI/PublicAccess.py +++ /dev/null @@ -1,358 +0,0 @@ -#!/usr/bin/python - -import urllib, urllib2 -import HTMLParser -import urlparse -import datetime, time - -import cookielib - -cookie_jar = cookielib.CookieJar() - - -from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication - - -search_form_url_end = "DcApplication/application_searchform.aspx" -search_results_url_end = "DcApplication/application_searchresults.aspx" -comments_url_end = "DcApplication/application_comments_entryform.aspx" - -class PublicAccessParser(HTMLParser.HTMLParser): - """This is the class which parses the PublicAccess search results page. - """ - - def __init__(self, - authority_name, - authority_short_name, - base_url, - debug=False): - - HTMLParser.HTMLParser.__init__(self) - - self.authority_name = authority_name - self.authority_short_name = authority_short_name - self.base_url = base_url - - self.debug = debug - - # this will change to True when we enter the table of results - self._in_results_table = False - - # this will be set to True when we have passed the header row - # in the results table - self._past_header_row = False - - # this will be true when we are in a in the results table - self._in_td = False - - # For each row, this will say how many tds we have seen so far - self._td_count = 0 - - # The object which stores our set of planning application results - self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) - - # This will store the planning application we are currently working on. - self._current_application = None - - def handle_starttag(self, tag, attrs): - if tag == "table": - self.handle_start_table(attrs) - # we are only interested in tr tags if we are in the results table - elif self._in_results_table and tag == "tr": - self.handle_start_tr(attrs) - # we are only interested in td tags if we are in the results table - elif self._in_results_table and tag == "td": - self.handle_start_td(attrs) - # we are only interested in tags if we are in the 6th td in - # the results table. - elif self._in_td and self._td_count == 6 and tag == "a": - self.handle_start_a(attrs) - # If the tag is not one of these then we aren't interested - - def handle_endtag(self, tag): - # we only need to consider end tags if we are in the results table - if self._in_results_table: - if tag == "table": - self.handle_end_table() - if tag == "tr": - self.handle_end_tr() - if tag == "td": - self.handle_end_td() - - def handle_start_table(self, attrs): - for attr,value in attrs: - if attr == "class": - if value == "cResultsForm": - self._in_results_table = True - break - - def handle_end_table(self): - # If we see an end table tag, then note that we have left the - # results table. This method is only called if we are in that table. - self._in_results_table = False - - - def handle_start_tr(self, attrs): - # The first tr we meet in the results table is just headers - # We will set a flag at the end of that tr to avoid creating - # a blank PlanningApplication - if self._past_header_row: - # Create a candidate result object - self._current_application = PlanningApplication() - self._td_count = 0 - - def handle_end_tr(self): - # If we are in the results table, and not finishing the header row - # append the current result to the results list. - if self._past_header_row: - self._results.addApplication(self._current_application) - else: - # The first row of the results table is headers - # We want to do nothing until after it - self._past_header_row = True - - def handle_start_td(self, attrs): - # increase the td count by one - self._td_count += 1 - - # note that we are now in a td - self._in_td = True - - def handle_end_td(self): - # note that we are now not in a td - self._in_td = False - - def handle_start_a(self, attrs): - # this method is only getting called if we are in the - # 6th td of a non-header row of the results table. - - # go through the attributes of the looking for one - # named 'href' - for attr,value in attrs: - if attr == "href": - # the value of this tag is a relative url. - # parse it so we can get the query string from it - parsed_info_url = urlparse.urlparse(value) - - # the 4th part of the tuple is the query string - query_string = parsed_info_url[4] - - # join this query string to the search URL, and store this as - # the info URL of the current planning application - self._current_application.info_url = urlparse.urljoin(self.base_url, value) - - # Join this query string to the comments URL, and store this as - # the comments URL of the current planning application - comments_url = urlparse.urljoin(self.base_url, comments_url_end) - self._current_application.comment_url = urlparse.urljoin(comments_url, query_string) - - # while we're here, let's follow some links to find the postcode... - # the postcode is in an input tag in the property page. This page - # can be found by following the info url. - # The newlines in the info page need fixing. - info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read()) - - info_file_parser = PublicAccessInfoPageParser() - info_file_parser.feed(info_file_contents) - - property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url) - - # the newlines in this page need fixing - property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read()) - - property_file_parser = PublicAccessPropertyPageParser() - property_file_parser.feed(property_file_contents) - - # Set the postcode on the current planning application from the - # one found on the property page - if property_file_parser.postcode is not None: - self._current_application.postcode = property_file_parser.postcode - - # There is no need for us to look at any more attributes. - break - - - def handle_data(self, data): - if self._in_td: - # The first td contains the reference - if self._td_count == 1: - self._current_application.council_reference = data - - # The second td contains the date the application was received - elif self._td_count == 2: - year, month, day = time.strptime(data, "%d/%m/%Y")[:3] - received_date = datetime.date(year, month, day) - - self._current_application.date_received = received_date - - # The third td contains the address - elif self._td_count == 3: - #data = data.replace("^M","\n") - self._current_application.address = data - - # The fourth td contains the description - elif self._td_count == 4: - self._current_application.description = data - # 5 is status - we don't need it. - # 6 is a button - this is where we will get our postcode, - # comment_url, and info_url from (when handling the tag). - - - def getResultsByDayMonthYear(self, day, month, year): - # First download the search form (in order to get a session cookie - search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end)) - search_form_response = urllib2.urlopen(search_form_request) - - cookie_jar.extract_cookies(search_form_response, search_form_request) - - - # We are only doing this first search in order to get a cookie - # The paging on the site doesn't work with cookies turned off. - - search_data1 = urllib.urlencode({"searchType":"ADV", - "caseNo":"", - "PPReference":"", - "AltReference":"", - "srchtype":"", - "srchstatus":"", - "srchdecision":"", - "srchapstatus":"", - "srchappealdecision":"", - "srchwardcode":"", - "srchparishcode":"", - "srchagentdetails":"", - "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year}, - "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} }) - - if self.debug: - print search_data1 - - - search_url = urlparse.urljoin(self.base_url, search_results_url_end) - request1 = urllib2.Request(search_url, search_data1) - cookie_jar.add_cookie_header(request1) - response1 = urllib2.urlopen(request1) - - # This search is the one we will actually use. - # a maximum of 100 results are returned on this site, - # hence setting "pagesize" to 100. I doubt there will ever - # be more than 100 in one day in PublicAccess... - # "currentpage" = 1 gets us to the first page of results - # (there will only be one anyway, as we are asking for 100 results...) - -#http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=¤tpage=2&pagesize=10&module=P3 - - search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3"))) - - if self.debug: - print search_data2 - - # This time we want to do a get request, so add the search data into the url - request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2) - - request2 = urllib2.Request(request2_url) - - # add the cookie we stored from our first search - cookie_jar.add_cookie_header(request2) - - response2 = urllib2.urlopen(request2) - - contents = fixNewlines(response2.read()) - - if self.debug: - print contents - - self.feed(contents) - - return self._results - - - def getResults(self, day, month, year): - return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() - - - - -class PublicAccessInfoPageParser(HTMLParser.HTMLParser): - """A parser to get the URL for the property details page out of the - info page (this url is needed in order to get the postcode of the - application. - """ - - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - - self.property_page_url = None - - def handle_starttag(self, tag, attrs): - """The URL of the property details page is contained in an tag in - an attribute with key 'A_btnPropertyDetails'. There is some garbage on - either side of it which we will have to clear up before storing it... - - We go through the tags looking for one with an attribute with - key 'id' and value 'A_btnPropertyDetails'. When we find it we go through - its attributes looking for one with key 'href' - the value of this attribute - contains the URL we want, after a bit of tidying up. - - Once we have got the URL, there is no need for us to look at any more tags. - """ - if tag == "a" and self.property_page_url is None: - if attrs.count(("id","A_btnPropertyDetails")) > 0: - for attr,value in attrs: - if attr == "href": - the_link = value - - # this has some garbage on either side of it... - # let's strip that off - - # the garbage on the left is separated by whitespace. - # the garbage on the right is separated by a "'". - - self.property_page_url = the_link.split()[1].split("'")[0] - - - - -class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): - """A parser to get the postcode out of the property details page.""" - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - - self.postcode = None - - def handle_starttag(self, tag, attrs): - """The postcode is contained in an tag. - This tag has an attribute 'name' with value postcode. - It also has an attribute 'value' with value the postcode of this application. - - We go through the input tags looking for one with an attribute with - key 'name' and value 'postcode'. When we find one, - we look through its attributes for one with key 'value' - we store the value of this - attribute as self.postcode. - - Once we have the postcode, there is no need to look at any more input tags. - """ - - if tag == "input" and self.postcode is None: - if attrs.count(("name","postcode")) > 0: - for attr,value in attrs: - if attr == "value": - self.postcode = value - - -# These still don't work: - -# Perthshire -#http://193.63.61.22/publicaccess/tdc/DcApplication/application_searchform.aspx -#"Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/" - -# Hambleton -#http://planning.hambleton.gov.uk/publicaccess/tdc/DcApplication/application_searchform.aspx -#"Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/" - -# These use https: - -# Chiltern -#https://isa.chiltern.gov.uk/publicaccess/tdc/tdc_home.aspx -# Hinckley-Bosworth -#https://cx.hinckley-bosworth.gov.uk/PublicAccess/dc/DcApplication/application_searchform.aspx diff --git a/CGI/README b/CGI/README deleted file mode 100644 index 6261806..0000000 --- a/CGI/README +++ /dev/null @@ -1,5 +0,0 @@ - -WARNING - this directory is only for generated files -and files which are automatically copied in. -Anything manually added here will be svn deleted. - diff --git a/CGI/South Staffs.cgi b/CGI/South Staffordshire.cgi similarity index 94% rename from CGI/South Staffs.cgi rename to CGI/South Staffordshire.cgi index 0fcbd39..94d2183 100755 --- a/CGI/South Staffs.cgi +++ b/CGI/South Staffordshire.cgi @@ -15,7 +15,7 @@ year = form.getfirst('year') authority_name = "South Staffordshire Council" -authority_short_name = "South Staffs" +authority_short_name = "South Staffordshire" base_url = "https://services.sstaffs.gov.uk/PublicAccess/tdc/" import PublicAccess diff --git a/CGI/SouthOxfordshire.cgi b/CGI/SouthOxfordshire.cgi deleted file mode 100755 index 6e0322a..0000000 --- a/CGI/SouthOxfordshire.cgi +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/python - -import cgi -import cgitb; cgitb.enable(display=0, logdir="/tmp") - - -form = cgi.FieldStorage() -day = form.getfirst('day') -month = form.getfirst('month') -year = form.getfirst('year') - -from SouthOxfordshireParser import SouthOxfordshireParser - -parser = SouthOxfordshireParser() - -xml = parser.getResults(day, month, year) - -print "Content-Type: text/xml" # XML is following -print -print xml # print the xml diff --git a/CGI/SouthOxfordshireParser.py b/CGI/SouthOxfordshireParser.py deleted file mode 100644 index 0097ee5..0000000 --- a/CGI/SouthOxfordshireParser.py +++ /dev/null @@ -1,248 +0,0 @@ - -import urllib, urllib2 - -import HTMLParser -import urlparse -import datetime, time - -# This needs a page number inserting -search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d" - -# This needs the council reference -comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s" - -authority_name = "South Oxfordshire District Council" -authority_short_name = "South Oxfordshire" - - -from PlanningUtils import fixNewlines, \ - getPostcodeFromText, \ - PlanningAuthorityResults, \ - PlanningApplication - -class SouthOxfordshireParser(HTMLParser.HTMLParser): - """In this case we'll take the date, so that we can avoid doing dowloads for - the other days in this week's file. This date should be a datetime.date object. - """ - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - - self._requested_date = None - - # We'll keep a count of the number of tables we have seen. - # All the interesting stuff is in table 3 - self._table_count = 0 - - # While inside table 3, we'll keep a count of the number of - # s we have seen. What is in which numbered is detailed below. - # 1 reference - # 3 place and description - # 5 date received - # 2 and 4 are just padding - self._td_count = 0 - - # This is just a flag to say that we are now ready to get the reference - # from the next bit of data - self._get_reference = False - - self._data = '' - - # this will hold the application we are currently working on. - self._current_application = None - - # The object which stores our set of planning application results - self._results = PlanningAuthorityResults(authority_name, authority_short_name) - - def handle_starttag(self, tag, attrs): - # if we see a table tag, increment the table count. - if tag == 'table': - self._table_count += 1 - - # we are only interested in other tags if we are in table 3. - if self._table_count == 3: - - # If we are starting a , create a new PlanningApplication object - # for the application currently being processed - if tag == 'tr': - self._current_application = PlanningApplication() - - # if we see a td, increment the count. - if tag == 'td': - self._td_count += 1 - - # if we are in the first , and we see a link, - # then it is to the info page for this applicaion. - if tag == 'a' and self._td_count == 1: - for key, value in attrs: - if key == 'href': - url_end = value - self._current_application.info_url = urlparse.urljoin(search_url,url_end) - - # We now know that the next bit of data is the reference - self._get_reference = True - - # href is the only attribute we are interested in. - break - - def handle_endtag(self, tag): - # There is no need to do anything unless we are in table 3. - if self._table_count == 3: - - # The end indicates that the current application is finished. - # Now we can fetch the info_page to get the address, postcode, - # and description. - # If we don't have a reference, then we are in the header row, - # which we don't want. - # There is no point in doing this if the date is not the requested one. - - if tag == 'tr' and \ - self._current_application.council_reference is not None and \ - self._current_application.date_received == self._requested_date: - - info_page_parser = SouthOxfordshireInfoURLParser() - info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read()) - - self._current_application.address = info_page_parser.address - self._current_application.postcode = getPostcodeFromText(info_page_parser.address) - self._current_application.description = info_page_parser.description - - # Add the current application to the results set - self._results.addApplication(self._current_application) - - # At the end of the 5th , self._data should contain - # the received date of the application. - if tag == 'td' and self._td_count == 5: - app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3]) - self._current_application.date_received = datetime.date(app_year, app_month, app_day) - self._data = '' - self._td_count = 0 - - def handle_data(self, data): - # There is no need to do anything if we aren't in table 3. - if self._table_count == 3: - # If we are in the first , and the get_reference flag is set, - # then the next data is the reference. - if self._td_count == 1 and self._get_reference: - self._current_application.council_reference = data - - # The comment url can now be made, as it depends only on the reference. - # On this site, the link to the comment page is only displayed once - # the planning authority has decided who is handling this application - # and has opened consultations. The link below works straight away, - # and also works for apps for which the consultation period is over. - # I have no idea if anything is actually done with these comments if - # it is followed too early... - self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference} - - # Set the get_reference flag back to False. - self._get_reference = False - - # If we are in the 5th , then we need to collect all the data together - # before we can use it. This is actually processed in handle_endtag. - if self._td_count == 5: - self._data += data - - def handle_entityref( self, ref ): - # We might have some entity_refs to clear up. - # there is no need to bother with this if we aren't in the results table. - if self._table_count == 3 and self._td_count == 5: - if ref == 'nbsp': - self._data += ' ' - - - def getResultsByDayMonthYear(self, day, month, year): - """This will return an ApplicationResults object containg the - applications for the date passed in.""" - - today = datetime.date.today() - self._requested_date = datetime.date(year, month, day) - delta = today - self._requested_date - - # to get the correct page, we need - # page ((days mod 7) + 1) - page_number = delta.days/7 + 1 - - response = urllib2.urlopen(search_url %page_number) - - contents = response.read() - - self.feed(contents) - - return self._results - - - def getResults(self, day, month, year): - return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() - -class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser): - """This parser is to get the description and address out of the info page - for a South Oxfordshire application.""" - - def __init__(self): - HTMLParser.HTMLParser.__init__(self) - - self.address = None - self.description = None - - # These two states will be set to: - # 0 - if we haven't yet got that bit - # 1 - if we are currently working on it - # 2 - if we have finished - self._address_state = 0 - self._description_state = 0 - - # We well need to know whether or not we are in a - self._in_td = False - - # This is used for collecting together date which comes in several bits. - self._data = '' - - def handle_starttag(self, tag, attrs): - # If we see the start of a and we are still interested in some data - # then set the td flag to true, and blank the data - if tag == 'td' and (self._address_state < 2 or self._description_state < 2): - self._in_td = True - self._data = '' - - def handle_endtag(self, tag): - if tag == 'td' and (self._address_state < 2 or self._description_state < 2): - # If we are working on the description, - # set description from _data and note that we need to work on it no more. - if self._description_state == 1: - self.description = self._data - self._description_state = 2 - - - # If we are working on the address, - # set address from _data and note that we need to work on it no more. - elif self._address_state == 1: - self.address = self._data - self._address_state = 2 - - # If we see data which says 'Descripton', - # then set the description state to working. - elif self._data.strip() == 'Description': - self._description_state = 1 - - # If we see data which says 'Location', - # then set the addresss state to working. - elif self._data.strip() == 'Location': - self._address_state = 1 - - # Note that we are leaving the - self._in_td = False - - def handle_data(self, data): - # if we are in a td, and we are still interested in the data for something, - # append the current bit to self._data - if self._in_td and (self._address_state < 2 or self._description_state < 2): - self._data += data - - -# TODO - -# find out what time of day this is run - does it matter that -# we aren't being careful with daylight saving time etc. - -# Can we check that scraped email address really is -# an email address?