and a script to generate lots more scrapers...import/raw
@@ -0,0 +1,29 @@ | |||
# This is the parser for %(authority_name)s. | |||
# it is generated from the file CGITemplate | |||
import cgi | |||
import cgitb | |||
#cgitb.enable(display=0, logdir="/tmp") | |||
# Read the requested date from the CGI query string.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# These three placeholders are substituted when a scraper for one
# particular authority is generated from this template.
authority_name = "%(authority_name)s"
authority_short_name = "%(authority_short_name)s"
base_url = "%(base_url)s"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name,
                                         authority_short_name,
                                         base_url)

xml = parser.getResults(day, month, year)

# A CGI response needs an empty line between its headers and its body,
# otherwise the first line of the XML is read as another header.
print("Content-Type: text/xml")
print("")
print(xml)
@@ -0,0 +1,101 @@ | |||
__auth__ = None | |||
import re | |||
date_format = "%d/%m/%Y" | |||
def xmlQuote(text):
    """Return text escaped for use as character data inside an XML element.

    '&' becomes '&amp;', '<' becomes '&lt;' and '>' becomes '&gt;'.
    text must be a string; it is returned unchanged if it contains none
    of these characters.
    """
    # Imported here so that this function does not depend on any other
    # edit to the imports at the top of the file.
    from xml.sax.saxutils import escape

    return escape(text)
def fixNewlines(text):
    """Return text with each Windows line ending (CR LF) turned into a
    single LF.  Anything else, including a lone CR, is left alone."""
    windows_newline = "\r\n"
    unix_newline = "\n"
    pieces = text.split(windows_newline)
    return unix_newline.join(pieces)
# So what can a postcode look like then? | |||
# This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm | |||
#AN NAA M1 1AA | |||
#ANN NAA M60 1NW | |||
#AAN NAA CR2 6XH | |||
#AANN NAA DN55 1PT | |||
#ANA NAA W1A 1HP | |||
#AANA NAA EC1A 1BB | |||
# Outward code (one or two letters, a digit, then optionally one more
# digit or letter), an optional space, then the inward code (a digit
# followed by two letters).  Only upper case letters are matched.
postcode_regex = re.compile(r"[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")


def getPostcodeFromText(text):
    """Return the first part of text that looks like a postcode.

    Returns None if text is None or empty, or if nothing in it matches
    postcode_regex.
    """
    # Callers may hand over an address which was never found on the page.
    if not text:
        return None

    postcode_match = postcode_regex.search(text)
    if postcode_match is None:
        return None

    return postcode_match.group()
class PlanningAuthorityResults:
    """The set of planning applications found by one search of one
    planning authority.

    Applications are added one at a time with addApplication(), and the
    whole set is turned into planningalerts XML by displayXML().
    """

    def __init__(self, authority_name, authority_short_name):
        """Create an empty result set for the authority with these names."""
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name

        # this will be a list of PlanningApplication objects
        self.planning_applications = []

    def addApplication(self, application):
        """Append application to the result set.  displayXML() calls the
        displayXML() method of everything added here."""
        self.planning_applications.append(application)

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Return the contents of this object as a string in the
        planningalerts format, i.e. in the same format as this one:
        http://www.planningalerts.com/lambeth.xml

        The authority names are XML-escaped, so a name containing '&'
        does not produce malformed XML.
        """
        # Imported here so that this class does not depend on any other
        # edit to the imports at the top of the file.
        from xml.sax.saxutils import escape

        applications_bit = "".join([x.displayXML() for x in self.planning_applications])

        return "<planning>\n" + \
               "<authority_name>%s</authority_name>\n" % escape(self.authority_name) + \
               "<authority_short_name>%s</authority_short_name>\n" % escape(self.authority_short_name) + \
               "<applications>\n" + applications_bit + \
               "</applications>\n" + \
               "</planning>\n"
class PlanningApplication:
    """One planning application, as scraped from an authority's site.

    Every field starts as None apart from postcode, which starts as the
    no_postcode_default given to the constructor.
    """

    def __init__(self, no_postcode_default='No postcode'):
        # expecting this as a datetime.date object
        self.date_received = None

        self.comment_url = None
        self.info_url = None
        self.description = None
        self.postcode = no_postcode_default
        self.address = None
        self.council_reference = None

    def __repr__(self):
        return self.displayXML()

    def displayXML(self):
        """Return this application as an <application> XML element.

        Every field apart from postcode and date_received is passed
        through xmlQuote; date_received is formatted with date_format.
        All fields other than postcode must have been set first.
        """
        elements = [
            ("council_reference", xmlQuote(self.council_reference)),
            ("address", xmlQuote(self.address)),
            ("postcode", self.postcode),
            ("description", xmlQuote(self.description)),
            ("info_url", xmlQuote(self.info_url)),
            ("comment_url", xmlQuote(self.comment_url)),
            ("date_received", self.date_received.strftime(date_format)),
        ]

        body = "".join(["<%s>%s</%s>\n" % (tag, value, tag)
                        for tag, value in elements])

        return "<application>\n" + body + "</application>\n"
@@ -0,0 +1,341 @@ | |||
#!/usr/bin/python | |||
import urllib, urllib2 | |||
import HTMLParser | |||
import urlparse | |||
import datetime, time | |||
import cookielib | |||
cookie_jar = cookielib.CookieJar() | |||
from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication | |||
search_form_url_end = "tdc/DcApplication/application_searchform.aspx" | |||
search_results_url_end = "tdc/DcApplication/application_searchresults.aspx" | |||
comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx" | |||
class PublicAccessParser(HTMLParser.HTMLParser):
    """This is the class which parses the PublicAccess search results page.

    Fed the HTML of a results page, it builds one PlanningApplication for
    each row after the first of the table whose class is 'cResultsForm'.
    getResults() does the whole job for one day: it runs the search,
    parses the page and returns the results as XML.
    """

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """authority_name and authority_short_name are handed on to the
        PlanningAuthorityResults object; base_url is the URL which the
        search form, search results and comments page paths are joined
        to; if debug is true the request data and the downloaded results
        page are printed."""
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # this will change to True when we enter the table of results
        self._in_results_table = False

        # this will be set to True when we have passed the header row
        # in the results table
        self._past_header_row = False

        # this will be true when we are in a <td> in the results table
        self._in_td = False

        # For each row, this will say how many tds we have seen so far
        self._td_count = 0

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            self.handle_start_table(attrs)
        # we are only interested in tr tags if we are in the results table
        elif self._in_results_table and tag == "tr":
            self.handle_start_tr(attrs)
        # we are only interested in td tags if we are in the results table
        elif self._in_results_table and tag == "td":
            self.handle_start_td(attrs)
        # we are only interested in <a> tags if we are in the 6th td in
        # the results table.
        elif self._in_td and self._td_count == 6 and tag == "a":
            self.handle_start_a(attrs)
        # If the tag is not one of these then we aren't interested

    def handle_endtag(self, tag):
        # we only need to consider end tags if we are in the results table
        if self._in_results_table:
            if tag == "table":
                self.handle_end_table()
            if tag == "tr":
                self.handle_end_tr()
            if tag == "td":
                self.handle_end_td()

    def handle_start_table(self, attrs):
        # The results table is the one with class "cResultsForm".
        for attr, value in attrs:
            if attr == "class":
                if value == "cResultsForm":
                    self._in_results_table = True
                break

    def handle_end_table(self):
        # If we see an end table tag, then note that we have left the
        # results table. This method is only called if we are in that table.
        self._in_results_table = False

    def handle_start_tr(self, attrs):
        # The first tr we meet in the results table is just headers
        # We will set a flag at the end of that tr to avoid creating
        # a blank PlanningApplication
        if self._past_header_row:
            # Create a candidate result object
            self._current_application = PlanningApplication()
            self._td_count = 0

    def handle_end_tr(self):
        # If we are in the results table, and not finishing the header row
        # append the current result to the results list.
        if self._past_header_row:
            self._results.addApplication(self._current_application)
        else:
            # The first row of the results table is headers
            # We want to do nothing until after it
            self._past_header_row = True

    def handle_start_td(self, attrs):
        # increase the td count by one
        self._td_count += 1

        # note that we are now in a td
        self._in_td = True

    def handle_end_td(self):
        # note that we are now not in a td
        self._in_td = False

    def handle_start_a(self, attrs):
        # This method is only called for an <a> in the 6th td of a row
        # of the results table.

        # go through the attributes of the <a> looking for one
        # named 'href'
        for attr, value in attrs:
            if attr != "href":
                continue

            # the value of this attribute is a relative url; joined to
            # the base URL it is the info URL of the current application
            self._current_application.info_url = urlparse.urljoin(self.base_url, value)

            # element 4 of the tuple urlparse returns is the query string
            query_string = urlparse.urlparse(value)[4]

            # The comments page takes the same query string as the info
            # page. It has to be appended after a "?": urljoin would
            # treat a bare query string as a relative path and replace
            # the name of the comments page with it.
            comments_url = urlparse.urljoin(self.base_url, comments_url_end)
            if query_string:
                comments_url = comments_url + "?" + query_string
            self._current_application.comment_url = comments_url

            # while we're here, let's follow some links to find the postcode...
            postcode = self._fetch_postcode(self._current_application.info_url)

            # Leave the default in place if no postcode was found
            if postcode is not None:
                self._current_application.postcode = postcode

            # There is no need for us to look at any more attributes.
            break

    def _fetch_postcode(self, info_url):
        """Download the info page at info_url, follow its link to the
        property page, and return the postcode found there (None if the
        property page has none)."""
        # The newlines in the info page need fixing.
        info_file_contents = fixNewlines(urllib2.urlopen(info_url).read())

        info_file_parser = PublicAccessInfoPageParser()
        info_file_parser.feed(info_file_contents)

        property_page_url = urlparse.urljoin(info_url, info_file_parser.property_page_url)

        # the newlines in this page need fixing
        property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())

        # the postcode is in an input tag in the property page
        property_file_parser = PublicAccessPropertyPageParser()
        property_file_parser.feed(property_file_contents)

        return property_file_parser.postcode

    def handle_data(self, data):
        if self._in_td:
            # The first td contains the reference
            if self._td_count == 1:
                self._current_application.council_reference = data

            # The second td contains the date the application was received
            elif self._td_count == 2:
                year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
                received_date = datetime.date(year, month, day)

                self._current_application.date_received = received_date

            # The third td contains the address
            elif self._td_count == 3:
                self._current_application.address = data

            # The fourth td contains the description
            elif self._td_count == 4:
                self._current_application.description = data
            # 5 is status - we don't need it.
            # 6 is a button - this is where we will get our postcode,
            # comment_url, and info_url from (when handling the <a> tag).

    def getResultsByDayMonthYear(self, day, month, year):
        """Search the site for applications received on the given day
        (day, month and year are ints), parse the page of results and
        return the PlanningAuthorityResults object holding them."""
        # The same date is used as both ends of the range searched.
        search_date = "%(day)02d/%(month)02d/%(year)d" % {"day": day, "month": month, "year": year}

        # First download the search form (in order to get a session cookie)
        search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
        search_form_response = urllib2.urlopen(search_form_request)

        cookie_jar.extract_cookies(search_form_response, search_form_request)

        # We are only doing this first search in order to get a cookie
        # The paging on the site doesn't work with cookies turned off.
        search_data1 = urllib.urlencode({"searchType": "ADV",
                                         "caseNo": "",
                                         "PPReference": "",
                                         "AltReference": "",
                                         "srchtype": "",
                                         "srchstatus": "",
                                         "srchdecision": "",
                                         "srchapstatus": "",
                                         "srchappealdecision": "",
                                         "srchwardcode": "",
                                         "srchparishcode": "",
                                         "srchagentdetails": "",
                                         "srchDateReceivedStart": search_date,
                                         "srchDateReceivedEnd": search_date})

        if self.debug:
            print(search_data1)

        search_url = urlparse.urljoin(self.base_url, search_results_url_end)
        request1 = urllib2.Request(search_url, search_data1)
        cookie_jar.add_cookie_header(request1)
        # The response to this first search is not read.
        urllib2.urlopen(request1)

        # This search is the one we will actually use.
        # a maximum of 100 results are returned on this site,
        # hence setting "pagesize" to 100. I doubt there will ever
        # be more than 100 in one day in PublicAccess...
        # "currentpage" = 1 gets us to the first page of results
        # (there will only be one anyway, as we are asking for 100 results...)
        search_data2 = urllib.urlencode((("szSearchDescription", "Applications received between %s and %s" % (search_date, search_date)),
                                         ("searchType", "ADV"),
                                         ("bccaseno", ""),
                                         ("currentpage", "1"),
                                         ("pagesize", "100"),
                                         ("module", "P3")))

        if self.debug:
            print(search_data2)

        # This time we want to do a get request, so add the search data into the url
        request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)

        request2 = urllib2.Request(request2_url)

        # add the cookie we stored from our first search
        cookie_jar.add_cookie_header(request2)

        response2 = urllib2.urlopen(request2)

        contents = fixNewlines(response2.read())

        if self.debug:
            print(contents)

        self.feed(contents)

        return self._results

    def getResults(self, day, month, year):
        """Return, as a string of XML, the applications received on the
        given day.  day, month and year may be strings or ints; they are
        converted with int()."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
    """A parser which pulls the URL of the property details page out of
    an application's info page.  After feed(), the URL is in
    self.property_page_url (None if it was not found).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.property_page_url = None

    def handle_starttag(self, tag, attrs):
        """Look for the <a> tag whose 'id' attribute is
        'A_btnPropertyDetails' and take the URL from its 'href'.

        The href has garbage on both sides of the URL: what is on the
        left is separated from it by whitespace, and what is on the right
        by a "'".  Once a URL has been stored, later <a> tags are ignored.
        """
        if tag != "a" or self.property_page_url is not None:
            return

        if ("id", "A_btnPropertyDetails") not in attrs:
            return

        for name, the_link in attrs:
            if name != "href":
                continue

            # drop everything up to the first run of whitespace...
            after_whitespace = the_link.split()[1]
            # ...and everything from the first "'" onwards
            self.property_page_url = after_whitespace.split("'")[0]
class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
    """A parser which pulls the postcode out of the property details
    page.  After feed(), it is in self.postcode (None if not found)."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.postcode = None

    def handle_starttag(self, tag, attrs):
        """Look for the <input> tag whose 'name' attribute is 'postcode'
        and store the value of its 'value' attribute as self.postcode.

        Once a postcode has been stored, later <input> tags are ignored.
        """
        if tag != "input" or self.postcode is not None:
            return

        if ("name", "postcode") not in attrs:
            return

        for key, value in attrs:
            if key == "value":
                self.postcode = value
@@ -0,0 +1,5 @@ | |||
"authority_name", "authority_short_name", "base_url" | |||
"City of York Council", "York", "http://planning.york.gov.uk/PublicAccess/" | |||
"Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/" | |||
"Angus Council", "Angus", "http://planning.angus.gov.uk/PublicAccess/" | |||
"Huntingdonshire District Council", "Huntingdonshire", "http://planning.huntsdc.gov.uk/publicaccess/" |
@@ -0,0 +1,20 @@ | |||
#!/usr/bin/python | |||
import cgi | |||
import cgitb; cgitb.enable(display=0, logdir="/tmp") | |||
# Read the requested date from the CGI query string.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

from SouthOxfordshireParser import SouthOxfordshireParser

parser = SouthOxfordshireParser()

xml = parser.getResults(day, month, year)

# A CGI response needs an empty line between its headers and its body,
# otherwise the first line of the XML is read as another header.
print("Content-Type: text/xml")
print("")
print(xml)
@@ -0,0 +1,263 @@ | |||
#!/usr/bin/python | |||
import cgi | |||
import cgitb; cgitb.enable(display=0, logdir="/tmp") | |||
import urllib, urllib2 | |||
import HTMLParser | |||
import urlparse | |||
import datetime, time | |||
# This needs a page number inserting | |||
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d" | |||
# This needs the council reference | |||
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s" | |||
authority_name = "South Oxfordshire District Council" | |||
authority_short_name = "South Oxfordshire" | |||
from PlanningUtils import fixNewlines, \ | |||
getPostcodeFromText, \ | |||
PlanningAuthorityResults, \ | |||
PlanningApplication | |||
class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """Parser for the South Oxfordshire list of planning applications.

    The page lists applications for more than one day.  The requested
    date is kept so that the info page is only downloaded for the
    applications received on that day; it is a datetime.date object, set
    by getResultsByDayMonthYear().
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # Set by getResultsByDayMonthYear before any parsing is done.
        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # <td>s we have seen. What is in which numbered <td> is detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the reference
        # from the next bit of data
        self._get_reference = False

        # Collects the text of the 5th <td>, which may come in several bits.
        self._data = ''

        # this will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        # if we see a table tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # we are only interested in other tags if we are in table 3.
        if self._table_count == 3:

            # If we are starting a <tr>, create a new PlanningApplication object
            # for the application currently being processed
            if tag == 'tr':
                self._current_application = PlanningApplication()

            # if we see a td, increment the <td> count.
            if tag == 'td':
                self._td_count += 1

            # if we are in the first <td>, and we see a link,
            # then it is to the info page for this application.
            if tag == 'a' and self._td_count == 1:
                for key, value in attrs:
                    if key == 'href':
                        url_end = value
                        self._current_application.info_url = urlparse.urljoin(search_url, url_end)

                        # We now know that the next bit of data is the reference
                        self._get_reference = True

                        # href is the only attribute we are interested in.
                        break

    def handle_endtag(self, tag):
        # There is no need to do anything unless we are in table 3.
        if self._table_count == 3:

            # The end <tr> indicates that the current application is finished.
            # Now we can fetch the info_page to get the address, postcode,
            # and description.
            # If we don't have a reference, then we are in the header row,
            # which we don't want.
            # There is no point in doing this if the date is not the requested one.
            if tag == 'tr' and \
               self._current_application.council_reference is not None and \
               self._current_application.date_received == self._requested_date:

                info_page_parser = SouthOxfordshireInfoURLParser()
                info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

                self._current_application.address = info_page_parser.address
                self._current_application.description = info_page_parser.description

                # The address is None if the info page had no 'Location'
                # cell, and an address need not contain a postcode. In
                # either case keep the application's default postcode
                # rather than replacing it with None.
                if info_page_parser.address is not None:
                    postcode = getPostcodeFromText(info_page_parser.address)
                    if postcode is not None:
                        self._current_application.postcode = postcode

                # Add the current application to the results set
                self._results.addApplication(self._current_application)

            # At the end of the 5th <td>, self._data should contain
            # the received date of the application.
            if tag == 'td' and self._td_count == 5:
                app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
                self._data = ''
                self._td_count = 0

    def handle_data(self, data):
        # There is no need to do anything if we aren't in table 3.
        if self._table_count == 3:
            # If we are in the first <td>, and the get_reference flag is set,
            # then the next data is the reference.
            if self._td_count == 1 and self._get_reference:
                self._current_application.council_reference = data

                # The comment url can now be made, as it depends only on the reference.
                # On this site, the link to the comment page is only displayed once
                # the planning authority has decided who is handling this application
                # and has opened consultations. The link below works straight away,
                # and also works for apps for which the consultation period is over.
                # I have no idea if anything is actually done with these comments if
                # it is followed too early...
                self._current_application.comment_url = comment_url % {'reference': self._current_application.council_reference}

                # Set the get_reference flag back to False.
                self._get_reference = False

            # If we are in the 5th <td>, then we need to collect all the data together
            # before we can use it. This is actually processed in handle_endtag.
            if self._td_count == 5:
                self._data += data

    def handle_entityref(self, ref):
        # We might have some entity_refs to clear up.
        # there is no need to bother with this if we aren't in the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '

    def getResultsByDayMonthYear(self, day, month, year):
        """Download and parse the page of the application list which
        covers the date passed in (day, month and year are ints), and
        return the PlanningAuthorityResults object holding the
        applications received on that date."""
        today = datetime.date.today()
        # handle_endtag compares each application's date with this one.
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # The page wanted is the number of whole weeks between the
        # requested date and today, plus one
        # (presumably each page of the list holds a week of applications).
        page_number = delta.days // 7 + 1

        response = urllib2.urlopen(search_url % page_number)

        self.feed(response.read())

        return self._results

    def getResults(self, day, month, year):
        """Return, as a string of XML, the applications received on the
        given day.  day, month and year may be strings or ints; they are
        converted with int()."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """Pulls the address and the description out of the info page of a
    South Oxfordshire application.

    Each is taken from the <td> which follows a <td> whose text is
    'Location' (for the address) or 'Description'.  After feed() they are
    in self.address and self.description (None if not found)."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # Each of these two states is:
        # 0 - while we haven't yet reached that bit
        # 1 - while the next <td> to end holds it
        # 2 - once we have got it
        self._address_state = 0
        self._description_state = 0

        # Whether or not we are inside a <td>
        self._in_td = False

        # The text of the current <td>, which can arrive in several bits.
        self._data = ''

    def _still_looking(self):
        # True until both the address and the description have been found.
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        # The start of a <td>, while there is still something to find,
        # begins a fresh collection of text.
        if tag == 'td' and self._still_looking():
            self._data = ''
            self._in_td = True

    def handle_endtag(self, tag):
        if tag != 'td' or not self._still_looking():
            return

        cell_text = self._data.strip()

        if self._description_state == 1:
            # This <td> is the one after the 'Description' label.
            self.description = self._data
            self._description_state = 2
        elif self._address_state == 1:
            # This <td> is the one after the 'Location' label.
            self.address = self._data
            self._address_state = 2
        elif cell_text == 'Description':
            # The next <td> will hold the description.
            self._description_state = 1
        elif cell_text == 'Location':
            # The next <td> will hold the address.
            self._address_state = 1

        # We have now left the <td>
        self._in_td = False

    def handle_data(self, data):
        # Text is only gathered inside a <td>, and only while there is
        # still something to find.
        if not self._in_td:
            return
        if self._still_looking():
            self._data += data
# TODO | |||
# find out what time of day this is run - does it matter that | |||
# we aren't being careful with daylight saving time etc. | |||
# Can we check that scraped email address really is | |||
# an email address? | |||
# When run as a script this module is a CGI program: it takes the
# date from the query string and prints that day's applications as XML.
if __name__ == "__main__":
    form = cgi.FieldStorage()
    day = form.getfirst('day')
    month = form.getfirst('month')
    year = form.getfirst('year')

    parser = SouthOxfordshireParser()

    # The parser has to be run to produce the XML which is printed below.
    xml = parser.getResults(day, month, year)

    # A CGI response needs an empty line between its headers and its body,
    # otherwise the first line of the XML is read as another header.
    print("Content-Type: text/xml")
    print("")
    print(xml)
@@ -0,0 +1,248 @@ | |||
import urllib, urllib2 | |||
import HTMLParser | |||
import urlparse | |||
import datetime, time | |||
# This needs a page number inserting | |||
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d" | |||
# This needs the council reference | |||
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s" | |||
authority_name = "South Oxfordshire District Council" | |||
authority_short_name = "South Oxfordshire" | |||
from PlanningUtils import fixNewlines, \ | |||
getPostcodeFromText, \ | |||
PlanningAuthorityResults, \ | |||
PlanningApplication | |||
class SouthOxfordshireParser(HTMLParser.HTMLParser): | |||
"""In this case we'll take the date, so that we can avoid doing dowloads for | |||
the other days in this week's file. This date should be a datetime.date object. | |||
""" | |||
def __init__(self):
    """Set up the parsing state and an empty set of results."""
    HTMLParser.HTMLParser.__init__(self)

    # The date whose applications are wanted; None until it is set.
    self._requested_date = None

    # The results found so far, and the application being worked on.
    self._results = PlanningAuthorityResults(authority_name, authority_short_name)
    self._current_application = None

    # How many <table> tags have been seen.
    # Everything of interest is in table number 3.
    self._table_count = 0

    # How many <td>s have been seen inside table 3:
    # 1 is the reference
    # 3 is the place and description
    # 5 is the date received
    # 2 and 4 are just padding
    self._td_count = 0

    # True when the next bit of data to arrive is the reference.
    self._get_reference = False

    # The text gathered so far from the current <td>.
    self._data = ''
def handle_starttag(self, tag, attrs):
    """Count tables, and inside table 3 start a new application at each
    <tr>, count each <td>, and take the info URL from a link in the
    first <td>."""
    if tag == 'table':
        self._table_count += 1

    # Nothing else matters outside table 3.
    if self._table_count != 3:
        return

    if tag == 'tr':
        # Each row gets a fresh PlanningApplication object.
        self._current_application = PlanningApplication()
    elif tag == 'td':
        self._td_count += 1
    elif tag == 'a' and self._td_count == 1:
        # A link in the first <td> points at the info page for this
        # application. Only its first href is of interest.
        hrefs = [value for key, value in attrs if key == 'href']
        if hrefs:
            url_end = hrefs[0]
            self._current_application.info_url = urlparse.urljoin(search_url, url_end)

            # The next bit of data to arrive is the reference.
            self._get_reference = True
def handle_endtag(self, tag):
    """Inside table 3, finish an application at the end of each <tr> and
    read the received date at the end of each 5th <td>."""
    # There is no need to do anything unless we are in table 3.
    if self._table_count != 3:
        return

    # The end <tr> indicates that the current application is finished.
    # Now we can fetch the info_page to get the address, postcode,
    # and description.
    # If we don't have a reference, then we are in the header row,
    # which we don't want.
    # There is no point in doing this if the date is not the requested one.
    if tag == 'tr' and \
       self._current_application.council_reference is not None and \
       self._current_application.date_received == self._requested_date:

        info_page_parser = SouthOxfordshireInfoURLParser()
        info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

        self._current_application.address = info_page_parser.address
        self._current_application.description = info_page_parser.description

        # The address is None if the info page had no 'Location' cell,
        # and an address need not contain a postcode. In either case
        # keep the application's default postcode rather than replacing
        # it with None.
        if info_page_parser.address is not None:
            postcode = getPostcodeFromText(info_page_parser.address)
            if postcode is not None:
                self._current_application.postcode = postcode

        # Add the current application to the results set
        self._results.addApplication(self._current_application)

    # At the end of the 5th <td>, self._data should contain
    # the received date of the application.
    if tag == 'td' and self._td_count == 5:
        app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
        self._current_application.date_received = datetime.date(app_year, app_month, app_day)
        self._data = ''
        self._td_count = 0
def handle_data(self, data):
    """Handle a run of text from the search results page.

    Only text inside table 3 is considered.  In the first cell, when the
    ``_get_reference`` flag is set, the text is taken as the council
    reference and the comment URL is built from it.  In the 5th cell the
    text is appended to ``self._data``; it is processed later, in
    handle_endtag.
    """
    # Text outside table 3 is of no interest.
    if self._table_count != 3:
        return

    cell_number = self._td_count

    # First <td> with the get_reference flag set: this text is the reference.
    if cell_number == 1 and self._get_reference:
        application = self._current_application
        application.council_reference = data

        # The comment url can now be made, as it depends only on the
        # reference.  On this site, the link to the comment page is only
        # displayed once the planning authority has decided who is
        # handling the application and has opened consultations.  The
        # link built here works straight away, and also works for
        # applications whose consultation period is over.  It is unknown
        # whether comments submitted too early are actually acted upon.
        application.comment_url = comment_url % {'reference': application.council_reference}

        # The reference has been consumed; clear the flag.
        self._get_reference = False

    # Fifth <td>: the text may arrive in several pieces, so accumulate it.
    if cell_number == 5:
        self._data += data
def handle_entityref(self, ref):
    """Handle an entity reference from the search results page.

    Only ``&nbsp;`` inside the 5th cell of table 3 matters: it is added
    to ``self._data`` as an ordinary space.  Everything else is ignored.
    """
    # Entities are only relevant while collecting the 5th cell of table 3.
    collecting_fifth_cell = self._table_count == 3 and self._td_count == 5
    if collecting_fifth_cell and ref == 'nbsp':
        self._data += ' '
def getResultsByDayMonthYear(self, day, month, year):
    """Fetch and parse the search page covering the given date.

    day, month and year are integers.  The requested date is stored in
    ``self._requested_date``, the matching page of ``search_url`` is
    downloaded and fed to this parser, and ``self._results`` is returned.

    Raises ValueError if day/month/year do not form a valid date.
    """
    today = datetime.date.today()
    self._requested_date = datetime.date(year, month, day)
    days_ago = (today - self._requested_date).days

    # The page number is the count of whole weeks since the requested date,
    # plus one (presumably each results page covers one week -- confirm
    # against the site).  Floor division keeps the page number an integer
    # even where "/" means true division.
    page_number = days_ago // 7 + 1

    # Close the HTTP response explicitly once the page has been read.
    response = urllib2.urlopen(search_url % page_number)
    try:
        contents = response.read()
    finally:
        response.close()

    self.feed(contents)

    return self._results
def getResults(self, day, month, year):
    """Return the results for the given date as produced by displayXML().

    day, month and year may be anything int() accepts (for example the
    strings taken from a CGI form); they are converted to integers and
    passed to getResultsByDayMonthYear.
    """
    results = self.getResultsByDayMonthYear(int(day), int(month), int(year))
    return results.displayXML()
class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """Parser that pulls the address and description out of the info page
    for a South Oxfordshire application.

    After feed(), ``address`` holds the text of the <td> following the one
    whose text is 'Location', and ``description`` the text of the <td>
    following the one whose text is 'Description'.  Either stays None if
    its label cell is never seen.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # Progress of each field:
        #   0 - label not seen yet
        #   1 - label seen, the next <td> holds the value
        #   2 - value captured
        self._address_state = 0
        self._description_state = 0

        # Whether we are currently inside a <td>.
        self._in_td = False

        # Buffer for cell text, which can arrive in several pieces.
        self._data = ''

    def _still_collecting(self):
        # True while at least one of the two fields has not been captured.
        return self._address_state < 2 or self._description_state < 2

    def handle_starttag(self, tag, attrs):
        # Only the start of a <td> matters, and only while data is wanted.
        if tag != 'td' or not self._still_collecting():
            return

        # Entering a cell: note it and start with an empty buffer.
        self._in_td = True
        self._data = ''

    def handle_endtag(self, tag):
        # Only the end of a <td> matters, and only while data is wanted.
        if tag != 'td' or not self._still_collecting():
            return

        if self._description_state == 1:
            # This cell was the description value; store it and finish.
            self.description = self._data
            self._description_state = 2
        elif self._address_state == 1:
            # This cell was the address value; store it and finish.
            self.address = self._data
            self._address_state = 2
        else:
            # Otherwise this may be a label cell announcing the next value.
            label = self._data.strip()
            if label == 'Description':
                self._description_state = 1
            elif label == 'Location':
                self._address_state = 1

        # We have now left the <td>.
        self._in_td = False

    def handle_data(self, data):
        # Accumulate text only inside a <td>, and only while data is wanted.
        if self._in_td and self._still_collecting():
            self._data += data
# TODO | |||
# find out what time of day this is run - does it matter that | |||
# we aren't being careful with daylight saving time etc. | |||
# Can we check that scraped email address really is | |||
# an email address? |
@@ -0,0 +1,29 @@ | |||
#!/usr/bin/python

# Generates one CGI scraper script per site listed in the sites CSV file.
#
# Each row of the CSV is used as the mapping for %-formatting the template
# file, so the row must supply every %(name)s key the template uses.  The
# result is written to "<authority_short_name>.cgi", prefixed with a "#!"
# line pointing at the python interpreter, and made executable.

import csv
import stat
from os import chmod

list_of_sites_filename = "PublicAccessSites.csv"
template_filename = "CGITemplate"
python_location = "/usr/bin/python"

# rwxr-xr-x (octal 755), built from the stat constants so that no
# version-specific octal literal syntax is needed.
cgi_file_mode = (stat.S_IRWXU |
                 stat.S_IRGRP | stat.S_IXGRP |
                 stat.S_IROTH | stat.S_IXOTH)

list_of_sites_file = open(list_of_sites_filename)
try:
    csv_reader = csv.DictReader(list_of_sites_file, quoting=csv.QUOTE_ALL, skipinitialspace=True)

    # Read the template, making sure its file handle is released.
    template_file = open(template_filename)
    try:
        template_contents = template_file.read()
    finally:
        template_file.close()

    template = "#!" + python_location + "\n\n" + template_contents

    for site_dict in csv_reader:
        filename = "%s.cgi" % site_dict["authority_short_name"]
        contents = template % site_dict

        this_file = open(filename, "w")
        try:
            print("Writing %s" % filename)
            this_file.write(contents)
        finally:
            # Close the output file even if the write fails.
            this_file.close()

        chmod(filename, cgi_file_mode)
finally:
    # The CSV file must stay open while the reader is being iterated,
    # so it is only closed once every site has been processed.
    list_of_sites_file.close()
# need to look at: | |||
# "Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/" |