"""
The Shetland Isles site shows applications from the last 14 days.
These are paginated into groups of ten.
"""
import urllib2
import urllib
import urlparse
import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults

date_format = "%d/%m/%Y"

# The results page reports how many records the search found;
# this count drives the pagination loop below.
page_count_regex = re.compile(r"Records 1 to 10 of (\d*) Records Found")
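
# As an illustration (the count of 34 is invented sample text, not taken
# from the real site), the regex captures the total number of records:
#
#   >>> match = page_count_regex.search("Records 1 to 10 of 34 Records Found")
#   >>> int(match.groups()[0])
#   34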

class ShetlandParser:
    def __init__(self, *args):
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"

        # The %d placeholder takes a record offset, which is used to page
        # through the results ten records at a time.
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
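
    # Sketch of the pagination scheme (the offset values are examples):
    #   self.base_url % 0   -> records 1 to 10
    #   self.base_url % 10  -> records 11 to 20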

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.datetime(year, month, day)

        offset = 0

        # First get the search page
        response = urllib2.urlopen(self.base_url %(offset))
        contents = response.read()

        # First let's find out how many records there are (they are displayed ten per page).
        match = page_count_regex.search(contents)
        app_count = int(match.groups()[0])
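
        # For example, if app_count were 34 (a made-up figure), the loop
        # below would fetch four pages, at offsets 0, 10, 20 and 30.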

        while offset < app_count:
            if offset != 0:
                contents = urllib2.urlopen(self.base_url %(offset)).read()

            soup = BeautifulSoup(contents)

            # The apps are in the 5th table on the page (not a very good way to get it...)
            results_table = soup.findAll("table")[5]

            # Now we need to find the trs which contain the apps.
            # The first TR is just headers.
            # After that they alternate between containing an app and just some display graphics
            # until the third from last. After that, they contain more rubbish.
            trs = results_table.findAll("tr")[1:-2]

            for i in range(len(trs)):
                # We are only interested in the trs in even positions in the list.
                if i % 2 == 0:
                    tr = trs[i]

                    application = PlanningApplication()

                    application.council_reference = tr.a.string

                    # The received date is in the td following the comment link;
                    # parse its "%d/%m/%Y" string into a datetime.
                    comment_url_element = tr.find(text="comment on this planning application").parent
                    application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))

                    # If the date of this application is earlier than the date
                    # we are searching for then don't download it.
                    # We could optimize this a bit more by not doing the later pages.
                    if application.date_received < search_date:
                        break

                    application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
                    application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                    info_response = urllib2.urlopen(application.info_url)
                    info_soup = BeautifulSoup(info_response.read())
                    info_table = info_soup.findAll("table")[2]
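
                    # Each field on the detail page is labelled; find the
                    # label text and read the value from the td that follows.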
                    application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                    application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                    # Now to get the address. This will be split across several tds.
                    address_start_td = info_table.find("td", rowspan="4")

                    # We need the first bit of the address from this tr
                    address_bits = [address_start_td.findNext("td").string.strip()]

                    # We will need the first td from the next three trs after this
                    for address_tr in address_start_td.findAllNext("tr")[:3]:
                        address_line = address_tr.td.string.strip()

                        if address_line:
                            address_bits.append(address_line)

                    address_bits.append(application.postcode)

                    application.address = ', '.join(address_bits)

                    self._results.addApplication(application)
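
            # Move on to the next page of ten records.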
            offset += 10

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = ShetlandParser()

    # Note: to test this, you will need to pick a current date.
    print parser.getResults(9,6,2008)