From 03084bd2dbbe13d6ac628890df5c805d52791200 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Thu, 12 Jun 2008 17:15:14 +0000
Subject: [PATCH] Add scraper for Shetland Islands. Encode all output as UTF-8.

---
 python_scrapers/CGITemplate          |   2 +-
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/PlanningUtils.py     |  33 +++----
 python_scrapers/Shetland.py          | 124 ++++++++++++++++-----------
 python_scrapers/SitesToGenerate.csv  |   1 +
 5 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/python_scrapers/CGITemplate b/python_scrapers/CGITemplate
index 0950f07..03ea055 100644
--- a/python_scrapers/CGITemplate
+++ b/python_scrapers/CGITemplate
@@ -28,4 +28,4 @@ xml = parser.getResults(day, month, year)
 print "Content-Type: text/xml" # XML is following
 print
-print xml # print the xml
+print xml.encode("utf-8") # print the xml
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index f35d274..e0e0eca 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -23,3 +23,4 @@
 "Ocella.py", "420"
 "IsleOfWight.py", "420"
 "Barnsley.py", "420"
+"Shetland.py", "420"
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index 273a652..f97affd 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -65,12 +65,13 @@ class PlanningAuthorityResults:
 
         applications_bit = "".join([x.displayXML() for x in self.planning_applications])
 
-        return "<planning>\n" +\
-               "<authority_name>%s</authority_name>\n" %self.authority_name +\
-               "<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
-               "<applications>\n" + applications_bit +\
-               "</applications>\n" +\
-               "</planning>\n"
+        return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
+               u"<planning>\n" +\
+               u"<authority_name>%s</authority_name>\n" %self.authority_name +\
+               u"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
+               u"<applications>\n" + applications_bit +\
+               u"</applications>\n" +\
+               u"</planning>\n"
 
 
 
@@ -104,12 +105,14 @@ class PlanningApplication:
 
     def displayXML(self):
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
-        return "<application>\n" +\
-               "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
-               "<address>%s</address>\n" %xmlQuote(self.address) +\
-               "<postcode>%s</postcode>\n" %self.postcode +\
-               "<description>%s</description>\n" %xmlQuote(self.description) +\
-               "<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
-               "<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
-               "<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
-               "</application>\n"
+        return u"<application>\n" +\
+               u"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
+               u"<address>%s</address>\n" %xmlQuote(self.address) +\
+               u"<postcode>%s</postcode>\n" %self.postcode +\
+               u"<description>%s</description>\n" %xmlQuote(self.description) +\
+               u"<info_url>%s</info_url>\n" %xmlQuote(self.info_url) +\
+               u"<comment_url>%s</comment_url>\n" %xmlQuote(self.comment_url) +\
+               u"<date_received>%s</date_received>\n" %self.date_received.strftime(date_format) +\
\n" + + diff --git a/python_scrapers/Shetland.py b/python_scrapers/Shetland.py index c42877e..45a7ae4 100644 --- a/python_scrapers/Shetland.py +++ b/python_scrapers/Shetland.py @@ -1,9 +1,13 @@ +""" +The Shetland Isles site shows applications from the last 14 days. +These are paginated into groups of ten. +""" + import urllib2 -import urllib import urlparse import datetime, time -import cgi +import re from BeautifulSoup import BeautifulSoup @@ -13,88 +17,110 @@ from PlanningUtils import PlanningApplication, \ date_format = "%d/%m/%Y" +page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found") + class ShetlandParser: def __init__(self, *args): self.authority_name = "Shetland Islands Council" self.authority_short_name = "Shetland Islands" - self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0" + self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d" self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) - def getResultsByDayMonthYear(self): - # Note that we don't take the day, month and year parameters here. + def getResultsByDayMonthYear(self, day, month, year): + search_date = datetime.datetime(year, month, day) + + offset = 0 # First get the search page - request = urllib2.Request(self.base_url) - response = urllib2.urlopen(request) + response = urllib2.urlopen(self.base_url %(offset)) + + contents = response.read() - soup = BeautifulSoup(response.read()) + # First let's find out how many records there are (they are displayed ten per page). + match = page_count_regex.search(contents) + app_count = int(match.groups()[0]) - # The apps are in the 5th table on the page (not a very good way to get it...) - results_table = soup.findAll("table")[5] + while offset < app_count: + if offset != 0: + contents = urllib2.urlopen(self.base_url %(offset)).read() - # Now we need to find the trs which contain the apps. - # The first TR is just headers. - # After that they alternate between containing an app and just some display graphics - # until the third from last. After that, they contain more rubbish. + soup = BeautifulSoup(contents) + + # The apps are in the 5th table on the page (not a very good way to get it...) + results_table = soup.findAll("table")[5] - trs = results_table.findAll("tr")[1:-2] + # Now we need to find the trs which contain the apps. + # The first TR is just headers. + # After that they alternate between containing an app and just some display graphics + # until the third from last. After that, they contain more rubbish. - for i in range(len(trs)): - # We are only interested in the trs in even positions in the list. - if i % 2 == 0: - tr = trs[i] + trs = results_table.findAll("tr")[1:-2] - application = PlanningApplication() + for i in range(len(trs)): + # We are only interested in the trs in even positions in the list. 
+                if i % 2 == 0:
+                    tr = trs[i]
 
-                application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
+                    application = PlanningApplication()
 
-                application.council_reference = tr.a.string
+                    comment_url_element = tr.find(text="comment on this planning application").parent
+                    application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
 
-                comment_url_element = tr.find(text="comment on this planning application").parent
-                application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
+                    # If the date of this application is earlier than the date
+                    # we are searching for then don't download it.
+                    # We could optimize this a bit more by not doing the later pages.
 
-                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
+                    if application.date_received < search_date:
+                        break
 
-                info_response = urllib2.urlopen(application.info_url)
+                    application.council_reference = tr.a.string
 
-                info_soup = BeautifulSoup(info_response.read())
-
-                info_table = info_soup.findAll("table")[2]
+                    application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
 
-                application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
-                application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+                    application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
 
-                # Now to get the address. This will be split across several tds.
+                    info_response = urllib2.urlopen(application.info_url)
 
-                address_start_td = info_table.find("td", rowspan="4")
+                    info_soup = BeautifulSoup(info_response.read())
 
-                # We need the first bit of the address from this tr
-                address_bits = [address_start_td.findNext("td").string.strip()]
+                    info_table = info_soup.findAll("table")[2]
 
-                # We will need the first td from the next three trs after this
-                for address_tr in address_start_td.findAllNext("tr")[:3]:
-                    address_line = address_tr.td.string.strip()
-
-                    if address_line:
-                        address_bits.append(address_line)
-
-                address_bits.append(application.postcode)
+                    application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
+                    application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
+
+                    # Now to get the address. This will be split across several tds.
+
+                    address_start_td = info_table.find("td", rowspan="4")
 
-                application.address = ', '.join(address_bits)
+                    # We need the first bit of the address from this tr
+                    address_bits = [address_start_td.findNext("td").string.strip()]
 
-                self._results.addApplication(application)
-
+                    # We will need the first td from the next three trs after this
+                    for address_tr in address_start_td.findAllNext("tr")[:3]:
+                        address_line = address_tr.td.string.strip()
+
+                        if address_line:
+                            address_bits.append(address_line)
+
+                    address_bits.append(application.postcode)
+
+                    application.address = ', '.join(address_bits)
+
+                    self._results.addApplication(application)
+
+            offset += 10
 
         return self._results
 
     def getResults(self, day, month, year):
-        return self.getResultsByDayMonthYear().displayXML()
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
 
 if __name__ == '__main__':
     parser = ShetlandParser()
-    print parser.getResults(21,5,2008)
-# TODO: Sort out pagination
+    # Note: to test this, you will need to pick a current date.
+    print parser.getResults(9,6,2008)
+
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index d6cd45d..af74a3d 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -221,3 +221,4 @@
 "Isle of Wight Council", "Isle of Wight", "", "IsleOfWight", "IsleOfWightParser"
 "Barnsley Metropolitan Borough Council", "Barnsley", "", "Barnsley", "BarnsleyParser"
 "Daventry District Council", "Daventry", "http://www.daventrydc.gov.uk/swiftlg/apas/run/wphappcriteria.display", "SwiftLG", "SwiftLGParser"
+"Shetland Islands Council", "Shetland Islands", "", "Shetland", "ShetlandParser"
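
Editor's note: the core of this patch is the offset-based pagination added to Shetland.py. The sketch below is not part of the patch; it restates that pattern standalone, under the assumption that base_url contains a %d offset placeholder as Shetland.py's self.base_url does. page_count_regex is copied from the diff above; fetch_all_pages is a hypothetical helper name.

    # Hypothetical illustration only -- not part of the patch.
    # Mirrors the pagination in Shetland.py: the first page reveals the
    # total record count, then further pages are fetched at offsets 10, 20, ...
    import re
    import urllib2

    page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")

    def fetch_all_pages(base_url):
        """base_url must contain a %d placeholder for the offset."""
        offset = 0
        contents = urllib2.urlopen(base_url % offset).read()
        # The first page tells us how many records there are in total.
        app_count = int(page_count_regex.search(contents).groups()[0])
        pages = [contents]
        offset += 10
        while offset < app_count:
            pages.append(urllib2.urlopen(base_url % offset).read())
            offset += 10
        return pages

Note that in the patch itself, the early exit when an application is older than the search date only breaks out of the inner for loop; as the comment in the diff concedes, later pages are still fetched.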
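The CGITemplate and PlanningUtils changes belong together: PlanningUtils now builds the XML as a unicode object (the u"" literals), so the CGI script must encode it before printing. A minimal sketch of why, assuming a non-ASCII character in the output (the example string is invented, not from the patch):

    # Hypothetical illustration only -- not part of the patch.
    # In Python 2, print-ing a unicode string falls back to ASCII when
    # stdout has no declared encoding (as under a web server), so any
    # non-ASCII character raises UnicodeEncodeError.
    xml = u"<description>caf\xe9 extension</description>"
    print xml.encode("utf-8")  # safe: bytes are emitted explicitly as UTF-8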