# From 945133bdeb8ad02b69d89248ac380045cebceab9 Mon Sep 17 00:00:00 2001
# From: "duncan.parkes"
# Date: Thu, 12 Jun 2008 14:40:28 +0000
# Subject: [PATCH] Add mostly done shetland scraper.
#
# Originally distributed as a git patch creating python_scrapers/Shetland.py.

"""Scraper for the planning application list published by Shetland Islands Council.

The listing page at ``ShetlandParser.base_url`` is fetched and, for each
application row, the linked detail page is fetched as well to obtain the
description, postcode and address.
"""

import urllib2
import urllib
import urlparse

import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# Format of the dates shown in the listing table (day/month/four-digit year).
date_format = "%d/%m/%Y"


class ShetlandParser:
    """Collects planning applications from the Shetland listing page."""

    def __init__(self, *args):
        """Set up the authority names, listing URL and empty result container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self):
        """Scrape the listing page and return the accumulated results.

        Note that, unlike the method name suggests, no day, month or year
        parameters are taken here: the listing at ``base_url`` is always
        fetched as-is.

        Returns the ``PlanningAuthorityResults`` object held in
        ``self._results`` after one application has been added per listing
        row. One extra HTTP request is made per application for its detail
        page.
        """
        # First get the search page
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        soup = BeautifulSoup(response.read())

        # The apps are in the table at index 5, i.e. the sixth table on the
        # page (not a very good way to get it...)
        results_table = soup.findAll("table")[5]

        # Now we need to find the trs which contain the apps.
        # The first TR is just headers.
        # After that they alternate between containing an app and just some
        # display graphics. The last two rows contain more rubbish, so they
        # are dropped as well.
        candidate_rows = results_table.findAll("tr")[1:-2]

        # Only the rows in even positions hold applications.
        for tr in candidate_rows[::2]:
            application = PlanningApplication()

            application.council_reference = tr.a.string
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

            # The comment link must be located *before* the date is parsed:
            # the received date is read from the first td following it.
            comment_url_element = tr.find(text="comment on this planning application").parent
            application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])

            date_text = comment_url_element.findNext("td").string.strip()
            application.date_received = datetime.datetime(*(time.strptime(date_text, date_format)[0:6]))

            # The remaining fields come from the application's own page.
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())
            info_table = info_soup.findAll("table")[2]

            application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
            application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

            # Now to get the address. This will be split across several tds.
            address_start_td = info_table.find("td", rowspan="4")

            # We need the first bit of the address from this tr
            address_bits = [address_start_td.findNext("td").string.strip()]

            # We will need the first td from the next three trs after this
            for address_tr in address_start_td.findAllNext("tr")[:3]:
                address_line = address_tr.td.string.strip()

                # Skip address lines that are blank after stripping.
                if address_line:
                    address_bits.append(address_line)

            address_bits.append(application.postcode)

            application.address = ', '.join(address_bits)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return ``displayXML()`` of the scraped results.

        ``day``, ``month`` and ``year`` are accepted but not used; the
        listing page is scraped regardless of the date requested.
        """
        return self.getResultsByDayMonthYear().displayXML()


if __name__ == '__main__':
    parser = ShetlandParser()
    print(parser.getResults(21, 5, 2008))

# TODO: Sort out pagination