import urllib2
import urllib
import urlparse
import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"


class ShetlandParser:
    def __init__(self, *args):
        self.authority_name = "Shetland Islands Council"
        self.authority_short_name = "Shetland Islands"
        self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=0"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self):
        # Note that we don't take the day, month and year parameters here.

        # First get the search page
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        soup = BeautifulSoup(response.read())
        # The apps are in the table at index 5 on the page
        # (not a very robust way to find it...)
        results_table = soup.findAll("table")[5]

        # Now we need to find the trs which contain the apps.
        # The first tr is just headers.
        # After that they alternate between a tr containing an app and one
        # containing only display graphics, until the third from last.
        # The final two trs contain more rubbish, so we drop them.
        trs = results_table.findAll("tr")[1:-2]

        for i in range(len(trs)):
            # We are only interested in the trs in even positions in the list.
            if i % 2 == 0:
                tr = trs[i]

                application = PlanningApplication()
                application.council_reference = tr.a.string

                # The "comment on this planning application" link has to be
                # found first: the date received lives in the td that
                # immediately follows it.
                comment_url_element = tr.find(text="comment on this planning application").parent

                application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
                application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup(info_response.read())

                info_table = info_soup.findAll("table")[2]

                application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
                application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()

                # Now to get the address. This will be split across several tds.
                address_start_td = info_table.find("td", rowspan="4")

                # We need the first bit of the address from this tr
                address_bits = [address_start_td.findNext("td").string.strip()]

                # We will need the first td from the next three trs after this
                for address_tr in address_start_td.findAllNext("tr")[:3]:
                    address_line = address_tr.td.string.strip()

                    if address_line:
                        address_bits.append(address_line)

                address_bits.append(application.postcode)

                application.address = ', '.join(address_bits)

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear().displayXML()


if __name__ == '__main__':
    parser = ShetlandParser()
    print parser.getResults(21, 5, 2008)

# TODO: Sort out pagination
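
# A possible approach to the pagination TODO above: the search URL already
# carries an "offset" query parameter, so later pages could probably be
# fetched by rewriting it. This is an untested sketch, not yet wired into the
# parser; the assumption that the site pages purely by this offset (and
# whatever the step size turns out to be) would need checking against the
# live site first.
def replace_offset(url, offset):
    """Return the given search url with its offset query parameter replaced."""
    scheme, netloc, path, query, fragment = urlparse.urlsplit(url)

    # keep_blank_values so that empty parameters like Pref= and Address= survive.
    params = cgi.parse_qs(query, keep_blank_values=True)
    params["offset"] = [str(offset)]

    return urlparse.urlunsplit((scheme, netloc, path, urllib.urlencode(params, doseq=True), fragment))

# getResultsByDayMonthYear could then loop over replace_offset(self.base_url, n)
# for n = 0, step, 2 * step, ... until a fetched page yields no application rows.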