import urllib2
import urllib
import urlparse

import datetime
import time
import cgi

import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# Format used for the "from" / "until" fields of the search form.
date_format = "%d/%m/%Y"


class BrentParser:
    """Scraper for planning applications listed by the London Borough of Brent.

    Results are accumulated in a PlanningAuthorityResults instance that is
    created once per parser, so repeated searches on the same parser add to
    the same result set.
    """

    def __init__(self, *args):
        """Set up the authority names, the search URL and the result store.

        Positional arguments are accepted and ignored.
        """
        self.authority_name = "London Borough of Brent"
        self.authority_short_name = "Brent"

        # TODO(review): the search URL is empty in this copy of the file (an
        # earlier, commented-out assignment was empty as well).  urlopen()
        # cannot fetch an empty URL, so the real endpoint has to be restored
        # before this parser can run.
        self.base_url = ""

        # Application currently being assembled while walking the result rows.
        self._current_application = None

        self._results = PlanningAuthorityResults(self.authority_name,
                                                 self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse the applications received on a single day.

        day, month and year are integers.  The search form is POSTed to
        self.base_url with the same date as both ends of the range, and the
        resulting table is walked row by row.

        Returns the parser's PlanningAuthorityResults object, with every
        application that reports is_ready() added to it.
        """
        # NOTE(review): reconstructed - only ", month, day)" of this
        # expression survived in the file.
        search_day = datetime.date(year, month, day)
        search_day_text = search_day.strftime(date_format)

        # NOTE(review): several values below already look URL-encoded, and
        # urllib.urlencode will encode them again.  They are left exactly as
        # found - confirm that this is what the server expects.
        post_data = [
            ("from", search_day_text),
            ("until", search_day_text),
            ("EXECUTEQUERY", "Query"),
            # ("auth", "402"),
            ("st", "PL"),
            ("periodUnits", "day"),
            ("periodMultiples", "14"),
            ("title", "Search+by+Application+Date"),
            ("instructions", "Enter+a+date+range+to+search+for+existing+applications+by+the+date+of+application.%0D%0A%3Cbr%3E%3Cbr%3E%0D%0A%3Cstrong%3ENote%3A%3C%2Fstrong%3E+Where+%27%28Applicant%27s+Description%29%27+appears+in+the+proposal%2C+the+text+may+subsequently+be+amended+when+the+application+is+checked."),
            ("byFormat", "N"),
            ("byOther1", "N"),
            ("byOther2", "N"),
            ("byOther3", "N"),
            ("byOther4", "N"),
            ("byOther5", "N"),
            ("byPostcode", "N"),
            ("byStreet", "N"),
            ("byHouseNumber", "N"),
            ("byAddress", "N"),
            ("byPeriod", "Y"),
            ("extId", "101149"),  # I wonder what this is...
            ("queried", "Y"),
            ("other1Label", "Other1"),
            ("other2Label", "Other2"),
            ("other3Label", "Other3"),
            ("other4Label", "Other4"),
            ("other5Label", "Other5"),
            ("other1List", ""),
            ("other2List", ""),
            ("other3List", ""),
            ("other4List", ""),
            ("other5List", ""),
            ("periodLabel", "From"),
            ("addressLabel", "Select+Address"),
            ("print", ""),
        ]

        # Now get the search page
        response = urllib2.urlopen(self.base_url, urllib.urlencode(post_data))
        # NOTE(review): reconstructed - the argument to BeautifulSoup( was
        # missing from the file.
        soup = BeautifulSoup(response.read())

        # Every row of the table after the "Search Results" text, except the
        # last one.
        trs = soup.find(text="Search Results").findNext("table").findAll("tr")[:-1]

        # There are six trs per application, ish
        # The first contains the case no and the application date.
        # The second contains the address
        # The third contains the description
        # The fourth contains the info page link
        # The fifth contains the comment link (or a note that comments are
        # currently not being accepted)
        # The sixth is a spacer.

        case_no_pattern = re.compile("Case No:")

        count = 0
        for tr in trs:
            count += 1

            ref = tr.find(text=case_no_pattern)

            if ref:
                # A "Case No:" row starts a new application, and restarting
                # the count here keeps the row positions in step even when an
                # application does not have exactly six rows.
                self._current_application = PlanningApplication()
                count = 1

                self._current_application.council_reference = ref.split(":")[1].strip()
                self._current_application.date_received = search_day

            if self._current_application is None:
                # Rows ahead of the first "Case No:" row belong to no
                # application, so there is nothing to fill in.
                continue

            position = count % 6

            if position == 2:
                # NOTE(review): the right-hand side of this assignment was
                # missing from the file; taking the stripped text of the
                # row's first cell is a reconstruction - confirm.
                self._current_application.address = tr.td.string.strip()
                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            elif position == 3:
                # NOTE(review): reconstructed in the same way as the address
                # above - confirm.
                self._current_application.description = tr.td.string.strip()
            elif position == 4:
                self._current_application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
            elif position == 5:
                try:
                    self._current_application.comment_url = urlparse.urljoin(self.base_url, tr.a['href'])
                except (TypeError, KeyError):
                    # No link in this row (tr.a is None) or the link has no
                    # href: comments are not currently being accepted.
                    # We'll leave this app for the moment - we'll pick it up
                    # later if they start accepting comments
                    continue
            elif position == 0 and self._current_application.is_ready():
                self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        """Return displayXML() of the results for the given day.

        day, month and year may be anything int() accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
    parser = BrentParser()
    # Parenthesised so the statement reads the same under Python 2's print
    # statement (single argument) and as a function call.
    print(parser.getResults(6, 8, 2008))