import urllib2 import urllib import urlparse import datetime, time import cgi import cookielib cookie_jar = cookielib.CookieJar() from BeautifulSoup import BeautifulSoup from PlanningUtils import PlanningApplication, \ PlanningAuthorityResults, \ getPostcodeFromText #date_format = "%d-%m-%Y" date_format = "%d/%m/%Y" received_date_format = "%d %B %Y" import re # We're going to use this for a re.split # A whitespace char, "of" or "at" (case independent), and then a whitespace char. address_finder_re = re.compile("\s(?:of)|(?:at)\s", re.I) class HaltonParser: def __init__(self, *args): self.authority_name = "Halton Borough Council" self.authority_short_name = "Halton" self.base_url = "http://www.halton.gov.uk/planningapps/index.asp" self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search def getResultsByDayMonthYear(self, day, month, year): search_day = datetime.date(year, month, day) # It seems dates are interpreted as midnight on post_data = urllib.urlencode( [ # ("CaseNo", ""), # ("AppName", ""), ("DateApValFrom", search_day.strftime(date_format)), ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)), # ("AdrsNo", ""), # ("StName", ""), # ("StTown", ""), ("DropWeekDate", "0"),#search_day.strftime(date_format)), ("DropAppealStatus", "0"), # ("DateAppealValFrom", ""), # ("DateAppealValTo", ""), ("PageSize", "10"), ("Action", "Search"), ] ) request = urllib2.Request(self.base_url, post_data) while request: # Now get the search page # We need to deal with cookies, since pagination depends on them. cookie_jar.add_cookie_header(request) response = urllib2.urlopen(request) cookie_jar.extract_cookies(response, request) soup = BeautifulSoup(response.read()) # This should find us each Case on the current page. caseno_strings = soup.findAll(text="Case No:") for caseno_string in caseno_strings: application = PlanningApplication() application.council_reference = caseno_string.findNext("td").string application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip() application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date() # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts. # As a first go, we'll try splitting the description on the last occurence of " of " or " at ". try: application.address = re.split(address_finder_re, application.description)[-1].strip() except IndexError: # If we can't find of or at, we'll just have the description again, it's better than nothing. application.address = application.description # We may as well get the postcode from the description rather than the address, in case things have gone wrong application.postcode = getPostcodeFromText(application.description) application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action']) # Now what to have as info url... # There is no way to link to a specific app, so we'll just have the search page. application.info_url = self.base_url self._results.addApplication(application) # Now we need to find the post data for the next page, if there is any. # Find the form with id "formNext", if there is one next_form = soup.find("form", id="formNext") if next_form is not None: action = next_form['action'] # The HTML is borked - the inputs are outside the form, they are all # in a td which follows it. inputs = next_form.findNext("td").findAll("input") post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs]) request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data) else: request = None return self._results def getResults(self, day, month, year): return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() if __name__ == '__main__': parser = HaltonParser() print parser.getResults(4,8,2008)