import urllib2
import urllib
import urlparse

import datetime
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# strftime pattern producing "dd%2Fmm%2Fyyyy". The doubled "%%" yields a
# literal "%", so "%%2F" becomes "%2F" (a URL-encoded "/") and the result
# can be substituted directly into a query string.
date_format = "%d%%2F%m%%2F%Y"


class CalderdaleParser:
    """Fetches planning application search pages from the Calderdale
    Council website (www.calderdale.gov.uk)."""

    def __init__(self, *args):
        """Set up the authority names, URL templates and results container.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Calderdale Council"
        self.authority_short_name = "Calderdale"

        # Search URL template; the same date is used for both ends of the
        # date range (date1 and date2), so one day is searched at a time.
        self.base_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?status=0&date1=%(date)s&date2=%(date)s&Search=Search"

        # URL template taking a single "app" value.
        self.info_url = "http://www.calderdale.gov.uk/environment/planning/search-applications/planapps.jsp?app=%s&Search=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the search result pages for the given date.

        day, month and year are integers, passed to datetime.date.

        Starting from the search URL for that date, each page is downloaded
        and parsed, and the "Next" link (if any) is followed until there
        are no more pages or the server answers with an HTTP error.

        NOTE(review): the end of this method was not visible when this
        documentation was written, so the handling of each page and the
        return value are not described here.
        """
        search_date = datetime.date(year, month, day)

        next_page_url = self.base_url %{"date": search_date.strftime(date_format)}

        while next_page_url:
            try:
                response = urllib2.urlopen(next_page_url)
            except urllib2.HTTPError:
                # This is what seems to happen if there are no apps
                break

            soup = BeautifulSoup(response.read())

            # Look for the text node of the "Next" paging link; its parent
            # element carries the href of the following results page.
            next = soup.find(text="Next")
            if next:
                # The href may be relative, so resolve it against base_url.
                next_page_url = urlparse.urljoin(self.base_url, next.parent['href'])
            else:
                # No further pages: the loop ends after this iteration.
                next_page_url = None

            # There is an