"""Scraper for the Exmoor National Park weekly planning list."""

import urllib2
import urllib
import urlparse

import datetime
import time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import (
    PlanningApplication,
    PlanningAuthorityResults,
    getPostcodeFromText,
)

# strftime format used to fill the "weeklylist" query parameter of base_url.
search_date_format = "%d+%b+%Y"

# strptime format of the date found in the first cell of each result row.
received_date_format = "%d %b %Y"


class ExmoorParser:
    """Collect planning applications from the Exmoor weekly list page."""

    def __init__(self, *args):
        """Set up authority names, the search URL template and the results holder.

        Positional arguments are accepted and ignored.
        """
        self.authority_name = "Exmoor National Park"
        self.authority_short_name = "Exmoor"
        self.base_url = "http://www.exmoor-nationalpark.gov.uk/planning_weekly_list.htm?weeklylist=%s"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch the weekly list for the given date and record each application.

        day, month and year are integers passed to ``datetime.date``. One
        extra HTTP request is made per table row to read the application's
        detail page.

        Returns ``self._results`` with one application added per table row.
        Applications are added to the same results object on every call.
        """
        search_day = datetime.date(year, month, day)

        response = urllib2.urlopen(self.base_url % search_day.strftime(search_date_format))
        try:
            soup = BeautifulSoup(response.read())
        finally:
            # Release the connection even if reading or parsing fails.
            response.close()

        # The first row contains headers
        rows = soup.table.findAll("tr")[1:]

        for tr in rows:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.date_received = datetime.datetime.strptime(
                tds[0].string, received_date_format).date()

            # The quote_plus/unquote round trip effectively just replaces
            # spaces in the joined URL with '+'.
            application.info_url = urllib.unquote(
                urllib.quote_plus(urlparse.urljoin(self.base_url, tds[1].a['href'])))
            application.council_reference = tds[1].a.string.strip()
            application.address = tds[2].a.string.strip()
            application.postcode = getPostcodeFromText(application.address)

            # Now fetch the info url
            info_response = urllib.urlopen(application.info_url)
            try:
                info_soup = BeautifulSoup(info_response.read())
            finally:
                info_response.close()

            application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()

            try:
                application.comment_url = urlparse.urljoin(
                    self.base_url, info_soup.find(text="Comment").parent['href'])
            except (AttributeError, KeyError, TypeError):
                # No "Comment" text on the page (find() returned None), or
                # its parent has no href: fall back to the placeholder.
                application.comment_url = "No Comments"

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date via ``displayXML()``.

        day, month and year may be anything ``int()`` accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
    parser = ExmoorParser()
    print(parser.getResults(1, 8, 2008))