# NOTE(review): this line is a lossy dump of most of the original module — the
# newlines have been collapsed and every span resembling an HTML/XML tag
# (anything between '<' and '>') appears to have been stripped.  That deleted
# the bodies of the regex patterns (e.g. `end_head_regex = re.compile("` is cut
# off mid-string and resumes at ` 0:` from an unrelated statement) and at least
# one whole method (the tag-open handler that pairs with the `elif tag == "td"`
# end-tag logic visible below).  The code is NOT syntactically valid as-is.
# What is still recognizable, in order:
#   * shebang + Python 2 imports: urllib/urllib2, HTMLParser, urlparse, re
#     (plus a commented-out BeautifulSoup import);
#   * the tail of an end-tag handler on an AcolnetParser class: it tracks
#     `_subtable_depth`, flushes `_current_application` into `_results` when a
#     results-table row closes, and clears `_in_td` on "td";
#   * getResultsByDayMonthYear(self, day, month, year): fetches self.base_url,
#     pulls the form action out with self.action_regex (a regex is used because,
#     per the original comment, the page "sometimes causes a problem in
#     HTMLParser"), POSTs regdate1/regdate2 (both the same date, formatted with
#     `date_format`) through a MultipartPostHandler opener, runs the response
#     through self._cleanupHTML, splits off everything before </head> with
#     end_head_regex (the header javascript upsets HTMLParser), feeds the body
#     to the parser, and returns self._results;
#   * getResults(...): int-coerces the arguments, delegates to
#     getResultsByDayMonthYear, and calls .displayXML() on the result;
#   * the opening of `class BaberghParser(AcolnetParser):`.
# `date`, `date_format`, `MultipartPostHandler`, `_cleanupHTML`, `base_url` and
# `_results` are referenced but their definitions/imports were lost in the same
# corruption — presumably `from datetime import date`, a module-level
# `date_format`, and an `import MultipartPostHandler`; TODO confirm against the
# original source (recover it from version control before editing).
#!/usr/local/bin/python import urllib, urllib2 import HTMLParser #from BeautifulSoup import BeautifulSoup import urlparse import re end_head_regex = re.compile(" 0: self._subtable_depth -= 1 else: # We need to add the last application in the table if self._current_application is not None: #print "adding application" self._results.addApplication(self._current_application) #print self._current_application self._current_application = None self._tr_number = None self._subtable_depth = None elif tag == "td": self._in_td = False def getResultsByDayMonthYear(self, day, month, year): # first we fetch the search page to get ourselves some session info... search_form_response = urllib2.urlopen(self.base_url) search_form_contents = search_form_response.read() #outfile = open("tmpfile", "w") #outfile.write(search_form_contents) # This sometimes causes a problem in HTMLParser, so let's just get the link # out with a regex... groups = self.action_regex.search(search_form_contents).groups() action = groups[0] #print action action_url = urlparse.urljoin(self.base_url, action) #print action_url our_date = date(year, month, day) search_data = {"regdate1": our_date.strftime(date_format), "regdate2": our_date.strftime(date_format), } opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler) response = opener.open(action_url, search_data) results_html = response.read() # This is for doing site specific html cleanup results_html = self._cleanupHTML(results_html) #some javascript garbage in the header upsets HTMLParser, #so we'll just have the body just_body = "" + end_head_regex.split(results_html)[-1] #outfile = open(self.authority_short_name + ".debug", "w") #outfile.write(just_body) self.feed(just_body) return self._results def getResults(self, day, month, year): return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() class BaberghParser(AcolnetParser): #search_url = 
# NOTE(review): continuation of the corrupted dump — configuration attributes
# of BaberghParser (the Babergh District Council subclass opened at the end of
# the previous line):
#   * the search URL for Babergh's Acolnet CGI endpoint (the `#search_url = `
#     fragment it belongs to is on the previous line);
#   * table-row indices used by the parser: case number in row 1 ("can be got
#     by the td class attribute" per the original comment), registration date
#     in row 2, location in row 4, proposal in row 5;
#   * commented-out authority_name / authority_short_name values;
#   * comments_email_address for planning comments (the original comment notes
#     it "would be nice to scrape this" rather than hard-code it).
# The trailing `action_regex = re.compile("` is truncated: its pattern
# contained a '<' (it matched the search form's action attribute — see the
# regex-based extraction in getResultsByDayMonthYear) and was lost in the same
# tag-stripping corruption.  Recover the pattern from the original source.
"http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 2 location_tr = 4 proposal_tr = 5 #authority_name = "Babergh District Council" #authority_short_name = "Babergh" # It would be nice to scrape this... comments_email_address = "planning.reception@babergh.gov.uk" action_regex = re.compile("