#!/usr/local/bin/python
# Screen scraper for planning applications hosted on Acolnet-based council
# sites.
#
# NOTE(review): this file was recovered from a badly mangled paste -- the
# newlines were collapsed and HTML fragments inside string literals (regex
# patterns, tag text) were stripped out.  Every place where text was lost is
# marked with a TODO(review) comment.

import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup
import urlparse
import re

from datetime import date

# TODO(review): this import was lost in the paste; getResultsByDayMonthYear
# builds its opener from MultipartPostHandler.MultipartPostHandler.
import MultipartPostHandler

# Date format the Acolnet search form expects for regdate1/regdate2.
# TODO(review): the literal was lost in the paste; dd/mm/yyyy is the presumed
# original -- confirm against a live search form.
date_format = "%d/%m/%Y"

# Used to throw away everything up to the end of the document <head>, because
# javascript garbage in the header upsets HTMLParser (see
# getResultsByDayMonthYear below).
# TODO(review): the pattern text was lost in the paste; "</head" is the
# presumed original.
end_head_regex = re.compile("</head", re.IGNORECASE)


class AcolnetParser(HTMLParser.HTMLParser):
    """Base parser for Acolnet planning-application search results.

    Subclasses configure themselves for a particular council site by setting
    the ``case_number_tr``/``reg_date_tr``/``location_tr``/``proposal_tr``
    row indices, ``comments_email_address`` and ``action_regex`` class
    attributes, and may override ``_cleanupHTML`` for site-specific fixes.

    NOTE(review): the class header, ``__init__`` and the start-tag/data
    handlers were destroyed in the paste; what follows is a hedged
    reconstruction of the surviving fragments.
    """

    def __init__(self, authority_name, authority_short_name, base_url):
        # Signature grounded by the __main__ driver, which constructs
        # BaberghParser(name, short_name, search_url).
        HTMLParser.HTMLParser.__init__(self)
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        # Parse state used by the tag handlers.
        self._current_application = None
        self._tr_number = None
        self._subtable_depth = None
        self._in_td = False

        # TODO(review): originally a results-collection object (its class
        # definition was lost in the paste); it must provide
        # addApplication() and displayXML().
        self._results = None

    def _cleanupHTML(self, html):
        """Site-specific html cleanup hook; the base class does nothing.

        (Presumed base implementation -- it was lost in the paste, but
        BassetlawParser overrides it and getResultsByDayMonthYear calls it
        unconditionally, so a pass-through default must have existed.)
        """
        return html

    def handle_endtag(self, tag):
        # NOTE(review): only this fragment of the end-tag handler survived
        # the paste; the matching start-tag and data handlers were lost.
        if tag == "table":
            if self._subtable_depth > 0:
                # Leaving a nested table inside the results table.
                self._subtable_depth -= 1
            else:
                # Leaving the results table itself: we need to add the last
                # application in the table.
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                self._current_application = None
                self._tr_number = None
                self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse all applications registered on one date.

        Returns the results object accumulated by the tag handlers.
        """
        # First we fetch the search page to get ourselves some session
        # info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # The search page sometimes causes a problem in HTMLParser, so let's
        # just get the form action out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action
        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        our_date = date(year, month, day)
        search_data = {
            "regdate1": our_date.strftime(date_format),
            "regdate2": our_date.strftime(date_format),
        }

        # The form wants a multipart POST.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup.
        results_html = self._cleanupHTML(results_html)

        # Some javascript garbage in the header upsets HTMLParser, so we'll
        # just have the body.
        # TODO(review): the string prefix was lost in the paste; "<html>" is
        # the presumed original.
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: coerces arguments to int, returns XML."""
        return self.getResultsByDayMonthYear(
            int(day), int(month), int(year)).displayXML()


class BaberghParser(AcolnetParser):
    #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
    case_number_tr = 1  # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    #authority_name = "Babergh District Council"
    #authority_short_name = "Babergh"
    # It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"
    # TODO(review): the form-action pattern was lost in the paste; it must
    # capture the search form's action URL in group 1.
    action_regex = re.compile("")
") class BasingstokeParser(AcolnetParser): #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 3 location_tr = 6 proposal_tr = 8 #authority_name = "Basingstoke and Deane Borough Council" #authority_short_name = "Basingstoke and Deane" # It would be nice to scrape this... comments_email_address = "development.control@basingstoke.gov.uk" action_regex = re.compile("") class BassetlawParser(AcolnetParser): #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 2 location_tr = 5 proposal_tr = 6 #authority_name = "Bassetlaw District Council" #authority_short_name = "Bassetlaw" comments_email_address = "planning@bassetlaw.gov.uk" action_regex = re.compile("", re.IGNORECASE) def _cleanupHTML(self, html): """There is a broken div in this page. 
We don't need any divs, so let's get rid of them all.""" div_regex = re.compile("]*>", re.IGNORECASE) return div_regex.sub('', html) class BridgenorthParser(AcolnetParser): #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 2 location_tr = 4 proposal_tr = 5 #authority_name = "Bridgenorth District Council" #authority_short_name = "Bridgenorth" comments_email_address = "contactus@bridgnorth-dc.gov.uk" action_regex = re.compile("") class BuryParser(AcolnetParser): #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 2 location_tr = 4 proposal_tr = 5 #authority_name = "Bury Metropolitan Borough Council" #authority_short_name = "Bury" comments_email_address = "development.control@bury.gov.uk" action_regex = re.compile("") ## class CanterburyParser(AcolnetParser): ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" ## case_number_tr = 1 # this one can be got by the td class attribute ## reg_date_tr = 2 ## location_tr = 4 ## proposal_tr = 5 ## authority_name = "Canterbury City Council" ## authority_short_name = "Canterbury" ## comments_email_address = "" ## action_regex = re.compile("") class CarlisleParser(AcolnetParser): #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 2 location_tr = 5 proposal_tr = 6 #authority_name = "Carlisle City Council" #authority_short_name = "Carlisle" comments_email_address = "dc@carlisle.gov.uk" action_regex = re.compile("") class DerbyParser(AcolnetParser): #search_url = 
"http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 3 location_tr = 4 proposal_tr = 5 #authority_name = "Derby City Council" #authority_short_name = "Derby" comments_email_address = "developmentcontrol@derby.gov.uk" action_regex = re.compile("") class CroydonParser(AcolnetParser): case_number_tr = 1 # this one can be got by the td class attribute reg_date_tr = 3 location_tr = 5 proposal_tr = 6 comments_email_address = "planning.control@croydon.gov.uk" action_regex = re.compile("") if __name__ == '__main__': day = 15 month = 3 year = 2007 # working # parser = BasingstokeParser() parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") # works with the divs stripped out #parser = BassetlawParser() # returns error 400 - bad request #parser = BridgenorthParser() # working #parser = BuryParser() # cambridgeshire is a bit different... # no advanced search page # canterbury # results as columns of one table # returns error 400 - bad request #parser = CarlisleParser() # working #parser = DerbyParser() print parser.getResults(day, month, year)