diff --git a/cgi-bin/AcolnetParser.py b/cgi-bin/AcolnetParser.py new file mode 100644 index 0000000..55e2796 --- /dev/null +++ b/cgi-bin/AcolnetParser.py @@ -0,0 +1,364 @@ +#!/usr/local/bin/python + +import urllib, urllib2 +import HTMLParser +#from BeautifulSoup import BeautifulSoup + +import urlparse + +import re + +end_head_regex = re.compile(" 0: + self._subtable_depth -= 1 + else: + # We need to add the last application in the table + if self._current_application is not None: + #print "adding application" + self._results.addApplication(self._current_application) + #print self._current_application + self._current_application = None + self._tr_number = None + self._subtable_depth = None + elif tag == "td": + self._in_td = False + + def getResultsByDayMonthYear(self, day, month, year): + # first we fetch the search page to get ourselves some session info... + search_form_response = urllib2.urlopen(self.base_url) + search_form_contents = search_form_response.read() + + # This sometimes causes a problem in HTMLParser, so let's just get the link + # out with a regex... + + groups = self.action_regex.search(search_form_contents).groups() + + action = groups[0] + #print action + + action_url = urlparse.urljoin(self.base_url, action) + #print action_url + + our_date = date(year, month, day) + + search_data = {"regdate1": our_date.strftime(date_format), + "regdate2": our_date.strftime(date_format), + } + + opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler) + response = opener.open(action_url, search_data) + results_html = response.read() + + # This is for doing site specific html cleanup + results_html = self._cleanupHTML(results_html) + + #some javascript garbage in the header upsets HTMLParser, + #so we'll just have the body + just_body = "" + end_head_regex.split(results_html)[-1] + + #outfile = open(self.authority_short_name + ".debug", "w") + #outfile.write(just_body) + + self.feed(just_body) + + return self._results + + + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + + +class BaberghParser(AcolnetParser): + #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Babergh District Council" + #authority_short_name = "Babergh" + + # It would be nice to scrape this... + comments_email_address = "planning.reception@babergh.gov.uk" + + action_regex = re.compile("