diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py index 56eaaa5..768e5cd 100644 --- a/python_scrapers/AcolnetParser.py +++ b/python_scrapers/AcolnetParser.py @@ -1,41 +1,40 @@ #!/usr/local/bin/python -import urllib, urllib2 -import HTMLParser -#from BeautifulSoup import BeautifulSoup +import urllib2 +import urlparse + +from datetime import date +import datetime + +import re + +from BeautifulSoup import BeautifulSoup # Adding this to try to help Surrey Heath - Duncan 14/9/2007 import cookielib cookie_jar = cookielib.CookieJar() ################ -import urlparse - -import re - -end_head_regex = re.compile(" for Bridgnorth, which doesn't have broken html +end_head_regex = re.compile("?", re.IGNORECASE) + + class AcolnetParser(HTMLParser.HTMLParser): - case_number_tr = None # this one can be got by the td class attribute - reg_date_tr = None - location_tr = None - proposal_tr = None + received_date_format = "%d/%m/%Y" + + comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s" # There is no online comment facility in these, so we provide an # appropriate email address instead @@ -44,32 +43,60 @@ class AcolnetParser(HTMLParser.HTMLParser): # The optional amp; is to cope with Oldham, which seems to have started # quoting this url. action_regex = re.compile("