From 7ee4ac0326ed53d890fc18fae04211ca154d6b7b Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sun, 25 Nov 2007 12:54:19 +0000 Subject: [PATCH] Allow &amp; in place of & for one of the urls (to fix Oldham, which has suddenly started to need this...) --- python_scrapers/AcolnetParser.py | 24 +++++++++++++++--------- python_scrapers/SitesToGenerate.csv | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py index 8250369..280f6de 100644 --- a/python_scrapers/AcolnetParser.py +++ b/python_scrapers/AcolnetParser.py @@ -41,7 +41,9 @@ class AcolnetParser(HTMLParser.HTMLParser): # appropriate email address instead comments_email_address = None - action_regex = re.compile("]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE) + # The optional amp; is to cope with Oldham, which seems to have started + # quoting this url. + action_regex = re.compile("]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE) def __init__(self, authority_name, @@ -175,9 +177,6 @@ class AcolnetParser(HTMLParser.HTMLParser): search_form_contents = search_form_response.read() - #outfile = open("tmpfile", "w") - #outfile.write(search_form_contents) - # This sometimes causes a problem in HTMLParser, so let's just get the link # out with a regex... 
@@ -186,8 +185,12 @@ class AcolnetParser(HTMLParser.HTMLParser): action = groups[0] #print action + # This is to handle the amp; which seems to have appeared in this + # url on the Oldham site + action = ''.join(action.split('amp;')) + action_url = urlparse.urljoin(self.base_url, action) - #print action_url + print action_url our_date = date(year, month, day) @@ -199,6 +202,9 @@ class AcolnetParser(HTMLParser.HTMLParser): response = opener.open(action_url, search_data) results_html = response.read() + #outfile = open("tmpfile", "w") + #outfile.write(results_html) + # This is for doing site specific html cleanup results_html = self._cleanupHTML(results_html) @@ -508,9 +514,9 @@ class SurreyHeathParser(AcolnetParser): if __name__ == '__main__': - day = 22 - month = 2 - year = 2005 + day = 20 + month = 11 + year = 2007 # returns error 400 - bad request #parser = BridgenorthParser() @@ -523,6 +529,6 @@ if __name__ == '__main__': #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch") + parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch") print parser.getResults(day, month, year) diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 9676f4b..9199177 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -104,7 +104,7 @@ "Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "CarlisleParser" "Newcastle City Council", "Newcastle", "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "North 
Wiltshire District Council", "North Wiltshire", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NorthWiltshireParser" -"Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser" +"Oldham Metropolitan Borough Council", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "OldhamParser" "Renfrewshire Council", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "RenfrewshireParser" "Selby District Council", "Selby", "http://publicaccess.selby.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "South Bedfordshire District Council", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "SouthBedfordshireParser"