From 0710aac91c97ee488424295c4cf2bc277e0255f3 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sat, 14 Aug 2010 16:42:01 +0000 Subject: [PATCH] Fix error with bad info and comment urls for Acolnet scrapers. There is a bug in urlparse.urljoin which means that a relative URL which is just a query string is not correctly added to the base. This is a workaround for that. Once we have Python 2.6 it can all go back to how it was before. --- python_scrapers/AcolnetParser.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py index ddd5893..a1ce4b8 100644 --- a/python_scrapers/AcolnetParser.py +++ b/python_scrapers/AcolnetParser.py @@ -78,7 +78,20 @@ class AcolnetParser: """ url = app_table.a['href'] self._current_application.system_key = system_key_regex.search(url).groups()[0] - return urlparse.urljoin(self.base_url, url) + + # This is the right way to do this, but it doesn't work in Python 2.5 as + # it doesn't quite implement RFC 3986. This will work fine when we are on + # Python 2.6 + # info_url = urlparse.urljoin(self.base_url, url) + + # In the meantime, we'll have to work around it. Let's assume url + # is a query string + + split_base_url = urlparse.urlsplit(self.base_url) + split_info_url = urlparse.urlsplit(url) + info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:]) + + return info_url def _getCommentUrl(self, app_table): """This must be run after _getInfoUrl""" @@ -341,9 +354,9 @@ class HarlowParser(AcolnetParser): return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&hasreference=no") if __name__ == '__main__': - day = 12 - month = 6 - year = 2009 + day = 6 + month = 8 + year = 2010 #parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = AcolnetParser("Barnet", "Barnet", "http://194.75.183.100/planning-cases/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch") @@ -357,8 +370,8 @@ if __name__ == '__main__': #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser") - # parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") - parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "http://www.planning.stoke.gov.uk/dataonlineplanning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + # parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "http://www.planning.stoke.gov.uk/dataonlineplanning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = BoltonParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch") #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")