瀏覽代碼

Fix error with bad info and comment urls for Acolnet scrapers.

There is a bug in urlparse.urljoin in Python 2.5 which means that a
relative URL consisting only of a query string is not correctly joined
to the base URL. This commit works around that.

Once we have Python 2.6 it can all go back to how it was before.
import/raw
duncan.parkes@gmail.com 14 年之前
父節點
當前提交
07c06c6034
共有 1 個檔案被更改,包括 19 行新增和 6 行刪除
  1. +19
    -6
      trunk/python_scrapers/AcolnetParser.py

+ 19
- 6
trunk/python_scrapers/AcolnetParser.py 查看文件

@@ -78,7 +78,20 @@ class AcolnetParser:
"""
url = app_table.a['href']
self._current_application.system_key = system_key_regex.search(url).groups()[0]
return urlparse.urljoin(self.base_url, url)

# Calling urlparse.urljoin is the right way to do this, but it doesn't work
# in Python 2.5, where urljoin doesn't quite implement RFC 3986. It will work
# fine once we are on Python 2.6.
# info_url = urlparse.urljoin(self.base_url, url)

# In the meantime, we have to work around it by assuming that url is just
# a query string and grafting it onto the components of the base URL.

split_base_url = urlparse.urlsplit(self.base_url)
split_info_url = urlparse.urlsplit(url)
info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:])

return info_url

def _getCommentUrl(self, app_table):
"""This must be run after _getInfoUrl"""
@@ -341,9 +354,9 @@ class HarlowParser(AcolnetParser):
return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&hasreference=no")

if __name__ == '__main__':
day = 12
month = 6
year = 2009
day = 6
month = 8
year = 2010

#parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = AcolnetParser("Barnet", "Barnet", "http://194.75.183.100/planning-cases/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
@@ -357,8 +370,8 @@ if __name__ == '__main__':
#parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
# parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "http://www.planning.stoke.gov.uk/dataonlineplanning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
# parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "http://www.planning.stoke.gov.uk/dataonlineplanning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = BoltonParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")


Loading…
取消
儲存