From 183354af1e2ec9b80e4957b484a394a5298472cf Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Tue, 11 Sep 2007 15:43:14 +0000 Subject: [PATCH] fix some more scrapers, and embarrassing all PublicAccess comment urls broken bug. --- cgi-bin/Bracknell Forest.cgi | 29 +++++++++++++++++++++++++++++ cgi-bin/Broads Authority.cgi | 29 +++++++++++++++++++++++++++++ cgi-bin/Broads.cgi | 29 +++++++++++++++++++++++++++++ cgi-bin/Chiltern.cgi | 29 +++++++++++++++++++++++++++++ cgi-bin/Hinkley and Bosworth.cgi | 29 +++++++++++++++++++++++++++++ cgi-bin/Perthshire.cgi~ | 29 +++++++++++++++++++++++++++++ cgi-bin/PublicAccess.py | 16 ++++++++++------ python_scrapers/PublicAccess.py | 16 ++++++++++------ python_scrapers/SitesToGenerate.csv | 6 +++++- 9 files changed, 199 insertions(+), 13 deletions(-) create mode 100755 cgi-bin/Bracknell Forest.cgi create mode 100755 cgi-bin/Broads Authority.cgi create mode 100755 cgi-bin/Broads.cgi create mode 100755 cgi-bin/Chiltern.cgi create mode 100755 cgi-bin/Hinkley and Bosworth.cgi create mode 100755 cgi-bin/Perthshire.cgi~ diff --git a/cgi-bin/Bracknell Forest.cgi b/cgi-bin/Bracknell Forest.cgi new file mode 100755 index 0000000..197754a --- /dev/null +++ b/cgi-bin/Bracknell Forest.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Bracknell Forest Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bracknell Forest Borough Council" +authority_short_name = "Bracknell Forest" +base_url = "https://my.bracknell-forest.gov.uk/publicaccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Broads Authority.cgi b/cgi-bin/Broads Authority.cgi new file mode 100755 index 0000000..efc4aa2 --- /dev/null +++ b/cgi-bin/Broads Authority.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Broads Authority. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Broads Authority" +authority_short_name = "Broads Authority" +base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Broads.cgi b/cgi-bin/Broads.cgi new file mode 100755 index 0000000..fabe99d --- /dev/null +++ b/cgi-bin/Broads.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Broads Authority. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Broads Authority" +authority_short_name = "Broads" +base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Chiltern.cgi b/cgi-bin/Chiltern.cgi new file mode 100755 index 0000000..44b1ce9 --- /dev/null +++ b/cgi-bin/Chiltern.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Chiltern District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Chiltern District Council" +authority_short_name = "Chiltern" +base_url = "https://isa.chiltern.gov.uk/publicaccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Hinkley and Bosworth.cgi b/cgi-bin/Hinkley and Bosworth.cgi new file mode 100755 index 0000000..b9be750 --- /dev/null +++ b/cgi-bin/Hinkley and Bosworth.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Hinkley and Bosworth Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Hinkley and Bosworth Borough Council" +authority_short_name = "Hinkley and Bosworth" +base_url = "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Perthshire.cgi~ b/cgi-bin/Perthshire.cgi~ new file mode 100755 index 0000000..79907b8 --- /dev/null +++ b/cgi-bin/Perthshire.cgi~ @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Perth and Kinross Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Perth and Kinross Council" +authority_short_name = "Perthshire" +base_url = "http://193.63.61.22/publicaccess/tdc/" + +import PublicAccess + +parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/PublicAccess.py b/cgi-bin/PublicAccess.py index 5ff7baf..22621e8 100644 --- a/cgi-bin/PublicAccess.py +++ b/cgi-bin/PublicAccess.py @@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser): # Join this query string to the comments URL, and store this as # the comments URL of the current planning application comments_url = urlparse.urljoin(self.base_url, comments_url_end) - self._current_application.comment_url = urlparse.urljoin(comments_url, query_string) + self._current_application.comment_url = "?".join([comments_url, query_string]) # while we're here, let's follow some links to find the postcode... # the postcode is in an input tag in the property page. This page @@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser): Once we have got the URL, there is no need for us to look at any more tags. """ if tag == "a" and self.property_page_url is None: + + #print attrs if attrs.count(("id","A_btnPropertyDetails")) > 0: for attr,value in attrs: if attr == "href": the_link = value - # this has some garbage on either side of it... + # this may have some garbage on either side of it... # let's strip that off + # If the stripping fails, take the whole link + # the garbage on the left is separated by whitespace. # the garbage on the right is separated by a "'". - - self.property_page_url = the_link.split()[1].split("'")[0] - - + try: + self.property_page_url = the_link.split()[1].split("'")[0] + except IndexError: + self.property_page_url = the_link class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): diff --git a/python_scrapers/PublicAccess.py b/python_scrapers/PublicAccess.py index 5ff7baf..22621e8 100644 --- a/python_scrapers/PublicAccess.py +++ b/python_scrapers/PublicAccess.py @@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser): # Join this query string to the comments URL, and store this as # the comments URL of the current planning application comments_url = urlparse.urljoin(self.base_url, comments_url_end) - self._current_application.comment_url = urlparse.urljoin(comments_url, query_string) + self._current_application.comment_url = "?".join([comments_url, query_string]) # while we're here, let's follow some links to find the postcode... # the postcode is in an input tag in the property page. This page @@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser): Once we have got the URL, there is no need for us to look at any more tags. """ if tag == "a" and self.property_page_url is None: + + #print attrs if attrs.count(("id","A_btnPropertyDetails")) > 0: for attr,value in attrs: if attr == "href": the_link = value - # this has some garbage on either side of it... + # this may have some garbage on either side of it... # let's strip that off + # If the stripping fails, take the whole link + # the garbage on the left is separated by whitespace. # the garbage on the right is separated by a "'". - - self.property_page_url = the_link.split()[1].split("'")[0] - - + try: + self.property_page_url = the_link.split()[1].split("'")[0] + except IndexError: + self.property_page_url = the_link class PublicAccessPropertyPageParser(HTMLParser.HTMLParser): diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index 141dbc3..426eefb 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -126,4 +126,8 @@ "Caradon District Council", "Caradon", "http://publicaccess.caradon.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" -"Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" \ No newline at end of file +"Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Broads Authority", "Broads", "https://planning.broads-authority.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Bracknell Forest Borough Council", "Bracknell Forest", "https://my.bracknell-forest.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Hinkley and Bosworth Borough Council", "Hinkley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" \ No newline at end of file