Fix some more scrapers, and an embarrassing bug that broke all PublicAccess

comment URLs.
19 년 전 · 183354af1e
--- a/cgi-bin/Bracknell
+++ b/cgi-bin/Bracknell
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Bracknell Forest Borough Council.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Bracknell Forest Borough Council"
 authority_short_name = "Bracknell Forest"
 base_url = "https://my.bracknell-forest.gov.uk/publicaccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/Broads
+++ b/cgi-bin/Broads
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Broads Authority.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Broads Authority"
 authority_short_name = "Broads Authority"
 base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/Broads.cgi
+++ b/cgi-bin/Broads.cgi
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Broads Authority.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Broads Authority"
 authority_short_name = "Broads"
 base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/Chiltern.cgi
+++ b/cgi-bin/Chiltern.cgi
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Chiltern District Council.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Chiltern District Council"
 authority_short_name = "Chiltern"
 base_url = "https://isa.chiltern.gov.uk/publicaccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/Hinkley
+++ b/cgi-bin/Hinkley
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Hinkley and Bosworth Borough Council.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Hinkley and Bosworth Borough Council"
 authority_short_name = "Hinkley and Bosworth"
 base_url = "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/Perthshire.cgi~
+++ b/cgi-bin/Perthshire.cgi~
@@ -0,0 +1,29 @@
 #!/usr/local/bin/python

 # This is the parser for Perth and Kinross Council.
 # it is generated from the file CGITemplate

 import cgi
 import cgitb
 #cgitb.enable(display=0, logdir="/tmp")


 form = cgi.FieldStorage()
 day = form.getfirst('day')
 month = form.getfirst('month')
 year = form.getfirst('year')


 authority_name = "Perth and Kinross Council"
 authority_short_name = "Perthshire"
 base_url = "http://193.63.61.22/publicaccess/tdc/"

 import PublicAccess

 parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

 xml = parser.getResults(day, month, year)

 print "Content-Type: text/xml"     # XML is following
 print
 print xml                          # print the xml
--- a/cgi-bin/PublicAccess.py
+++ b/cgi-bin/PublicAccess.py
@@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser):
                # Join this query string to the comments URL, and store this as
                # the comments URL of the current planning application
                comments_url = urlparse.urljoin(self.base_url, comments_url_end)
 		self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
                self._current_application.comment_url = "?".join([comments_url, query_string])

 		# while we're here, let's follow some links to find the postcode...
                # the postcode is in an input tag in the property page. This page
@@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
        Once we have got the URL, there is no need for us to look at any more <a> tags.
        """
 	if tag == "a" and self.property_page_url is None:
            
            #print attrs
 	    if attrs.count(("id","A_btnPropertyDetails")) > 0:
 		for attr,value in attrs:
 		    if attr == "href":
 			the_link = value

 			# this has some garbage on either side of it...
 			# this may have some garbage on either side of it...
 			# let's strip that off

                        # If the stripping fails, take the whole link

                        # the garbage on the left is separated by whitespace.
                        # the garbage on the right is separated by a "'".

 			self.property_page_url = the_link.split()[1].split("'")[0]


                        try:
                            self.property_page_url = the_link.split()[1].split("'")[0]
                        except IndexError:
                            self.property_page_url = the_link


 class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
--- a/python_scrapers/PublicAccess.py
+++ b/python_scrapers/PublicAccess.py
@@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser):
                # Join this query string to the comments URL, and store this as
                # the comments URL of the current planning application
                comments_url = urlparse.urljoin(self.base_url, comments_url_end)
 		self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
                self._current_application.comment_url = "?".join([comments_url, query_string])

 		# while we're here, let's follow some links to find the postcode...
                # the postcode is in an input tag in the property page. This page
@@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
        Once we have got the URL, there is no need for us to look at any more <a> tags.
        """
 	if tag == "a" and self.property_page_url is None:
            
            #print attrs
 	    if attrs.count(("id","A_btnPropertyDetails")) > 0:
 		for attr,value in attrs:
 		    if attr == "href":
 			the_link = value

 			# this has some garbage on either side of it...
 			# this may have some garbage on either side of it...
 			# let's strip that off

                        # If the stripping fails, take the whole link

                        # the garbage on the left is separated by whitespace.
                        # the garbage on the right is separated by a "'".

 			self.property_page_url = the_link.split()[1].split("'")[0]


                        try:
                            self.property_page_url = the_link.split()[1].split("'")[0]
                        except IndexError:
                            self.property_page_url = the_link


 class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -126,4 +126,8 @@
 "Caradon District Council", "Caradon", "http://publicaccess.caradon.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Broads Authority", "Broads", "https://planning.broads-authority.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Bracknell Forest Borough Council", "Bracknell Forest", "https://my.bracknell-forest.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
 "Hinkley and Bosworth Borough Council", "Hinkley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"