소스 검색

Fix some more scrapers, and fix an embarrassing bug that broke all

PublicAccess comment URLs.
master
duncan.parkes 17 년 전
부모
커밋
183354af1e
9개의 변경된 파일199개의 추가작업 그리고 13개의 파일을 삭제
  1. +29
    -0
      cgi-bin/Bracknell Forest.cgi
  2. +29
    -0
      cgi-bin/Broads Authority.cgi
  3. +29
    -0
      cgi-bin/Broads.cgi
  4. +29
    -0
      cgi-bin/Chiltern.cgi
  5. +29
    -0
      cgi-bin/Hinkley and Bosworth.cgi
  6. +29
    -0
      cgi-bin/Perthshire.cgi~
  7. +10
    -6
      cgi-bin/PublicAccess.py
  8. +10
    -6
      python_scrapers/PublicAccess.py
  9. +5
    -1
      python_scrapers/SitesToGenerate.csv

+ 29
- 0
cgi-bin/Bracknell Forest.cgi 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Bracknell Forest Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Bracknell Forest Borough Council"
authority_short_name = "Bracknell Forest"
base_url = "https://my.bracknell-forest.gov.uk/publicaccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Broads Authority.cgi 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Broads Authority.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Broads Authority"
authority_short_name = "Broads Authority"
base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Broads.cgi 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Broads Authority.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Broads Authority"
authority_short_name = "Broads"
base_url = "https://planning.broads-authority.gov.uk/PublicAccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Chiltern.cgi 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Chiltern District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Chiltern District Council"
authority_short_name = "Chiltern"
base_url = "https://isa.chiltern.gov.uk/publicaccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Hinkley and Bosworth.cgi 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Hinkley and Bosworth Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Hinkley and Bosworth Borough Council"
authority_short_name = "Hinkley and Bosworth"
base_url = "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Perthshire.cgi~ 파일 보기

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Perth and Kinross Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Perth and Kinross Council"
authority_short_name = "Perthshire"
base_url = "http://193.63.61.22/publicaccess/tdc/"

import PublicAccess

parser = PublicAccess.PublicAccessParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 10
- 6
cgi-bin/PublicAccess.py 파일 보기

@@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser):
# Join this query string to the comments URL, and store this as
# the comments URL of the current planning application
comments_url = urlparse.urljoin(self.base_url, comments_url_end)
self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
self._current_application.comment_url = "?".join([comments_url, query_string])

# while we're here, let's follow some links to find the postcode...
# the postcode is in an input tag in the property page. This page
@@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
Once we have got the URL, there is no need for us to look at any more <a> tags.
"""
if tag == "a" and self.property_page_url is None:
#print attrs
if attrs.count(("id","A_btnPropertyDetails")) > 0:
for attr,value in attrs:
if attr == "href":
the_link = value

# this has some garbage on either side of it...
# this may have some garbage on either side of it...
# let's strip that off

# If the stripping fails, take the whole link

# the garbage on the left is separated by whitespace.
# the garbage on the right is separated by a "'".
self.property_page_url = the_link.split()[1].split("'")[0]
try:
self.property_page_url = the_link.split()[1].split("'")[0]
except IndexError:
self.property_page_url = the_link


class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):


+ 10
- 6
python_scrapers/PublicAccess.py 파일 보기

@@ -144,7 +144,7 @@ class PublicAccessParser(HTMLParser.HTMLParser):
# Join this query string to the comments URL, and store this as
# the comments URL of the current planning application
comments_url = urlparse.urljoin(self.base_url, comments_url_end)
self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
self._current_application.comment_url = "?".join([comments_url, query_string])

# while we're here, let's follow some links to find the postcode...
# the postcode is in an input tag in the property page. This page
@@ -300,20 +300,24 @@ class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
Once we have got the URL, there is no need for us to look at any more <a> tags.
"""
if tag == "a" and self.property_page_url is None:
#print attrs
if attrs.count(("id","A_btnPropertyDetails")) > 0:
for attr,value in attrs:
if attr == "href":
the_link = value

# this has some garbage on either side of it...
# this may have some garbage on either side of it...
# let's strip that off

# If the stripping fails, take the whole link

# the garbage on the left is separated by whitespace.
# the garbage on the right is separated by a "'".
self.property_page_url = the_link.split()[1].split("'")[0]
try:
self.property_page_url = the_link.split()[1].split("'")[0]
except IndexError:
self.property_page_url = the_link


class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):


+ 5
- 1
python_scrapers/SitesToGenerate.csv 파일 보기

@@ -126,4 +126,8 @@
"Caradon District Council", "Caradon", "http://publicaccess.caradon.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Perth and Kinross Council", "Perthshire", "http://193.63.61.22/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Broads Authority", "Broads", "https://planning.broads-authority.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Bracknell Forest Borough Council", "Bracknell Forest", "https://my.bracknell-forest.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Hinkley and Bosworth Borough Council", "Hinkley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"

불러오는 중...
취소
저장