duncan.parkes 17 роки тому
джерело
коміт
45bcc3539f
5 змінених файлів з 59 додано та 13 видалено
  1. +13
    -5
      cgi-bin/AcolnetParser.py
  2. +29
    -0
      cgi-bin/New Forest DC.cgi
  3. +2
    -2
      cgi-bin/New Forest NP.cgi
  4. +13
    -5
      python_scrapers/AcolnetParser.py
  5. +2
    -1
      python_scrapers/PublicAccessSites.csv

+ 13
- 5
cgi-bin/AcolnetParser.py Переглянути файл

@@ -159,8 +159,8 @@ class AcolnetParser(HTMLParser.HTMLParser):
search_form_response = urllib2.urlopen(self.base_url)
search_form_contents = search_form_response.read()

outfile = open("tmpfile", "w")
outfile.write(search_form_contents)
#outfile = open("tmpfile", "w")
#outfile.write(search_form_contents)

# This sometimes causes a problem in HTMLParser, so let's just get the link
# out with a regex...
@@ -360,7 +360,7 @@ class LewishamParser(AcolnetParser):
## comments_email_address = "planning@lewisham.gov.uk"
## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)

class NewForestParser(AcolnetParser):
class NewForestNPParser(AcolnetParser):
# In this case there is an online comment facility at the
# bottom of each view app page...
case_number_tr = 1 # this one can be got by the td class attribute
@@ -368,6 +368,14 @@ class NewForestParser(AcolnetParser):
location_tr = 4
proposal_tr = 5

class NewForestDCParser(AcolnetParser):
# Per-site settings for the New Forest District Council Acolnet site.
# Only the four *_tr values below are overridden; everything else is
# inherited from AcolnetParser.
# NOTE(review): the *_tr names look like table-row positions of each field
# within a single result entry - confirm against AcolnetParser before
# changing them.
# NOTE(review): the remark below is word-for-word the same as the one on
# NewForestNPParser; confirm it also applies to this site.
# In this case there is an online comment facility at the
# bottom of each view app page...
case_number_tr = 1 # this one can be got by the td class attribute
reg_date_tr = 2
location_tr = 5
proposal_tr = 6

class NorthWiltshireParser(AcolnetParser):
case_number_tr = 1 # this one can be got by the td class attribute
reg_date_tr = 3
@@ -379,8 +387,8 @@ class OldhamParser(AcolnetParser):
reg_date_tr = 3
location_tr = 6
proposal_tr = 7
def _cleanupHTML(self, html):
def _cleanupHTML(self, html):
"""There is a bad table end tag in this one.
Fix it before we start"""


+ 29
- 0
cgi-bin/New Forest DC.cgi Переглянути файл

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for New Forest District Council.
# It is generated from the file CGITemplate.
# NOTE(review): because it is generated, lasting changes presumably belong
# in CGITemplate rather than in this file - confirm with the generator.
#
# Flow: read day/month/year from the CGI request, ask
# AcolnetParser.NewForestDCParser for the results for that date, and print
# them to stdout with a text/xml content type.

import cgi
import cgitb
# Traceback handling is currently switched off. Uncommenting the next line
# would log tracebacks to files under /tmp instead of showing them to the
# client (display=0).
#cgitb.enable(display=0, logdir="/tmp")


# Parse the incoming request. getfirst() returns None when a field is
# missing, so day/month/year may be None; they are passed on unchecked.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


# Site-specific values handed to the parser constructor.
authority_name = "New Forest District Council"
authority_short_name = "New Forest DC"
base_url = "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

# Project-local module that provides the per-authority parser classes.
import AcolnetParser

parser = AcolnetParser.NewForestDCParser(authority_name, authority_short_name, base_url)

# The return value is printed below as the XML response body.
xml = parser.getResults(day, month, year)

# CGI response: header line, then a blank line to end the headers, then the body.
print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

cgi-bin/New Forest.cgi → cgi-bin/New Forest NP.cgi Переглянути файл

@@ -15,12 +15,12 @@ year = form.getfirst('year')


authority_name = "New Forest National Park"
authority_short_name = "New Forest"
authority_short_name = "New Forest NP"
base_url = "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.NewForestParser(authority_name, authority_short_name, base_url)
parser = AcolnetParser.NewForestNPParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)


+ 13
- 5
python_scrapers/AcolnetParser.py Переглянути файл

@@ -159,8 +159,8 @@ class AcolnetParser(HTMLParser.HTMLParser):
search_form_response = urllib2.urlopen(self.base_url)
search_form_contents = search_form_response.read()

outfile = open("tmpfile", "w")
outfile.write(search_form_contents)
#outfile = open("tmpfile", "w")
#outfile.write(search_form_contents)

# This sometimes causes a problem in HTMLParser, so let's just get the link
# out with a regex...
@@ -360,7 +360,7 @@ class LewishamParser(AcolnetParser):
## comments_email_address = "planning@lewisham.gov.uk"
## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)

class NewForestParser(AcolnetParser):
class NewForestNPParser(AcolnetParser):
# In this case there is an online comment facility at the
# bottom of each view app page...
case_number_tr = 1 # this one can be got by the td class attribute
@@ -368,6 +368,14 @@ class NewForestParser(AcolnetParser):
location_tr = 4
proposal_tr = 5

class NewForestDCParser(AcolnetParser):
# Per-site settings for the New Forest District Council Acolnet site.
# Only the four *_tr values below are overridden; everything else is
# inherited from AcolnetParser.
# NOTE(review): the *_tr names look like table-row positions of each field
# within a single result entry - confirm against AcolnetParser before
# changing them.
# NOTE(review): the remark below is word-for-word the same as the one on
# NewForestNPParser; confirm it also applies to this site.
# In this case there is an online comment facility at the
# bottom of each view app page...
case_number_tr = 1 # this one can be got by the td class attribute
reg_date_tr = 2
location_tr = 5
proposal_tr = 6

class NorthWiltshireParser(AcolnetParser):
case_number_tr = 1 # this one can be got by the td class attribute
reg_date_tr = 3
@@ -379,8 +387,8 @@ class OldhamParser(AcolnetParser):
reg_date_tr = 3
location_tr = 6
proposal_tr = 7
def _cleanupHTML(self, html):
def _cleanupHTML(self, html):
"""There is a bad table end tag in this one.
Fix it before we start"""


+ 2
- 1
python_scrapers/PublicAccessSites.csv Переглянути файл

@@ -101,7 +101,7 @@
"Hertsmere Borough Council", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "HertsmereParser"
"London Borough of Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "LewishamParser"
"North Hertfordshire District Council", "North Hertfordshire", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "NorthHertfordshireParser"
"New Forest National Park", "New Forest", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestParser"
"New Forest National Park", "New Forest NP", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestNPParser"
"Bridgnorth District Council", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BridgnorthParser"
"Carlisle City Council", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "CarlisleParser"
"Newcastle City Council", "Newcastle", "http://gispublic.newcastle.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
@@ -112,3 +112,4 @@
"South Bedfordshire District Council", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "SouthBedfordshireParser"
"Suffolk Coastal District Council", "Suffolk Coastal", "https://apps3.suffolkcoastal.gov.uk/planningonline/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SuffolkCoastalParser"
"Surrey Heath Borough Council", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SurreyHeathParser"
"New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestDCParser"

Завантаження…
Відмінити
Зберегти