Browse Source

add some Acolnet sites:

Babergh
Basingstoke
Bassetlaw
Bury
Derby
master
duncan.parkes 17 years ago
parent
commit
43dceae0c6
100 changed files with 1006 additions and 514 deletions
  1. +364
    -0
      cgi-bin/AcolnetParser.py
  2. +0
    -0
      cgi-bin/Allerdale.cgi
  3. +0
    -0
      cgi-bin/Alnwick.cgi
  4. +0
    -0
      cgi-bin/Angus.cgi
  5. +0
    -0
      cgi-bin/Aylesbury Vale.cgi
  6. +29
    -0
      cgi-bin/Babergh.cgi
  7. +0
    -0
      cgi-bin/Barrow.cgi
  8. +0
    -0
      cgi-bin/Basildon.cgi
  9. +29
    -0
      cgi-bin/Basingstoke and Deane.cgi
  10. +29
    -0
      cgi-bin/Bassetlaw.cgi
  11. +0
    -0
      cgi-bin/Bath.cgi
  12. +0
    -0
      cgi-bin/Bexley.cgi
  13. +0
    -0
      cgi-bin/Blaby.cgi
  14. +0
    -0
      cgi-bin/Bolsover.cgi
  15. +0
    -0
      cgi-bin/Bristol.cgi
  16. +0
    -0
      cgi-bin/Buckinghamshire.cgi
  17. +29
    -0
      cgi-bin/Bury.cgi
  18. +0
    -0
      cgi-bin/Chelmsford.cgi
  19. +0
    -0
      cgi-bin/Cherwell.cgi
  20. +0
    -0
      cgi-bin/Chorley.cgi
  21. +0
    -0
      cgi-bin/City of London.cgi
  22. +0
    -0
      cgi-bin/Cornwall.cgi
  23. +0
    -0
      cgi-bin/Coventry.cgi
  24. +0
    -108
      cgi-bin/Dacorum.cgi
  25. +0
    -0
      cgi-bin/Denbighshire.cgi
  26. +29
    -0
      cgi-bin/Derby.cgi
  27. +0
    -0
      cgi-bin/Doncaster.cgi
  28. +0
    -0
      cgi-bin/Dundee.cgi
  29. +0
    -0
      cgi-bin/Durham.cgi
  30. +0
    -0
      cgi-bin/Ealing.cgi
  31. +0
    -0
      cgi-bin/Easington.cgi
  32. +0
    -0
      cgi-bin/East Devon.cgi
  33. +0
    -0
      cgi-bin/East Dorset.cgi
  34. +0
    -122
      cgi-bin/EastHerts.cgi
  35. +0
    -0
      cgi-bin/Edinburgh.cgi
  36. +0
    -121
      cgi-bin/Enfield.cgi
  37. +0
    -0
      cgi-bin/Epsom and Ewell.cgi
  38. +0
    -0
      cgi-bin/Fenland.cgi
  39. +0
    -0
      cgi-bin/Gateshead.cgi
  40. +0
    -0
      cgi-bin/Gedling.cgi
  41. +0
    -0
      cgi-bin/Gloucestershire.cgi
  42. +0
    -0
      cgi-bin/Gravesham.cgi
  43. +0
    -0
      cgi-bin/Hammersmith and Fulham.cgi
  44. +0
    -0
      cgi-bin/Haringey.cgi
  45. +0
    -0
      cgi-bin/Harrogate.cgi
  46. +0
    -0
      cgi-bin/Hart.cgi
  47. +0
    -0
      cgi-bin/Hartlepool.cgi
  48. +0
    -0
      cgi-bin/High Peak.cgi
  49. +0
    -0
      cgi-bin/Huntingdonshire.cgi
  50. +0
    -0
      cgi-bin/Kerrier.cgi
  51. +0
    -0
      cgi-bin/Knowsley.cgi
  52. +0
    -0
      cgi-bin/Lancaster.cgi
  53. +0
    -0
      cgi-bin/Luton.cgi
  54. +0
    -0
      cgi-bin/Malvern Hills.cgi
  55. +0
    -0
      cgi-bin/Mid Devon.cgi
  56. +0
    -0
      cgi-bin/Milton Keynes.cgi
  57. +133
    -0
      cgi-bin/MultipartPostHandler.py
  58. +0
    -0
      cgi-bin/NW Leicestershire.cgi
  59. +0
    -0
      cgi-bin/Newcastle-under-Lyme.cgi
  60. +0
    -0
      cgi-bin/Newham.cgi
  61. +0
    -0
      cgi-bin/North Tyneside.cgi
  62. +0
    -0
      cgi-bin/North Warwickshire.cgi
  63. +0
    -0
      cgi-bin/Northumberland.cgi
  64. +0
    -0
      cgi-bin/Oadby and Wigston.cgi
  65. +0
    -0
      cgi-bin/Oswestry.cgi
  66. +0
    -0
      cgi-bin/Peterborough.cgi
  67. +0
    -0
      cgi-bin/Portsmouth.cgi
  68. +0
    -0
      cgi-bin/Redditch.cgi
  69. +0
    -0
      cgi-bin/Rushmoor.cgi
  70. +0
    -0
      cgi-bin/Scarborough.cgi
  71. +0
    -0
      cgi-bin/Sevenoaks.cgi
  72. +0
    -0
      cgi-bin/South Bucks.cgi
  73. +0
    -0
      cgi-bin/South Ribble.cgi
  74. +0
    -0
      cgi-bin/South Staffordshire.cgi
  75. +0
    -0
      cgi-bin/SouthOxfordshire.cgi
  76. +0
    -0
      cgi-bin/Southampton.cgi
  77. +0
    -0
      cgi-bin/Spelthorne.cgi
  78. +0
    -0
      cgi-bin/St Helens.cgi
  79. +0
    -0
      cgi-bin/Stevenage.cgi
  80. +0
    -0
      cgi-bin/Stirling.cgi
  81. +0
    -0
      cgi-bin/Stockton-On-Tees.cgi
  82. +0
    -0
      cgi-bin/Stratford.cgi
  83. +0
    -0
      cgi-bin/Sunderland.cgi
  84. +0
    -0
      cgi-bin/Teignbridge.cgi
  85. +0
    -0
      cgi-bin/Test Valley.cgi
  86. +0
    -0
      cgi-bin/Tonbridge.cgi
  87. +0
    -0
      cgi-bin/Torbay.cgi
  88. +0
    -0
      cgi-bin/Vale Royal.cgi
  89. +0
    -0
      cgi-bin/Waveney.cgi
  90. +0
    -0
      cgi-bin/Wear Valley.cgi
  91. +0
    -0
      cgi-bin/Wellingborough.cgi
  92. +0
    -0
      cgi-bin/West Berkshire.cgi
  93. +0
    -0
      cgi-bin/West Lancashire.cgi
  94. +0
    -0
      cgi-bin/West Norfolk.cgi
  95. +0
    -0
      cgi-bin/Winchester.cgi
  96. +0
    -0
      cgi-bin/Woking.cgi
  97. +0
    -0
      cgi-bin/Wolverhampton.cgi
  98. +0
    -0
      cgi-bin/York.cgi
  99. +0
    -163
      cgi-bin/broxbourne.cgi
  100. +364
    -0
      python_scrapers/AcolnetParser.py

+ 364
- 0
cgi-bin/AcolnetParser.py View File

@@ -0,0 +1,364 @@
#!/usr/local/bin/python

import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup

import urlparse

import re

end_head_regex = re.compile("</head", re.IGNORECASE)

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

from datetime import date
from time import strptime


# Acolnet sites write registration dates as e.g. 25/04/2007.
date_format = "%d/%m/%Y"
# NOTE(review): looks like a leftover test value — nothing visible reads it
# (getResultsByDayMonthYear builds its own local our_date); confirm before removing.
our_date = date(2007,4,25)


class AcolnetParser(HTMLParser.HTMLParser):
    """Scraper for planning applications on Acolnet-based council sites.

    Each application on a results page sits in its own <table> with
    class "results-table".  Subclasses locate the data by setting the
    *_tr class attributes to the 1-based <tr> index (within that table)
    holding each datum, and supply an action_regex that extracts the
    search form's action URL from the search page HTML.
    """

    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # 1-based count of <tr>s seen so far in the current results-table.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        self._in_td = False

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1
        elif self._current_application is not None:
            # Only count rows of the outer results-table, not of subtables.
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value

    def handle_data(self, data):
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])

                #print received_date

                self._current_application.date_received = received_date

            elif self._tr_number == self.location_tr:
                location = data.strip()

                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()

    def handle_endtag(self, tag):
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch and parse all applications registered on the given date.

        Returns the PlanningAuthorityResults collector (self._results).
        """
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()

        action = groups[0]
        #print action

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        # Search a one-day window: both bounds set to the requested date.
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }

        # The search form is posted as multipart/form-data, hence the handler.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: scrape the given date and return the XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class BaberghParser(AcolnetParser):
    """Site-specific settings for Babergh District Council."""
    #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Babergh District Council"
    #authority_short_name = "Babergh"

    # It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

class BasingstokeParser(AcolnetParser):
    """Site-specific settings for Basingstoke and Deane Borough Council."""
    #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    #authority_name = "Basingstoke and Deane Borough Council"
    #authority_short_name = "Basingstoke and Deane"

    # It would be nice to scrape this...
    comments_email_address = "development.control@basingstoke.gov.uk"

    # NB: this site emits doubled quotes around the onSubmit attribute,
    # so the pattern deliberately matches "" ... "".
    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class BassetlawParser(AcolnetParser):
    """Site-specific settings for Bassetlaw District Council."""
    #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Bassetlaw District Council"
    #authority_short_name = "Bassetlaw"

    comments_email_address = "planning@bassetlaw.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
        return div_regex.sub('', html)


class BridgenorthParser(AcolnetParser):
    """Site-specific settings for Bridgnorth District Council.

    NOTE(review): the class name spells it "Bridgenorth" but the council
    (and its URLs) spell it "Bridgnorth" — kept for compatibility.
    """
    #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bridgenorth District Council"
    #authority_short_name = "Bridgenorth"

    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

class BuryParser(AcolnetParser):
    """Site-specific settings for Bury Metropolitan Borough Council."""
    #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Bury Metropolitan Borough Council"
    #authority_short_name = "Bury"

    comments_email_address = "development.control@bury.gov.uk"
    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")

## class CanterburyParser(AcolnetParser):
## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

## case_number_tr = 1 # this one can be got by the td class attribute
## reg_date_tr = 2
## location_tr = 4
## proposal_tr = 5

## authority_name = "Canterbury City Council"
## authority_short_name = "Canterbury"

## comments_email_address = ""
## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class CarlisleParser(AcolnetParser):
    """Site-specific settings for Carlisle City Council."""
    #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    #authority_name = "Carlisle City Council"
    #authority_short_name = "Carlisle"

    comments_email_address = "dc@carlisle.gov.uk"
    # Same doubled-quote onSubmit quirk as Basingstoke.
    action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")


class DerbyParser(AcolnetParser):
    """Site-specific settings for Derby City Council."""
    #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    #authority_name = "Derby City Council"
    #authority_short_name = "Derby"

    comments_email_address = "developmentcontrol@derby.gov.uk"
    action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")


if __name__ == '__main__':
day = 15
month = 3
year = 2007

# working
# parser = BasingstokeParser()
parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

# works with the divs stripped out
#parser = BassetlawParser()

# returns error 400 - bad request
#parser = BridgenorthParser()

# working
#parser = BuryParser()

# cambridgeshire is a bit different...
# no advanced search page

# canterbury
# results as columns of one table

# returns error 400 - bad request
#parser = CarlisleParser()

# working
#parser = DerbyParser()
print parser.getResults(day, month, year)

+ 0
- 0
cgi-bin/Allerdale.cgi View File


+ 0
- 0
cgi-bin/Alnwick.cgi View File


+ 0
- 0
cgi-bin/Angus.cgi View File


+ 0
- 0
cgi-bin/Aylesbury Vale.cgi View File


+ 29
- 0
cgi-bin/Babergh.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Babergh District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Babergh District Council"
authority_short_name = "Babergh"
base_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BaberghParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Barrow.cgi View File


+ 0
- 0
cgi-bin/Basildon.cgi View File


+ 29
- 0
cgi-bin/Basingstoke and Deane.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Basingstoke and Deane Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Basingstoke and Deane Borough Council"
authority_short_name = "Basingstoke and Deane"
base_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BasingstokeParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 29
- 0
cgi-bin/Bassetlaw.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Bassetlaw District Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Bassetlaw District Council"
authority_short_name = "Bassetlaw"
base_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.BassetlawParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Bath.cgi View File


+ 0
- 0
cgi-bin/Bexley.cgi View File


+ 0
- 0
cgi-bin/Blaby.cgi View File


+ 0
- 0
cgi-bin/Bolsover.cgi View File


+ 0
- 0
cgi-bin/Bristol.cgi View File


+ 0
- 0
cgi-bin/Buckinghamshire.cgi View File


+ 29
- 0
cgi-bin/Bury.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Bury Metropolitan Borough Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Bury Metropolitan Borough Council"
authority_short_name = "Bury"
base_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"

import AcolnetParser

parser = AcolnetParser.BuryParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Chelmsford.cgi View File


+ 0
- 0
cgi-bin/Cherwell.cgi View File


+ 0
- 0
cgi-bin/Chorley.cgi View File


+ 0
- 0
cgi-bin/City of London.cgi View File


+ 0
- 0
cgi-bin/Cornwall.cgi View File


+ 0
- 0
cgi-bin/Coventry.cgi View File


+ 0
- 108
cgi-bin/Dacorum.cgi View File

@@ -1,108 +0,0 @@
#!/usr/bin/perl

# Scraper for Dacorum Borough Council planning applications.
# Fetches the ASP.NET search form, replays its __VIEWSTATE to search a
# single registration date, and re-emits the result table as XML.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the Dacorum planning search
our $SearchURL = "http://www.dacorum.gov.uk/default.aspx?page=1495";
our $InfoURL = "http://www.dacorum.gov.uk/Default.aspx?page=1497&ID=";
our $CommentURL = "http://www.dacorum.gov.uk/Default.aspx?page=2847&ID=";

# We're a CGI script...
my $query = CGI->new();

# Construct an LWP user agent
# (cookie jar and redirectable POSTs are needed for the ASP.NET session)
our $UA = LWP::UserAgent->new(env_proxy => 1,
cookie_jar => {},
requests_redirectable => [ 'GET', 'HEAD', 'POST' ]);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Do the search: From and To both set to the requested day, so the
# result set covers exactly one registration date.
my $page = do_post({"__VIEWSTATE" => $state,
"Template:_ctl10:_ctl0:btnSearch" => "Search",
"Template:_ctl10:_ctl0:tbRegistrationFromDay" => $query->param("day"),
"Template:_ctl10:_ctl0:tbRegistrationFromMon" => $query->param("month"),
"Template:_ctl10:_ctl0:tbRegistrationFromYear" => $query->param("year"),
"Template:_ctl10:_ctl0:tbRegistrationToDay" => $query->param("day"),
"Template:_ctl10:_ctl0:tbRegistrationToMon" => $query->param("month"),
"Template:_ctl10:_ctl0:tbRegistrationToYear" => $query->param("year")});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Dacorum Borough Council");
$Writer->dataElement("authority_short_name", "Dacorum");
$Writer->startTag("applications");

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

# Data rows carry one of these two classes; header rows are skipped.
if ($cells[0]->attr("class") eq "FormGridDataItem" ||
$cells[0]->attr("class") eq "FormGridAlternatingDataItem")
{
my $reference = $cells[0]->as_trimmed_text;
my $address = $cells[1]->as_trimmed_text;
my $description = $cells[2]->as_trimmed_text;
my $date = $cells[3]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address
# ("XX99 9XX" shape) — NOTE(review): confirm against edge cases.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("comment_url", $CommentURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the state from a page so we can repost it.
# Takes a parsed HTML::TreeBuilder tree; returns the value of the hidden
# ASP.NET __VIEWSTATE input. NOTE(review): no check that the input exists —
# a missing field would die on the undefined $viewstate.
sub get_state
{
my $page = shift;
my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");

return $viewstate->attr("value");
}

# Post to the planning search page.
# Extra arguments are passed straight through to LWP's post() (e.g. a
# hashref of form fields). Dies on HTTP failure; otherwise returns the
# response body parsed into an HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post($SearchURL, @_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

+ 0
- 0
cgi-bin/Denbighshire.cgi View File


+ 29
- 0
cgi-bin/Derby.cgi View File

@@ -0,0 +1,29 @@
#!/usr/local/bin/python

# This is the parser for Derby City Council.
# it is generated from the file CGITemplate

import cgi
import cgitb
#cgitb.enable(display=0, logdir="/tmp")


form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')


authority_name = "Derby City Council"
authority_short_name = "Derby"
base_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

import AcolnetParser

parser = AcolnetParser.DerbyParser(authority_name, authority_short_name, base_url)

xml = parser.getResults(day, month, year)

print "Content-Type: text/xml" # XML is following
print
print xml # print the xml

+ 0
- 0
cgi-bin/Doncaster.cgi View File


+ 0
- 0
cgi-bin/Dundee.cgi View File


+ 0
- 0
cgi-bin/Durham.cgi View File


+ 0
- 0
cgi-bin/Ealing.cgi View File


+ 0
- 0
cgi-bin/Easington.cgi View File


+ 0
- 0
cgi-bin/East Devon.cgi View File


+ 0
- 0
cgi-bin/East Dorset.cgi View File


+ 0
- 122
cgi-bin/EastHerts.cgi View File

@@ -1,122 +0,0 @@
#!/usr/bin/perl

# Scraper for East Herts Council planning applications (SwiftLG site).
# Searches one registration date, follows any additional results pages,
# and emits the applications as XML.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URLs for the East Herts planning search
our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";
our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the SwiftLG form expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: From and To both set to the same day.
my $page = do_post($SearchURL,
{"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
"REGTODATE.MAINBODY.WPACIS.1." => $date,
"SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "East Herts Council");
$Writer->dataElement("authority_short_name", "East Herts");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
# Fetch this page...
$page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request.
# Arguments are passed through to LWP's get(). Dies on HTTP failure;
# otherwise returns the body parsed into an HTML::TreeBuilder tree.
sub do_get
{
my $response = $UA->get(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request.
# Arguments are passed through to LWP's post() (URL plus form hashref).
# Dies on HTTP failure; otherwise returns a parsed HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page.
# Relies on the file-scoped $Writer, $date, $InfoURL and $CommentURL.
sub output_applications
{
my $page = shift;

# Find the result table (identified only by its cell spacing/padding —
# fragile if the site's markup changes).
my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

if (@cells >= 3)
{
my $reference = $cells[0]->as_trimmed_text;
my $description = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("comment_url", $CommentURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

+ 0
- 0
cgi-bin/Edinburgh.cgi View File


+ 0
- 121
cgi-bin/Enfield.cgi View File

@@ -1,121 +0,0 @@
#!/usr/bin/perl

# Scraper for Enfield Council planning applications (SwiftLG site).
# Same flow as the East Herts scraper, but with no comment URL.

use strict;
use warnings;

use CGI qw(:cgi);
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;


# The master URLs for the Enfield planning search
our $SearchURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA";
our $InfoURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID=";

# We're a CGI script...
my $query = CGI->new();

# Get the date to fetch (dd/mm/yyyy, as the SwiftLG form expects)
my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year");

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Do the search: From and To both set to the same day.
my $page = do_post($SearchURL,
{"REGFROMDATE.MAINBODY.WPACIS.1." => $date,
"REGTODATE.MAINBODY.WPACIS.1." => $date,
"SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"});

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Enfield Council");
$Writer->dataElement("authority_short_name", "Enfield");
$Writer->startTag("applications");

# Output any applications on the first page
output_applications($page);

# Loop over any additional results pages
foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/))
{
# Fetch this page...
$page = do_get(URI->new_abs($link->attr("href"), $SearchURL));

# ...and output the applications from it
output_applications($page);
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Make a GET request.
# Arguments are passed through to LWP's get(). Dies on HTTP failure;
# otherwise returns the body parsed into an HTML::TreeBuilder tree.
sub do_get
{
my $response = $UA->get(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Make a POST request.
# Arguments are passed through to LWP's post() (URL plus form hashref).
# Dies on HTTP failure; otherwise returns a parsed HTML::TreeBuilder tree.
sub do_post
{
my $response = $UA->post(@_);

die $response->status_line unless $response->is_success;

return HTML::TreeBuilder->new_from_content($response->content);
}

# Output applications from a results page.
# Relies on the file-scoped $Writer, $date and $InfoURL.
sub output_applications
{
my $page = shift;

# Find the result table
my $table = $page->look_down("_tag" => "table", "class" => "apas_tbl");

# Process each row of the results
foreach my $row ($table->look_down("_tag" => "tr"))
{
my @cells = $row->look_down("_tag" => "td");

if (@cells >= 3)
{
my $reference = $cells[0]->as_trimmed_text;
my $description = $cells[1]->as_trimmed_text;
my $address = $cells[2]->as_trimmed_text;
my $postcode;

# Assumes a UK postcode is the trailing token of the address.
if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
{
$postcode = $1;
}

$Writer->startTag("application");
$Writer->dataElement("council_reference", $reference);
$Writer->dataElement("address", $address);
$Writer->dataElement("postcode", $postcode);
$Writer->dataElement("description", $description);
$Writer->dataElement("info_url", $InfoURL . $reference);
$Writer->dataElement("date_received", $date);
$Writer->endTag("application");
}
}

return;
}

+ 0
- 0
cgi-bin/Epsom and Ewell.cgi View File


+ 0
- 0
cgi-bin/Fenland.cgi View File


+ 0
- 0
cgi-bin/Gateshead.cgi View File


+ 0
- 0
cgi-bin/Gedling.cgi View File


+ 0
- 0
cgi-bin/Gloucestershire.cgi View File


+ 0
- 0
cgi-bin/Gravesham.cgi View File


+ 0
- 0
cgi-bin/Hammersmith and Fulham.cgi View File


+ 0
- 0
cgi-bin/Haringey.cgi View File


+ 0
- 0
cgi-bin/Harrogate.cgi View File


+ 0
- 0
cgi-bin/Hart.cgi View File


+ 0
- 0
cgi-bin/Hartlepool.cgi View File


+ 0
- 0
cgi-bin/High Peak.cgi View File


+ 0
- 0
cgi-bin/Huntingdonshire.cgi View File


+ 0
- 0
cgi-bin/Kerrier.cgi View File


+ 0
- 0
cgi-bin/Knowsley.cgi View File


+ 0
- 0
cgi-bin/Lancaster.cgi View File


+ 0
- 0
cgi-bin/Luton.cgi View File


+ 0
- 0
cgi-bin/Malvern Hills.cgi View File


+ 0
- 0
cgi-bin/Mid Devon.cgi View File


+ 0
- 0
cgi-bin/Milton Keynes.cgi View File


+ 133
- 0
cgi-bin/MultipartPostHandler.py View File

@@ -0,0 +1,133 @@
####
# 02/2006 Will Holcomb <wholcomb@gmail.com>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#

# I have edited out a bit in the middle of this which reverts to a normal
# post with "application/x-www-form-urlencoded" content-type when there are
# no files.
# Duncan 5/5/2007

"""
Usage:
Enables the use of multipart/form-data for posting forms

Inspirations:
Upload files in python:
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306
urllib2_file:
Fabien Seisen: <fabien@seisen.org>

Example:
import MultipartPostHandler, urllib2, cookielib

cookies = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies),
MultipartPostHandler.MultipartPostHandler)
params = { "username" : "bob", "password" : "riviera",
"file" : open("filename", "rb") }
opener.open("http://wwww.bobsite.com/upload/", params)

Further Example:
The main function of this file is a sample which downloads a page and
then uploads it to the W3C validator.
"""

import urllib
import urllib2
import mimetools, mimetypes
import os, stat

class Callable:
    # Python 2 helper: wraps a plain function so it can be stored as a
    # class attribute and called without becoming a bound method
    # (a "static method" substitute, used for multipart_encode below).
    # Relies on old-style class __call__ lookup on the instance.
    def __init__(self, anycallable):
        self.__call__ = anycallable

# Controls how sequences are encoded. If true, elements may be given multiple
# values by assigning a sequence.
# NOTE(review): not referenced anywhere in this file - presumably intended
# for urllib.urlencode's doseq argument; confirm before removing.
doseq = 1

class MultipartPostHandler(urllib2.BaseHandler):
handler_order = urllib2.HTTPHandler.handler_order - 10 # needs to run first

def http_request(self, request):
data = request.get_data()
if data is not None and type(data) != str:
v_files = []
v_vars = []
try:
for(key, value) in data.items():
if type(value) == file:
v_files.append((key, value))
else:
v_vars.append((key, value))
except TypeError:
systype, value, traceback = sys.exc_info()
raise TypeError, "not a valid non-string sequence or mapping object", traceback

boundary, data = self.multipart_encode(v_vars, v_files)
contenttype = 'multipart/form-data; boundary=%s' % boundary
if(request.has_header('Content-Type')
and request.get_header('Content-Type').find('multipart/form-data') != 0):
print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data')
request.add_unredirected_header('Content-Type', contenttype)

request.add_data(data)
return request

def multipart_encode(vars, files, boundary = None, buffer = None):
if boundary is None:
boundary = mimetools.choose_boundary()
if buffer is None:
buffer = ''
for(key, value) in vars:
buffer += '--%s\r\n' % boundary
buffer += 'Content-Disposition: form-data; name="%s"' % key
buffer += '\r\n\r\n' + value + '\r\n'
for(key, fd) in files:
file_size = os.fstat(fd.fileno())[stat.ST_SIZE]
filename = fd.name.split('/')[-1]
contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
buffer += '--%s\r\n' % boundary
buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename)
buffer += 'Content-Type: %s\r\n' % contenttype
# buffer += 'Content-Length: %s\r\n' % file_size
fd.seek(0)
buffer += '\r\n' + fd.read() + '\r\n'
buffer += '--%s--\r\n\r\n' % boundary
return boundary, buffer
multipart_encode = Callable(multipart_encode)

https_request = http_request

## def main():
## import tempfile, sys

## validatorURL = "http://validator.w3.org/check"
## opener = urllib2.build_opener(MultipartPostHandler)

## def validateFile(url):
## temp = tempfile.mkstemp(suffix=".html")
## os.write(temp[0], opener.open(url).read())
## params = { "ss" : "0", # show source
## "doctype" : "Inline",
## "uploaded_file" : open(temp[1], "rb") }
## print opener.open(validatorURL, params).read()
## os.remove(temp[1])

## if len(sys.argv[1:]) > 0:
## for arg in sys.argv[1:]:
## validateFile(arg)
## else:
## validateFile("http://www.google.com")

## if __name__=="__main__":
## main()

+ 0
- 0
cgi-bin/NW Leicestershire.cgi View File


+ 0
- 0
cgi-bin/Newcastle-under-Lyme.cgi View File


+ 0
- 0
cgi-bin/Newham.cgi View File


+ 0
- 0
cgi-bin/North Tyneside.cgi View File


+ 0
- 0
cgi-bin/North Warwickshire.cgi View File


+ 0
- 0
cgi-bin/Northumberland.cgi View File


+ 0
- 0
cgi-bin/Oadby and Wigston.cgi View File


+ 0
- 0
cgi-bin/Oswestry.cgi View File


+ 0
- 0
cgi-bin/Peterborough.cgi View File


+ 0
- 0
cgi-bin/Portsmouth.cgi View File


+ 0
- 0
cgi-bin/Redditch.cgi View File


+ 0
- 0
cgi-bin/Rushmoor.cgi View File


+ 0
- 0
cgi-bin/Scarborough.cgi View File


+ 0
- 0
cgi-bin/Sevenoaks.cgi View File


+ 0
- 0
cgi-bin/South Bucks.cgi View File


+ 0
- 0
cgi-bin/South Ribble.cgi View File


+ 0
- 0
cgi-bin/South Staffordshire.cgi View File


+ 0
- 0
cgi-bin/SouthOxfordshire.cgi View File


+ 0
- 0
cgi-bin/Southampton.cgi View File


+ 0
- 0
cgi-bin/Spelthorne.cgi View File


+ 0
- 0
cgi-bin/St Helens.cgi View File


+ 0
- 0
cgi-bin/Stevenage.cgi View File


+ 0
- 0
cgi-bin/Stirling.cgi View File


+ 0
- 0
cgi-bin/Stockton-On-Tees.cgi View File


+ 0
- 0
cgi-bin/Stratford.cgi View File


+ 0
- 0
cgi-bin/Sunderland.cgi View File


+ 0
- 0
cgi-bin/Teignbridge.cgi View File


+ 0
- 0
cgi-bin/Test Valley.cgi View File


+ 0
- 0
cgi-bin/Tonbridge.cgi View File


+ 0
- 0
cgi-bin/Torbay.cgi View File


+ 0
- 0
cgi-bin/Vale Royal.cgi View File


+ 0
- 0
cgi-bin/Waveney.cgi View File


+ 0
- 0
cgi-bin/Wear Valley.cgi View File


+ 0
- 0
cgi-bin/Wellingborough.cgi View File


+ 0
- 0
cgi-bin/West Berkshire.cgi View File


+ 0
- 0
cgi-bin/West Lancashire.cgi View File


+ 0
- 0
cgi-bin/West Norfolk.cgi View File


+ 0
- 0
cgi-bin/Winchester.cgi View File


+ 0
- 0
cgi-bin/Woking.cgi View File


+ 0
- 0
cgi-bin/Wolverhampton.cgi View File


+ 0
- 0
cgi-bin/York.cgi View File


+ 0
- 163
cgi-bin/broxbourne.cgi View File

@@ -1,163 +0,0 @@
#!/usr/bin/perl

# Scraper for the Borough of Broxbourne planning search. Takes
# year/month/day CGI parameters and emits that day's planning
# applications as XML on stdout.

use strict;
use warnings;

use CGI qw(:cgi);
use DateTime;
#use DateTime::Format::DateParse;
use HTML::TreeBuilder;
use LWP::UserAgent;
use XML::Writer;

# The master URL for the Broxbourne planning search
our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx";

# We're a CGI script...
my $query = CGI->new();

# Get the date as an offset from 2000-01-01
# (the site's calendar control identifies a date by its day count from
# that epoch, so that is the value we must post back)
my $epoch = DateTime->new(year => 2000, month => 1, day => 1);
my $querydate = DateTime->new(year => $query->param("year"),
                              month => $query->param("month"),
                              day => $query->param("day"));
$querydate = $querydate->delta_days($epoch)->delta_days;

# Construct an LWP user agent
our $UA = LWP::UserAgent->new(env_proxy => 1);

# Post the URL to get an initial blank form
my $state = get_state(do_post());

# Post each date in turn to build up the state - you can thank
# Microsoft and ASP.NET for the horrible way we have to do this
# by posting each argument in turn to build up the state
$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate));
$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate));

# Output an HTTP response header
print $query->header(-type => "text/xml");

# Create an XML output stream
my $Writer = XML::Writer->new(DATA_MODE => 1);

# Output the XML header data
$Writer->xmlDecl("UTF-8");
$Writer->startTag("planning");
$Writer->dataElement("authority_name", "Borough of Broxbourne");
$Writer->dataElement("authority_short_name", "Broxbourne");
$Writer->startTag("applications");

# Get the arguments for the search...
my $args = {
    "Srch" => "rb1",
    "__VIEWSTATE" => $state,
    "btnSearch" => "Search",
    "tbReference" => "",
    "tbRef2" => ""
};

# ...and then (at last) we can do the search!
my $page = do_post($args);

# Loop processing pages of results
while ($page)
{
    # Results are rendered into an ASP.NET DataGrid table
    my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1");

    # Remember the state
    $state = get_state($page);

    # Clear the page for now - this will be reinitialised if we
    # find another page of results to make us go round the loop
    # all over again
    undef $page;

    # Check that we found a table - searches that find no results
    # produce a page with no table in it
    if ($table)
    {
        # Process each row of the results
        foreach my $row ($table->look_down("_tag" => "tr"))
        {
            my @cells = $row->look_down("_tag" => "td");

            # Data rows carry a checkbox input in their first cell
            if ($cells[0]->look_down("_tag" => "input"))
            {
                my $reference = $cells[1]->as_trimmed_text;
                my $date = $cells[2]->as_trimmed_text;
                my $address = $cells[3]->as_trimmed_text;
                my $description = $cells[4]->as_trimmed_text;
                my $postcode;

                # Pull a trailing UK-style postcode out of the address
                if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/)
                {
                    $postcode = $1;
                }

                $Writer->startTag("application");
                $Writer->dataElement("council_reference", $reference);
                $Writer->dataElement("address", $address);
                $Writer->dataElement("postcode", $postcode);
                $Writer->dataElement("description", $description);
                $Writer->dataElement("date_received", $date);
                $Writer->endTag("application");
            }
            elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5")
            {
                # A full-width row holds the pager links; the ">" link
                # posts back for the next page, which re-enters the loop
                foreach my $link ($cells[0]->look_down("_tag" => "a"))
                {
                    if ($link->as_trimmed_text eq ">" &&
                        $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/)
                    {
                        $page = do_post_back($state, $1, $2);
                    }
                }
            }
        }
    }
}

# Finish off XML output
$Writer->endTag("applications");
$Writer->endTag("planning");
$Writer->end();

exit 0;

# Extract the ASP.NET __VIEWSTATE value from a page so it can be
# reposted with the next request.
sub get_state
{
    my ($page) = @_;

    my $state_input = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE");

    return $state_input->attr("value");
}

# Fake up what the doPostBack javascript function in the page does:
# post the viewstate plus an event target/argument pair.
sub do_post_back
{
    my ($state, $target, $argument) = @_;

    # ASP.NET control ids use '$' in the javascript call but ':' in the
    # posted form field name
    $target =~ s/\$/:/g;

    return do_post({
        "__EVENTTARGET"   => $target,
        "__EVENTARGUMENT" => $argument,
        "__VIEWSTATE"     => $state,
    });
}

# POST to the planning search page and parse the response into an
# HTML::TreeBuilder tree. Dies with the HTTP status line on any error.
sub do_post
{
    my $http_response = $UA->post($SearchURL, @_);

    die $http_response->status_line
        unless $http_response->is_success;

    return HTML::TreeBuilder->new_from_content($http_response->content);
}

+ 364
- 0
python_scrapers/AcolnetParser.py View File

@@ -0,0 +1,364 @@
#!/usr/local/bin/python

import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup

import urlparse

import re

# Matches the end of the <head> element; used to chop off the document head
# (whose javascript upsets HTMLParser) before feeding HTML to the parser.
end_head_regex = re.compile("</head", re.IGNORECASE)

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

from datetime import date
from time import strptime


# Date format the Acolnet search forms expect (and the result pages display).
date_format = "%d/%m/%Y"
# NOTE(review): appears unused at module level - getResultsByDayMonthYear
# builds its own local our_date; presumably left over from testing.
our_date = date(2007,4,25)


class AcolnetParser(HTMLParser.HTMLParser):
    """Scraper base class for Acolnet-based council planning-search sites.

    Subclasses configure, per site, which row of each results table holds
    which field (case_number_tr etc.), a comments email address, and an
    action_regex that extracts the search form's action URL.
    Results accumulate in a PlanningAuthorityResults instance.
    """
    # 1-based row number within each results-table for each field;
    # set by subclasses.
    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Store the authority identity and search URL and reset parser state."""

        HTMLParser.HTMLParser.__init__(self)

        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # Counts <tr> starts within the current results-table.
        self._tr_number = 0

        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None

        # True while inside a <td> of a results-table row.
        self._in_td = False

        # This in where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        """Track entry into results-tables, rows, cells and info links."""
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1

        elif self._current_application is not None:
            # Only count rows belonging directly to the results-table,
            # not rows of any nested subtable.
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
                if self._tr_number == self.case_number_tr:
                    #get the reference and the info link here
                    pass
                elif self._tr_number == self.reg_date_tr:
                    #get the registration date here
                    pass
                elif self._tr_number == self.location_tr:
                    #get the address and postcode here
                    pass
                elif self._tr_number == self.proposal_tr:
                    #get the description here
                    pass
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value
    def handle_data(self, data):
        """Route cell text into the current application's fields by row number."""
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])

                #print received_date

                self._current_application.date_received = received_date

            elif self._tr_number == self.location_tr:
                location = data.strip()

                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()


    def handle_endtag(self, tag):
        """On leaving a results-table, commit the finished application."""
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def getResultsByDayMonthYear(self, day, month, year):
        """Search one registration date and return the populated
        PlanningAuthorityResults."""
        # first we fetch the search page to get ourselves some session info...
        search_form_response = urllib2.urlopen(self.base_url)
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...

        # NOTE(review): action_regex is defined only on subclasses; the
        # base class cannot run a search on its own.
        groups = self.action_regex.search(search_form_contents).groups()

        action = groups[0]
        #print action

        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        # Search a single day by using it as both ends of the date range.
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }
        # The form wants multipart/form-data, hence MultipartPostHandler.
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]

        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)

        self.feed(just_body)
        return self._results



    def getResults(self, day, month, year):
        """Convenience wrapper: search one date and return the XML string."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class BaberghParser(AcolnetParser):
    """Site configuration for Babergh District Council's Acolnet search.

    Search URL (for reference):
    http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    # (It would be nice to scrape this address rather than hard-code it.)
    comments_email_address = "planning.reception@babergh.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

class BasingstokeParser(AcolnetParser):
    """Site configuration for Basingstoke and Deane Borough Council.

    Search URL (for reference):
    http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8

    # No online comment facility; comments go by email.
    # (It would be nice to scrape this address rather than hard-code it.)
    comments_email_address = "development.control@basingstoke.gov.uk"

    # Matches the search <form> tag (the doubled quotes around the
    # onSubmit value are exactly what the site emits) and captures its
    # action URL.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)"'
        r' enctype="multipart/form-data">')

class BassetlawParser(AcolnetParser):
    """Site configuration for Bassetlaw District Council.

    Search URL (for reference):
    http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility; comments go by email.
    comments_email_address = "planning@bassetlaw.gov.uk"

    # Matches the search <FORM> tag, case-insensitively, and captures
    # its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">',
        re.IGNORECASE)

    def _cleanupHTML(self, html):
        """Strip out every div tag: one of them on this site is broken,
        and we don't need any of them anyway."""
        div_tag = re.compile(r'</?div[^>]*>', re.IGNORECASE)
        return div_tag.sub('', html)


class BridgenorthParser(AcolnetParser):
    """Site configuration for Bridgnorth District Council.

    Search URL (for reference):
    http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "contactus@bridgnorth-dc.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

class BuryParser(AcolnetParser):
    """Site configuration for Bury Metropolitan Borough Council.

    Search URL (for reference):
    http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "development.control@bury.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')

## class CanterburyParser(AcolnetParser):
## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

## case_number_tr = 1 # this one can be got by the td class attribute
## reg_date_tr = 2
## location_tr = 4
## proposal_tr = 5

## authority_name = "Canterbury City Council"
## authority_short_name = "Canterbury"

## comments_email_address = ""
## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")

class CarlisleParser(AcolnetParser):
    """Site configuration for Carlisle City Council.

    Search URL (for reference):
    http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6

    # No online comment facility; comments go by email.
    comments_email_address = "dc@carlisle.gov.uk"

    # Matches the search <form> tag (doubled quotes around the onSubmit
    # value are exactly what the site emits) and captures its action URL.
    action_regex = re.compile(
        r'<form id="frmSearch" onSubmit=""return ValidateSearch\(\)""'
        r' name="frmSearch" method="post" action="([^"]*)"'
        r' enctype="multipart/form-data">')


class DerbyParser(AcolnetParser):
    """Site configuration for Derby City Council.

    Search URL (for reference):
    http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch
    """
    # Which row of each results-table holds which field.
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5

    # No online comment facility; comments go by email.
    comments_email_address = "developmentcontrol@derby.gov.uk"

    # Matches the search <FORM> tag and captures its action URL.
    action_regex = re.compile(
        r'<FORM name="frmSearch" method="post" action="([^"]*)"'
        r' onSubmit="return ValidateSearch\(\)" enctype="multipart/form-data">')


if __name__ == '__main__':
    # Ad-hoc test harness: fetch one day's applications from one site
    # and print the resulting XML. Swap the commented lines to try a
    # different council.
    day = 15
    month = 3
    year = 2007

    # working
    # parser = BasingstokeParser()
    parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # works with the divs stripped out
    #parser = BassetlawParser()

    # returns error 400 - bad request
    #parser = BridgenorthParser()

    # working
    #parser = BuryParser()

    # cambridgeshire is a bit different...
    # no advanced search page

    # canterbury
    # results as columns of one table

    # returns error 400 - bad request
    #parser = CarlisleParser()

    # working
    #parser = DerbyParser()
    print parser.getResults(day, month, year)

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save