From 43dceae0c60b3215ebf61db94bd89e528a24a136 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Tue, 8 May 2007 09:30:40 +0000 Subject: [PATCH] add some Acolnet sites: Babergh Basingstoke Bassetlaw Bury Derby --- cgi-bin/AcolnetParser.py | 364 ++++++++++++++++++++++++ cgi-bin/Allerdale.cgi | 0 cgi-bin/Alnwick.cgi | 0 cgi-bin/Angus.cgi | 0 cgi-bin/Aylesbury Vale.cgi | 0 cgi-bin/Babergh.cgi | 29 ++ cgi-bin/Barrow.cgi | 0 cgi-bin/Basildon.cgi | 0 cgi-bin/Basingstoke and Deane.cgi | 29 ++ cgi-bin/Bassetlaw.cgi | 29 ++ cgi-bin/Bath.cgi | 0 cgi-bin/Bexley.cgi | 0 cgi-bin/Blaby.cgi | 0 cgi-bin/Bolsover.cgi | 0 cgi-bin/Bristol.cgi | 0 cgi-bin/Buckinghamshire.cgi | 0 cgi-bin/Bury.cgi | 29 ++ cgi-bin/Chelmsford.cgi | 0 cgi-bin/Cherwell.cgi | 0 cgi-bin/Chorley.cgi | 0 cgi-bin/City of London.cgi | 0 cgi-bin/Cornwall.cgi | 0 cgi-bin/Coventry.cgi | 0 cgi-bin/Dacorum.cgi | 108 ------- cgi-bin/Denbighshire.cgi | 0 cgi-bin/Derby.cgi | 29 ++ cgi-bin/Doncaster.cgi | 0 cgi-bin/Dundee.cgi | 0 cgi-bin/Durham.cgi | 0 cgi-bin/Ealing.cgi | 0 cgi-bin/Easington.cgi | 0 cgi-bin/East Devon.cgi | 0 cgi-bin/East Dorset.cgi | 0 cgi-bin/EastHerts.cgi | 122 -------- cgi-bin/Edinburgh.cgi | 0 cgi-bin/Enfield.cgi | 121 -------- cgi-bin/Epsom and Ewell.cgi | 0 cgi-bin/Fenland.cgi | 0 cgi-bin/Gateshead.cgi | 0 cgi-bin/Gedling.cgi | 0 cgi-bin/Gloucestershire.cgi | 0 cgi-bin/Gravesham.cgi | 0 cgi-bin/Hammersmith and Fulham.cgi | 0 cgi-bin/Haringey.cgi | 0 cgi-bin/Harrogate.cgi | 0 cgi-bin/Hart.cgi | 0 cgi-bin/Hartlepool.cgi | 0 cgi-bin/High Peak.cgi | 0 cgi-bin/Huntingdonshire.cgi | 0 cgi-bin/Kerrier.cgi | 0 cgi-bin/Knowsley.cgi | 0 cgi-bin/Lancaster.cgi | 0 cgi-bin/Luton.cgi | 0 cgi-bin/Malvern Hills.cgi | 0 cgi-bin/Mid Devon.cgi | 0 cgi-bin/Milton Keynes.cgi | 0 cgi-bin/MultipartPostHandler.py | 133 +++++++++ cgi-bin/NW Leicestershire.cgi | 0 cgi-bin/Newcastle-under-Lyme.cgi | 0 cgi-bin/Newham.cgi | 0 cgi-bin/North Tyneside.cgi | 0 cgi-bin/North Warwickshire.cgi | 0 cgi-bin/Northumberland.cgi | 0 cgi-bin/Oadby and Wigston.cgi | 0 cgi-bin/Oswestry.cgi | 0 cgi-bin/Peterborough.cgi | 0 cgi-bin/Portsmouth.cgi | 0 cgi-bin/Redditch.cgi | 0 cgi-bin/Rushmoor.cgi | 0 cgi-bin/Scarborough.cgi | 0 cgi-bin/Sevenoaks.cgi | 0 cgi-bin/South Bucks.cgi | 0 cgi-bin/South Ribble.cgi | 0 cgi-bin/South Staffordshire.cgi | 0 cgi-bin/SouthOxfordshire.cgi | 0 cgi-bin/Southampton.cgi | 0 cgi-bin/Spelthorne.cgi | 0 cgi-bin/St Helens.cgi | 0 cgi-bin/Stevenage.cgi | 0 cgi-bin/Stirling.cgi | 0 cgi-bin/Stockton-On-Tees.cgi | 0 cgi-bin/Stratford.cgi | 0 cgi-bin/Sunderland.cgi | 0 cgi-bin/Teignbridge.cgi | 0 cgi-bin/Test Valley.cgi | 0 cgi-bin/Tonbridge.cgi | 0 cgi-bin/Torbay.cgi | 0 cgi-bin/Vale Royal.cgi | 0 cgi-bin/Waveney.cgi | 0 cgi-bin/Wear Valley.cgi | 0 cgi-bin/Wellingborough.cgi | 0 cgi-bin/West Berkshire.cgi | 0 cgi-bin/West Lancashire.cgi | 0 cgi-bin/West Norfolk.cgi | 0 cgi-bin/Winchester.cgi | 0 cgi-bin/Woking.cgi | 0 cgi-bin/Wolverhampton.cgi | 0 cgi-bin/York.cgi | 0 cgi-bin/broxbourne.cgi | 163 ----------- python_scrapers/AcolnetParser.py | 364 ++++++++++++++++++++++++ python_scrapers/MultipartPostHandler.py | 133 +++++++++ python_scrapers/OtherFilesToCopy.csv | 3 +- python_scrapers/PublicAccessSites.csv | 7 +- 103 files changed, 1146 insertions(+), 517 deletions(-) create mode 100644 cgi-bin/AcolnetParser.py mode change 100644 => 100755 cgi-bin/Allerdale.cgi mode change 100644 => 100755 cgi-bin/Alnwick.cgi mode change 100644 => 100755 cgi-bin/Angus.cgi mode change 100644 => 100755 cgi-bin/Aylesbury Vale.cgi create mode 100755 
cgi-bin/Babergh.cgi mode change 100644 => 100755 cgi-bin/Barrow.cgi mode change 100644 => 100755 cgi-bin/Basildon.cgi create mode 100755 cgi-bin/Basingstoke and Deane.cgi create mode 100755 cgi-bin/Bassetlaw.cgi mode change 100644 => 100755 cgi-bin/Bath.cgi mode change 100644 => 100755 cgi-bin/Bexley.cgi mode change 100644 => 100755 cgi-bin/Blaby.cgi mode change 100644 => 100755 cgi-bin/Bolsover.cgi mode change 100644 => 100755 cgi-bin/Bristol.cgi mode change 100644 => 100755 cgi-bin/Buckinghamshire.cgi create mode 100755 cgi-bin/Bury.cgi mode change 100644 => 100755 cgi-bin/Chelmsford.cgi mode change 100644 => 100755 cgi-bin/Cherwell.cgi mode change 100644 => 100755 cgi-bin/Chorley.cgi mode change 100644 => 100755 cgi-bin/City of London.cgi mode change 100644 => 100755 cgi-bin/Cornwall.cgi mode change 100644 => 100755 cgi-bin/Coventry.cgi delete mode 100644 cgi-bin/Dacorum.cgi mode change 100644 => 100755 cgi-bin/Denbighshire.cgi create mode 100755 cgi-bin/Derby.cgi mode change 100644 => 100755 cgi-bin/Doncaster.cgi mode change 100644 => 100755 cgi-bin/Dundee.cgi mode change 100644 => 100755 cgi-bin/Durham.cgi mode change 100644 => 100755 cgi-bin/Ealing.cgi mode change 100644 => 100755 cgi-bin/Easington.cgi mode change 100644 => 100755 cgi-bin/East Devon.cgi mode change 100755 => 100644 cgi-bin/East Dorset.cgi delete mode 100644 cgi-bin/EastHerts.cgi mode change 100644 => 100755 cgi-bin/Edinburgh.cgi delete mode 100644 cgi-bin/Enfield.cgi mode change 100644 => 100755 cgi-bin/Epsom and Ewell.cgi mode change 100644 => 100755 cgi-bin/Fenland.cgi mode change 100755 => 100644 cgi-bin/Gateshead.cgi mode change 100644 => 100755 cgi-bin/Gedling.cgi mode change 100755 => 100644 cgi-bin/Gloucestershire.cgi mode change 100644 => 100755 cgi-bin/Gravesham.cgi mode change 100644 => 100755 cgi-bin/Hammersmith and Fulham.cgi mode change 100644 => 100755 cgi-bin/Haringey.cgi mode change 100644 => 100755 cgi-bin/Harrogate.cgi mode change 100644 => 100755 cgi-bin/Hart.cgi mode change 100644 => 100755 cgi-bin/Hartlepool.cgi mode change 100644 => 100755 cgi-bin/High Peak.cgi mode change 100644 => 100755 cgi-bin/Huntingdonshire.cgi mode change 100644 => 100755 cgi-bin/Kerrier.cgi mode change 100644 => 100755 cgi-bin/Knowsley.cgi mode change 100644 => 100755 cgi-bin/Lancaster.cgi mode change 100644 => 100755 cgi-bin/Luton.cgi mode change 100644 => 100755 cgi-bin/Malvern Hills.cgi mode change 100644 => 100755 cgi-bin/Mid Devon.cgi mode change 100644 => 100755 cgi-bin/Milton Keynes.cgi create mode 100644 cgi-bin/MultipartPostHandler.py mode change 100644 => 100755 cgi-bin/NW Leicestershire.cgi mode change 100755 => 100644 cgi-bin/Newcastle-under-Lyme.cgi mode change 100644 => 100755 cgi-bin/Newham.cgi mode change 100644 => 100755 cgi-bin/North Tyneside.cgi mode change 100644 => 100755 cgi-bin/North Warwickshire.cgi mode change 100644 => 100755 cgi-bin/Northumberland.cgi mode change 100644 => 100755 cgi-bin/Oadby and Wigston.cgi mode change 100644 => 100755 cgi-bin/Oswestry.cgi mode change 100644 => 100755 cgi-bin/Peterborough.cgi mode change 100644 => 100755 cgi-bin/Portsmouth.cgi mode change 100644 => 100755 cgi-bin/Redditch.cgi mode change 100644 => 100755 cgi-bin/Rushmoor.cgi mode change 100644 => 100755 cgi-bin/Scarborough.cgi mode change 100644 => 100755 cgi-bin/Sevenoaks.cgi mode change 100644 => 100755 cgi-bin/South Bucks.cgi mode change 100644 => 100755 cgi-bin/South Ribble.cgi mode change 100644 => 100755 cgi-bin/South Staffordshire.cgi mode change 100644 => 100755 cgi-bin/SouthOxfordshire.cgi mode 
change 100644 => 100755 cgi-bin/Southampton.cgi mode change 100644 => 100755 cgi-bin/Spelthorne.cgi mode change 100644 => 100755 cgi-bin/St Helens.cgi mode change 100644 => 100755 cgi-bin/Stevenage.cgi mode change 100644 => 100755 cgi-bin/Stirling.cgi mode change 100644 => 100755 cgi-bin/Stockton-On-Tees.cgi mode change 100644 => 100755 cgi-bin/Stratford.cgi mode change 100644 => 100755 cgi-bin/Sunderland.cgi mode change 100644 => 100755 cgi-bin/Teignbridge.cgi mode change 100644 => 100755 cgi-bin/Test Valley.cgi mode change 100644 => 100755 cgi-bin/Tonbridge.cgi mode change 100644 => 100755 cgi-bin/Torbay.cgi mode change 100755 => 100644 cgi-bin/Vale Royal.cgi mode change 100644 => 100755 cgi-bin/Waveney.cgi mode change 100644 => 100755 cgi-bin/Wear Valley.cgi mode change 100644 => 100755 cgi-bin/Wellingborough.cgi mode change 100644 => 100755 cgi-bin/West Berkshire.cgi mode change 100644 => 100755 cgi-bin/West Lancashire.cgi mode change 100644 => 100755 cgi-bin/West Norfolk.cgi mode change 100755 => 100644 cgi-bin/Winchester.cgi mode change 100644 => 100755 cgi-bin/Woking.cgi mode change 100755 => 100644 cgi-bin/Wolverhampton.cgi mode change 100644 => 100755 cgi-bin/York.cgi delete mode 100755 cgi-bin/broxbourne.cgi create mode 100644 python_scrapers/AcolnetParser.py create mode 100644 python_scrapers/MultipartPostHandler.py diff --git a/cgi-bin/AcolnetParser.py b/cgi-bin/AcolnetParser.py new file mode 100644 index 0000000..55e2796 --- /dev/null +++ b/cgi-bin/AcolnetParser.py @@ -0,0 +1,364 @@ +#!/usr/local/bin/python + +import urllib, urllib2 +import HTMLParser +#from BeautifulSoup import BeautifulSoup + +import urlparse + +import re + +end_head_regex = re.compile(" 0: + self._subtable_depth -= 1 + else: + # We need to add the last application in the table + if self._current_application is not None: + #print "adding application" + self._results.addApplication(self._current_application) + #print self._current_application + self._current_application = None + self._tr_number = None + self._subtable_depth = None + elif tag == "td": + self._in_td = False + + def getResultsByDayMonthYear(self, day, month, year): + # first we fetch the search page to get ourselves some session info... + search_form_response = urllib2.urlopen(self.base_url) + search_form_contents = search_form_response.read() + + # This sometimes causes a problem in HTMLParser, so let's just get the link + # out with a regex... 
+ + groups = self.action_regex.search(search_form_contents).groups() + + action = groups[0] + #print action + + action_url = urlparse.urljoin(self.base_url, action) + #print action_url + + our_date = date(year, month, day) + + search_data = {"regdate1": our_date.strftime(date_format), + "regdate2": our_date.strftime(date_format), + } + + opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler) + response = opener.open(action_url, search_data) + results_html = response.read() + + # This is for doing site specific html cleanup + results_html = self._cleanupHTML(results_html) + + #some javascript garbage in the header upsets HTMLParser, + #so we'll just have the body + just_body = "" + end_head_regex.split(results_html)[-1] + + #outfile = open(self.authority_short_name + ".debug", "w") + #outfile.write(just_body) + + self.feed(just_body) + + return self._results + + + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + + +class BaberghParser(AcolnetParser): + #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Babergh District Council" + #authority_short_name = "Babergh" + + # It would be nice to scrape this... + comments_email_address = "planning.reception@babergh.gov.uk" + + action_regex = re.compile("
") + +class BasingstokeParser(AcolnetParser): + #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 3 + location_tr = 6 + proposal_tr = 8 + + #authority_name = "Basingstoke and Deane Borough Council" + #authority_short_name = "Basingstoke and Deane" + + # It would be nice to scrape this... + comments_email_address = "development.control@basingstoke.gov.uk" + + action_regex = re.compile("") + +class BassetlawParser(AcolnetParser): + #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 5 + proposal_tr = 6 + + #authority_name = "Bassetlaw District Council" + #authority_short_name = "Bassetlaw" + + comments_email_address = "planning@bassetlaw.gov.uk" + + action_regex = re.compile("", re.IGNORECASE) + + def _cleanupHTML(self, html): + """There is a broken div in this page. We don't need any divs, so + let's get rid of them all.""" + + div_regex = re.compile("]*>", re.IGNORECASE) + return div_regex.sub('', html) + + +class BridgenorthParser(AcolnetParser): + #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Bridgenorth District Council" + #authority_short_name = "Bridgenorth" + + comments_email_address = "contactus@bridgnorth-dc.gov.uk" + + action_regex = re.compile("") + +class BuryParser(AcolnetParser): + #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Bury Metropolitan Borough Council" + #authority_short_name = "Bury" + + comments_email_address = "development.control@bury.gov.uk" + action_regex = re.compile("") + +## class CanterburyParser(AcolnetParser): +## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +## case_number_tr = 1 # this one can be got by the td class attribute +## reg_date_tr = 2 +## location_tr = 4 +## proposal_tr = 5 + +## authority_name = "Canterbury City Council" +## authority_short_name = "Canterbury" + +## comments_email_address = "" +## action_regex = re.compile("") + +class CarlisleParser(AcolnetParser): + #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 5 + proposal_tr = 6 + + #authority_name = "Carlisle City Council" + #authority_short_name = "Carlisle" + + comments_email_address = "dc@carlisle.gov.uk" + action_regex = re.compile("") + + +class DerbyParser(AcolnetParser): + #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 3 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Derby City Council" + #authority_short_name = "Derby" + + comments_email_address = "developmentcontrol@derby.gov.uk" + action_regex = re.compile("") + + +if __name__ == 
'__main__': + day = 15 + month = 3 + year = 2007 + + # working + # parser = BasingstokeParser() + parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + + # works with the divs stripped out + #parser = BassetlawParser() + + # returns error 400 - bad request + #parser = BridgenorthParser() + + # working + #parser = BuryParser() + + # cambridgeshire is a bit different... + # no advanced search page + + # canterbury + # results as columns of one table + + # returns error 400 - bad request + #parser = CarlisleParser() + + # working + #parser = DerbyParser() + + print parser.getResults(day, month, year) + diff --git a/cgi-bin/Allerdale.cgi b/cgi-bin/Allerdale.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Alnwick.cgi b/cgi-bin/Alnwick.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Angus.cgi b/cgi-bin/Angus.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Aylesbury Vale.cgi b/cgi-bin/Aylesbury Vale.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Babergh.cgi b/cgi-bin/Babergh.cgi new file mode 100755 index 0000000..28ecb9e --- /dev/null +++ b/cgi-bin/Babergh.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Babergh District Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Babergh District Council" +authority_short_name = "Babergh" +base_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +import AcolnetParser + +parser = AcolnetParser.BaberghParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Barrow.cgi b/cgi-bin/Barrow.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Basildon.cgi b/cgi-bin/Basildon.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Basingstoke and Deane.cgi b/cgi-bin/Basingstoke and Deane.cgi new file mode 100755 index 0000000..d4be3d1 --- /dev/null +++ b/cgi-bin/Basingstoke and Deane.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Basingstoke and Deane Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Basingstoke and Deane Borough Council" +authority_short_name = "Basingstoke and Deane" +base_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +import AcolnetParser + +parser = AcolnetParser.BasingstokeParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Bassetlaw.cgi b/cgi-bin/Bassetlaw.cgi new file mode 100755 index 0000000..409f7a8 --- /dev/null +++ b/cgi-bin/Bassetlaw.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Bassetlaw District Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bassetlaw District Council" +authority_short_name = "Bassetlaw" +base_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +import AcolnetParser + +parser = AcolnetParser.BassetlawParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Bath.cgi b/cgi-bin/Bath.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Bexley.cgi b/cgi-bin/Bexley.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Blaby.cgi b/cgi-bin/Blaby.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Bolsover.cgi b/cgi-bin/Bolsover.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Bristol.cgi b/cgi-bin/Bristol.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Buckinghamshire.cgi b/cgi-bin/Buckinghamshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Bury.cgi b/cgi-bin/Bury.cgi new file mode 100755 index 0000000..f6454f3 --- /dev/null +++ b/cgi-bin/Bury.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Bury Metropolitan Borough Council. +# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Bury Metropolitan Borough Council" +authority_short_name = "Bury" +base_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + +import AcolnetParser + +parser = AcolnetParser.BuryParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Chelmsford.cgi b/cgi-bin/Chelmsford.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Cherwell.cgi b/cgi-bin/Cherwell.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Chorley.cgi b/cgi-bin/Chorley.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/City of London.cgi b/cgi-bin/City of London.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Cornwall.cgi b/cgi-bin/Cornwall.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Coventry.cgi b/cgi-bin/Coventry.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Dacorum.cgi b/cgi-bin/Dacorum.cgi deleted file mode 100644 index 1a27715..0000000 --- a/cgi-bin/Dacorum.cgi +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URLs for the Dacorum planning search -our $SearchURL = "http://www.dacorum.gov.uk/default.aspx?page=1495"; -our $InfoURL = "http://www.dacorum.gov.uk/Default.aspx?page=1497&ID="; -our $CommentURL = "http://www.dacorum.gov.uk/Default.aspx?page=2847&ID="; - -# We're a CGI script... 
-my $query = CGI->new(); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1, - cookie_jar => {}, - requests_redirectable => [ 'GET', 'HEAD', 'POST' ]); - -# Post the URL to get an initial blank form -my $state = get_state(do_post()); - -# Do the search -my $page = do_post({"__VIEWSTATE" => $state, - "Template:_ctl10:_ctl0:btnSearch" => "Search", - "Template:_ctl10:_ctl0:tbRegistrationFromDay" => $query->param("day"), - "Template:_ctl10:_ctl0:tbRegistrationFromMon" => $query->param("month"), - "Template:_ctl10:_ctl0:tbRegistrationFromYear" => $query->param("year"), - "Template:_ctl10:_ctl0:tbRegistrationToDay" => $query->param("day"), - "Template:_ctl10:_ctl0:tbRegistrationToMon" => $query->param("month"), - "Template:_ctl10:_ctl0:tbRegistrationToYear" => $query->param("year")}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "Dacorum Borough Council"); -$Writer->dataElement("authority_short_name", "Dacorum"); -$Writer->startTag("applications"); - -# Find the result table -my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid"); - -# Process each row of the results -foreach my $row ($table->look_down("_tag" => "tr")) -{ - my @cells = $row->look_down("_tag" => "td"); - - if ($cells[0]->attr("class") eq "FormGridDataItem" || - $cells[0]->attr("class") eq "FormGridAlternatingDataItem") - { - my $reference = $cells[0]->as_trimmed_text; - my $address = $cells[1]->as_trimmed_text; - my $description = $cells[2]->as_trimmed_text; - my $date = $cells[3]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $InfoURL . $reference); - $Writer->dataElement("comment_url", $CommentURL . $reference); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Extract the state from a page so we can repost it -sub get_state -{ - my $page = shift; - my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE"); - - return $viewstate->attr("value"); -} - -# Post to the planning search page -sub do_post -{ - my $response = $UA->post($SearchURL, @_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} diff --git a/cgi-bin/Denbighshire.cgi b/cgi-bin/Denbighshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Derby.cgi b/cgi-bin/Derby.cgi new file mode 100755 index 0000000..1ae552e --- /dev/null +++ b/cgi-bin/Derby.cgi @@ -0,0 +1,29 @@ +#!/usr/local/bin/python + +# This is the parser for Derby City Council. 
+# it is generated from the file CGITemplate + +import cgi +import cgitb +#cgitb.enable(display=0, logdir="/tmp") + + +form = cgi.FieldStorage() +day = form.getfirst('day') +month = form.getfirst('month') +year = form.getfirst('year') + + +authority_name = "Derby City Council" +authority_short_name = "Derby" +base_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +import AcolnetParser + +parser = AcolnetParser.DerbyParser(authority_name, authority_short_name, base_url) + +xml = parser.getResults(day, month, year) + +print "Content-Type: text/xml" # XML is following +print +print xml # print the xml diff --git a/cgi-bin/Doncaster.cgi b/cgi-bin/Doncaster.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Dundee.cgi b/cgi-bin/Dundee.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Durham.cgi b/cgi-bin/Durham.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Ealing.cgi b/cgi-bin/Ealing.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Easington.cgi b/cgi-bin/Easington.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/East Devon.cgi b/cgi-bin/East Devon.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/East Dorset.cgi b/cgi-bin/East Dorset.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/EastHerts.cgi b/cgi-bin/EastHerts.cgi deleted file mode 100644 index 34dec27..0000000 --- a/cgi-bin/EastHerts.cgi +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URLs for the East Herts planning search -our $SearchURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA"; -our $InfoURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID="; -our $CommentURL = "http://e-services.eastherts.gov.uk/swiftlg/apas/run/wphmakerep.displayURL?ApnID="; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date to fetch -my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year"); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Do the search -my $page = do_post($SearchURL, - {"REGFROMDATE.MAINBODY.WPACIS.1." => $date, - "REGTODATE.MAINBODY.WPACIS.1." => $date, - "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "East Herts Council"); -$Writer->dataElement("authority_short_name", "East Herts"); -$Writer->startTag("applications"); - -# Output any applications on the first page -output_applications($page); - -# Loop over any additional results pages -foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/)) -{ - # Fetch this page... 
- $page = do_get(URI->new_abs($link->attr("href"), $SearchURL)); - - # ...and output the applications from it - output_applications($page); -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Make a GET request -sub do_get -{ - my $response = $UA->get(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Make a POST request -sub do_post -{ - my $response = $UA->post(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Output applications from a results page -sub output_applications -{ - my $page = shift; - - # Find the result table - my $table = $page->look_down("_tag" => "table", "cellspacing" => "2", "cellpadding" => "2"); - - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my @cells = $row->look_down("_tag" => "td"); - - if (@cells >= 3) - { - my $reference = $cells[0]->as_trimmed_text; - my $description = $cells[1]->as_trimmed_text; - my $address = $cells[2]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $InfoURL . $reference); - $Writer->dataElement("comment_url", $CommentURL . $reference); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - } - - return; -} diff --git a/cgi-bin/Edinburgh.cgi b/cgi-bin/Edinburgh.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Enfield.cgi b/cgi-bin/Enfield.cgi deleted file mode 100644 index 7462ebd..0000000 --- a/cgi-bin/Enfield.cgi +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - - -# The master URLs for the Enfield planning search -our $SearchURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPCRITERIA"; -our $InfoURL = "http://forms.enfield.gov.uk/swiftlg/apas/run/WPHAPPDETAIL.DisplayUrl?theApnID="; - -# We're a CGI script... -my $query = CGI->new(); - -# Get the date to fetch -my $date = $query->param("day") . "/" . $query->param("month") . "/" . $query->param("year"); - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Do the search -my $page = do_post($SearchURL, - {"REGFROMDATE.MAINBODY.WPACIS.1." => $date, - "REGTODATE.MAINBODY.WPACIS.1." => $date, - "SEARCHBUTTON.MAINBODY.WPACIS.1." => "Search"}); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "Enfield Council"); -$Writer->dataElement("authority_short_name", "Enfield"); -$Writer->startTag("applications"); - -# Output any applications on the first page -output_applications($page); - -# Loop over any additional results pages -foreach my $link ($page->look_down("_tag" => "a", "href" => qr/^WPHAPPSEARCHRES\.displayResultsURL/)) -{ - # Fetch this page... 
- $page = do_get(URI->new_abs($link->attr("href"), $SearchURL)); - - # ...and output the applications from it - output_applications($page); -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Make a GET request -sub do_get -{ - my $response = $UA->get(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Make a POST request -sub do_post -{ - my $response = $UA->post(@_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} - -# Output applications from a results page -sub output_applications -{ - my $page = shift; - - # Find the result table - my $table = $page->look_down("_tag" => "table", "class" => "apas_tbl"); - - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my @cells = $row->look_down("_tag" => "td"); - - if (@cells >= 3) - { - my $reference = $cells[0]->as_trimmed_text; - my $description = $cells[1]->as_trimmed_text; - my $address = $cells[2]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("info_url", $InfoURL . $reference); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - } - - return; -} diff --git a/cgi-bin/Epsom and Ewell.cgi b/cgi-bin/Epsom and Ewell.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Fenland.cgi b/cgi-bin/Fenland.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Gateshead.cgi b/cgi-bin/Gateshead.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/Gedling.cgi b/cgi-bin/Gedling.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Gloucestershire.cgi b/cgi-bin/Gloucestershire.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/Gravesham.cgi b/cgi-bin/Gravesham.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Hammersmith and Fulham.cgi b/cgi-bin/Hammersmith and Fulham.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Haringey.cgi b/cgi-bin/Haringey.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Harrogate.cgi b/cgi-bin/Harrogate.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Hart.cgi b/cgi-bin/Hart.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Hartlepool.cgi b/cgi-bin/Hartlepool.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/High Peak.cgi b/cgi-bin/High Peak.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Huntingdonshire.cgi b/cgi-bin/Huntingdonshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Kerrier.cgi b/cgi-bin/Kerrier.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Knowsley.cgi b/cgi-bin/Knowsley.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Lancaster.cgi b/cgi-bin/Lancaster.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Luton.cgi b/cgi-bin/Luton.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Malvern Hills.cgi b/cgi-bin/Malvern Hills.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Mid Devon.cgi b/cgi-bin/Mid Devon.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Milton Keynes.cgi b/cgi-bin/Milton Keynes.cgi old mode 100644 new mode 100755 diff --git 
a/cgi-bin/MultipartPostHandler.py b/cgi-bin/MultipartPostHandler.py
new file mode 100644
index 0000000..c427613
--- /dev/null
+++ b/cgi-bin/MultipartPostHandler.py
@@ -0,0 +1,134 @@
+####
+# 02/2006 Will Holcomb
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+
+# I have edited out a bit in the middle of this which reverts to a normal
+# post with "application/x-www-form-urlencoded" content-type when there are
+# no files.
+# Duncan 5/5/2007
+
+"""
+Usage:
+  Enables the use of multipart/form-data for posting forms
+
+Inspirations:
+  Upload files in python:
+    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306
+  urllib2_file:
+    Fabien Seisen:
+
+Example:
+  import MultipartPostHandler, urllib2, cookielib
+
+  cookies = cookielib.CookieJar()
+  opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies),
+                                MultipartPostHandler.MultipartPostHandler)
+  params = { "username" : "bob", "password" : "riviera",
+             "file" : open("filename", "rb") }
+  opener.open("http://www.bobsite.com/upload/", params)
+
+Further Example:
+  The main function of this file is a sample which downloads a page and
+  then uploads it to the W3C validator.
+"""
+
+import urllib
+import urllib2
+import sys    # used by the sys.exc_info() call in MultipartPostHandler.http_request
+import mimetools, mimetypes
+import os, stat
+
+class Callable:
+    def __init__(self, anycallable):
+        self.__call__ = anycallable
+
+# Controls how sequences are encoded. If true, elements may be given multiple values by
+# assigning a sequence.
+doseq = 1 + +class MultipartPostHandler(urllib2.BaseHandler): + handler_order = urllib2.HTTPHandler.handler_order - 10 # needs to run first + + def http_request(self, request): + data = request.get_data() + if data is not None and type(data) != str: + v_files = [] + v_vars = [] + try: + for(key, value) in data.items(): + if type(value) == file: + v_files.append((key, value)) + else: + v_vars.append((key, value)) + except TypeError: + systype, value, traceback = sys.exc_info() + raise TypeError, "not a valid non-string sequence or mapping object", traceback + + boundary, data = self.multipart_encode(v_vars, v_files) + contenttype = 'multipart/form-data; boundary=%s' % boundary + if(request.has_header('Content-Type') + and request.get_header('Content-Type').find('multipart/form-data') != 0): + print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data') + request.add_unredirected_header('Content-Type', contenttype) + + request.add_data(data) + return request + + def multipart_encode(vars, files, boundary = None, buffer = None): + if boundary is None: + boundary = mimetools.choose_boundary() + if buffer is None: + buffer = '' + for(key, value) in vars: + buffer += '--%s\r\n' % boundary + buffer += 'Content-Disposition: form-data; name="%s"' % key + buffer += '\r\n\r\n' + value + '\r\n' + for(key, fd) in files: + file_size = os.fstat(fd.fileno())[stat.ST_SIZE] + filename = fd.name.split('/')[-1] + contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' + buffer += '--%s\r\n' % boundary + buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename) + buffer += 'Content-Type: %s\r\n' % contenttype + # buffer += 'Content-Length: %s\r\n' % file_size + fd.seek(0) + buffer += '\r\n' + fd.read() + '\r\n' + buffer += '--%s--\r\n\r\n' % boundary + return boundary, buffer + multipart_encode = Callable(multipart_encode) + + https_request = http_request + +## def main(): +## import tempfile, sys + +## validatorURL = "http://validator.w3.org/check" +## opener = urllib2.build_opener(MultipartPostHandler) + +## def validateFile(url): +## temp = tempfile.mkstemp(suffix=".html") +## os.write(temp[0], opener.open(url).read()) +## params = { "ss" : "0", # show source +## "doctype" : "Inline", +## "uploaded_file" : open(temp[1], "rb") } +## print opener.open(validatorURL, params).read() +## os.remove(temp[1]) + +## if len(sys.argv[1:]) > 0: +## for arg in sys.argv[1:]: +## validateFile(arg) +## else: +## validateFile("http://www.google.com") + +## if __name__=="__main__": +## main() diff --git a/cgi-bin/NW Leicestershire.cgi b/cgi-bin/NW Leicestershire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Newcastle-under-Lyme.cgi b/cgi-bin/Newcastle-under-Lyme.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/Newham.cgi b/cgi-bin/Newham.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/North Tyneside.cgi b/cgi-bin/North Tyneside.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/North Warwickshire.cgi b/cgi-bin/North Warwickshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Northumberland.cgi b/cgi-bin/Northumberland.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Oadby and Wigston.cgi b/cgi-bin/Oadby and Wigston.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Oswestry.cgi b/cgi-bin/Oswestry.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Peterborough.cgi b/cgi-bin/Peterborough.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Portsmouth.cgi 
b/cgi-bin/Portsmouth.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Redditch.cgi b/cgi-bin/Redditch.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Rushmoor.cgi b/cgi-bin/Rushmoor.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Scarborough.cgi b/cgi-bin/Scarborough.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Sevenoaks.cgi b/cgi-bin/Sevenoaks.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/South Bucks.cgi b/cgi-bin/South Bucks.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/South Ribble.cgi b/cgi-bin/South Ribble.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/South Staffordshire.cgi b/cgi-bin/South Staffordshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/SouthOxfordshire.cgi b/cgi-bin/SouthOxfordshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Southampton.cgi b/cgi-bin/Southampton.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Spelthorne.cgi b/cgi-bin/Spelthorne.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/St Helens.cgi b/cgi-bin/St Helens.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Stevenage.cgi b/cgi-bin/Stevenage.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Stirling.cgi b/cgi-bin/Stirling.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Stockton-On-Tees.cgi b/cgi-bin/Stockton-On-Tees.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Stratford.cgi b/cgi-bin/Stratford.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Sunderland.cgi b/cgi-bin/Sunderland.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Teignbridge.cgi b/cgi-bin/Teignbridge.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Test Valley.cgi b/cgi-bin/Test Valley.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Tonbridge.cgi b/cgi-bin/Tonbridge.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Torbay.cgi b/cgi-bin/Torbay.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Vale Royal.cgi b/cgi-bin/Vale Royal.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/Waveney.cgi b/cgi-bin/Waveney.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Wear Valley.cgi b/cgi-bin/Wear Valley.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Wellingborough.cgi b/cgi-bin/Wellingborough.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/West Berkshire.cgi b/cgi-bin/West Berkshire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/West Lancashire.cgi b/cgi-bin/West Lancashire.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/West Norfolk.cgi b/cgi-bin/West Norfolk.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Winchester.cgi b/cgi-bin/Winchester.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/Woking.cgi b/cgi-bin/Woking.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/Wolverhampton.cgi b/cgi-bin/Wolverhampton.cgi old mode 100755 new mode 100644 diff --git a/cgi-bin/York.cgi b/cgi-bin/York.cgi old mode 100644 new mode 100755 diff --git a/cgi-bin/broxbourne.cgi b/cgi-bin/broxbourne.cgi deleted file mode 100755 index ffe3063..0000000 --- a/cgi-bin/broxbourne.cgi +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use CGI qw(:cgi); -use DateTime; -#use DateTime::Format::DateParse; -use HTML::TreeBuilder; -use LWP::UserAgent; -use XML::Writer; - -# The master URL for the Broxbourne planning search -our $SearchURL = "http://www2.broxbourne.gov.uk/planningsearch/webform1.aspx"; - -# We're a CGI script... 
-my $query = CGI->new(); - -# Get the date as an offset from 2000-01-01 -my $epoch = DateTime->new(year => 2000, month => 1, day => 1); -my $querydate = DateTime->new(year => $query->param("year"), - month => $query->param("month"), - day => $query->param("day")); -$querydate = $querydate->delta_days($epoch)->delta_days; - -# Construct an LWP user agent -our $UA = LWP::UserAgent->new(env_proxy => 1); - -# Post the URL to get an initial blank form -my $state = get_state(do_post()); - -# Post each date in turn to build up the state - you can thank -# Microsoft and ASP.NET for the horrible way we have to do this -# by posting each argument in turn to build up the state -$state = get_state(do_post_back($state, 'DateSelector1$Calendar1', $querydate)); -$state = get_state(do_post_back($state, 'DateSelector2$Calendar1', $querydate)); - -# Output an HTTP response header -print $query->header(-type => "text/xml"); - -# Create an XML output stream -my $Writer = XML::Writer->new(DATA_MODE => 1); - -# Output the XML header data -$Writer->xmlDecl("UTF-8"); -$Writer->startTag("planning"); -$Writer->dataElement("authority_name", "Borough of Broxbourne"); -$Writer->dataElement("authority_short_name", "Broxbourne"); -$Writer->startTag("applications"); - -# Get the arguments for the search... -my $args = { - "Srch" => "rb1", - "__VIEWSTATE" => $state, - "btnSearch" => "Search", - "tbReference" => "", - "tbRef2" => "" -}; - -# ...and then (at last) we can do the search! -my $page = do_post($args); - -# Loop processing pages of results -while ($page) -{ - my $table = $page->look_down("_tag" => "table", "id" => "DataGrid1"); - - # Remember the state - $state = get_state($page); - - # Clear the page for now - this will be reinitialised if we - # find another page of results to make us go round the loop - # all over again - undef $page; - - # Check that we found a table - searches that find no results - # produce a page with no table in it - if ($table) - { - # Process each row of the results - foreach my $row ($table->look_down("_tag" => "tr")) - { - my @cells = $row->look_down("_tag" => "td"); - - if ($cells[0]->look_down("_tag" => "input")) - { - my $reference = $cells[1]->as_trimmed_text; - my $date = $cells[2]->as_trimmed_text; - my $address = $cells[3]->as_trimmed_text; - my $description = $cells[4]->as_trimmed_text; - my $postcode; - - if ($address =~ /\s+([A-Z]+\d+\s+\d+[A-Z]+)$/) - { - $postcode = $1; - } - - $Writer->startTag("application"); - $Writer->dataElement("council_reference", $reference); - $Writer->dataElement("address", $address); - $Writer->dataElement("postcode", $postcode); - $Writer->dataElement("description", $description); - $Writer->dataElement("date_received", $date); - $Writer->endTag("application"); - } - elsif ($cells[0]->attr("colspan") && $cells[0]->attr("colspan") eq "5") - { - foreach my $link ($cells[0]->look_down("_tag" => "a")) - { - if ($link->as_trimmed_text eq ">" && - $link->attr("href") =~ /^javascript:__doPostBack\('([^\']*)','([^\']*)'\)$/) - { - $page = do_post_back($state, $1, $2); - } - } - } - } - } -} - -# Finish off XML output -$Writer->endTag("applications"); -$Writer->endTag("planning"); -$Writer->end(); - -exit 0; - -# Extract the state from a page so we can repost it -sub get_state -{ - my $page = shift; - my $viewstate = $page->look_down("_tag" => "input", "name" => "__VIEWSTATE"); - - return $viewstate->attr("value"); -} - -# Fake up what the doPostBack javascript function in the page does... 
-sub do_post_back -{ - my $state = shift; - my $target = shift; - my $argument = shift; - - $target =~ s/\$/:/g; - - my $args = { - "__EVENTTARGET" => $target, - "__EVENTARGUMENT" => $argument, - "__VIEWSTATE" => $state - }; - - return do_post($args); -} - -# Post to the planning search page -sub do_post -{ - my $response = $UA->post($SearchURL, @_); - - die $response->status_line unless $response->is_success; - - return HTML::TreeBuilder->new_from_content($response->content); -} diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py new file mode 100644 index 0000000..55e2796 --- /dev/null +++ b/python_scrapers/AcolnetParser.py @@ -0,0 +1,364 @@ +#!/usr/local/bin/python + +import urllib, urllib2 +import HTMLParser +#from BeautifulSoup import BeautifulSoup + +import urlparse + +import re + +end_head_regex = re.compile(" 0: + self._subtable_depth -= 1 + else: + # We need to add the last application in the table + if self._current_application is not None: + #print "adding application" + self._results.addApplication(self._current_application) + #print self._current_application + self._current_application = None + self._tr_number = None + self._subtable_depth = None + elif tag == "td": + self._in_td = False + + def getResultsByDayMonthYear(self, day, month, year): + # first we fetch the search page to get ourselves some session info... + search_form_response = urllib2.urlopen(self.base_url) + search_form_contents = search_form_response.read() + + # This sometimes causes a problem in HTMLParser, so let's just get the link + # out with a regex... + + groups = self.action_regex.search(search_form_contents).groups() + + action = groups[0] + #print action + + action_url = urlparse.urljoin(self.base_url, action) + #print action_url + + our_date = date(year, month, day) + + search_data = {"regdate1": our_date.strftime(date_format), + "regdate2": our_date.strftime(date_format), + } + + opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler) + response = opener.open(action_url, search_data) + results_html = response.read() + + # This is for doing site specific html cleanup + results_html = self._cleanupHTML(results_html) + + #some javascript garbage in the header upsets HTMLParser, + #so we'll just have the body + just_body = "" + end_head_regex.split(results_html)[-1] + + #outfile = open(self.authority_short_name + ".debug", "w") + #outfile.write(just_body) + + self.feed(just_body) + + return self._results + + + + def getResults(self, day, month, year): + return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() + + +class BaberghParser(AcolnetParser): + #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Babergh District Council" + #authority_short_name = "Babergh" + + # It would be nice to scrape this... 
+ comments_email_address = "planning.reception@babergh.gov.uk" + + action_regex = re.compile("") + +class BasingstokeParser(AcolnetParser): + #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 3 + location_tr = 6 + proposal_tr = 8 + + #authority_name = "Basingstoke and Deane Borough Council" + #authority_short_name = "Basingstoke and Deane" + + # It would be nice to scrape this... + comments_email_address = "development.control@basingstoke.gov.uk" + + action_regex = re.compile("") + +class BassetlawParser(AcolnetParser): + #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 5 + proposal_tr = 6 + + #authority_name = "Bassetlaw District Council" + #authority_short_name = "Bassetlaw" + + comments_email_address = "planning@bassetlaw.gov.uk" + + action_regex = re.compile("", re.IGNORECASE) + + def _cleanupHTML(self, html): + """There is a broken div in this page. We don't need any divs, so + let's get rid of them all.""" + + div_regex = re.compile("]*>", re.IGNORECASE) + return div_regex.sub('', html) + + +class BridgenorthParser(AcolnetParser): + #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Bridgenorth District Council" + #authority_short_name = "Bridgenorth" + + comments_email_address = "contactus@bridgnorth-dc.gov.uk" + + action_regex = re.compile("") + +class BuryParser(AcolnetParser): + #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Bury Metropolitan Borough Council" + #authority_short_name = "Bury" + + comments_email_address = "development.control@bury.gov.uk" + action_regex = re.compile("") + +## class CanterburyParser(AcolnetParser): +## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + +## case_number_tr = 1 # this one can be got by the td class attribute +## reg_date_tr = 2 +## location_tr = 4 +## proposal_tr = 5 + +## authority_name = "Canterbury City Council" +## authority_short_name = "Canterbury" + +## comments_email_address = "" +## action_regex = re.compile("") + +class CarlisleParser(AcolnetParser): + #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 2 + location_tr = 5 + proposal_tr = 6 + + #authority_name = "Carlisle City Council" + #authority_short_name = "Carlisle" + + comments_email_address = "dc@carlisle.gov.uk" + action_regex = re.compile("") + + +class DerbyParser(AcolnetParser): + #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch" + + case_number_tr = 1 # this one can be got by the td class attribute + reg_date_tr = 3 + location_tr = 4 + proposal_tr = 5 + + #authority_name = "Derby City Council" + #authority_short_name = "Derby" + + comments_email_address = 
"developmentcontrol@derby.gov.uk" + action_regex = re.compile("") + + +if __name__ == '__main__': + day = 15 + month = 3 + year = 2007 + + # working + # parser = BasingstokeParser() + parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + + # works with the divs stripped out + #parser = BassetlawParser() + + # returns error 400 - bad request + #parser = BridgenorthParser() + + # working + #parser = BuryParser() + + # cambridgeshire is a bit different... + # no advanced search page + + # canterbury + # results as columns of one table + + # returns error 400 - bad request + #parser = CarlisleParser() + + # working + #parser = DerbyParser() + + print parser.getResults(day, month, year) + diff --git a/python_scrapers/MultipartPostHandler.py b/python_scrapers/MultipartPostHandler.py new file mode 100644 index 0000000..c427613 --- /dev/null +++ b/python_scrapers/MultipartPostHandler.py @@ -0,0 +1,133 @@ +#### +# 02/2006 Will Holcomb +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# + +# I have edited out a bit in the middle of this which reverts to a normal +# post with "application/x-www-form-urlencoded" content-type when there are +# no files. +# Duncan 5/5/2007 + +""" +Usage: + Enables the use of multipart/form-data for posting forms + +Inspirations: + Upload files in python: + http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/146306 + urllib2_file: + Fabien Seisen: + +Example: + import MultipartPostHandler, urllib2, cookielib + + cookies = cookielib.CookieJar() + opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies), + MultipartPostHandler.MultipartPostHandler) + params = { "username" : "bob", "password" : "riviera", + "file" : open("filename", "rb") } + opener.open("http://wwww.bobsite.com/upload/", params) + +Further Example: + The main function of this file is a sample which downloads a page and + then uploads it to the W3C validator. +""" + +import urllib +import urllib2 +import mimetools, mimetypes +import os, stat + +class Callable: + def __init__(self, anycallable): + self.__call__ = anycallable + +# Controls how sequences are uncoded. If true, elements may be given multiple values by +# assigning a sequence. 
+doseq = 1 + +class MultipartPostHandler(urllib2.BaseHandler): + handler_order = urllib2.HTTPHandler.handler_order - 10 # needs to run first + + def http_request(self, request): + data = request.get_data() + if data is not None and type(data) != str: + v_files = [] + v_vars = [] + try: + for(key, value) in data.items(): + if type(value) == file: + v_files.append((key, value)) + else: + v_vars.append((key, value)) + except TypeError: + systype, value, traceback = sys.exc_info() + raise TypeError, "not a valid non-string sequence or mapping object", traceback + + boundary, data = self.multipart_encode(v_vars, v_files) + contenttype = 'multipart/form-data; boundary=%s' % boundary + if(request.has_header('Content-Type') + and request.get_header('Content-Type').find('multipart/form-data') != 0): + print "Replacing %s with %s" % (request.get_header('content-type'), 'multipart/form-data') + request.add_unredirected_header('Content-Type', contenttype) + + request.add_data(data) + return request + + def multipart_encode(vars, files, boundary = None, buffer = None): + if boundary is None: + boundary = mimetools.choose_boundary() + if buffer is None: + buffer = '' + for(key, value) in vars: + buffer += '--%s\r\n' % boundary + buffer += 'Content-Disposition: form-data; name="%s"' % key + buffer += '\r\n\r\n' + value + '\r\n' + for(key, fd) in files: + file_size = os.fstat(fd.fileno())[stat.ST_SIZE] + filename = fd.name.split('/')[-1] + contenttype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' + buffer += '--%s\r\n' % boundary + buffer += 'Content-Disposition: form-data; name="%s"; filename="%s"\r\n' % (key, filename) + buffer += 'Content-Type: %s\r\n' % contenttype + # buffer += 'Content-Length: %s\r\n' % file_size + fd.seek(0) + buffer += '\r\n' + fd.read() + '\r\n' + buffer += '--%s--\r\n\r\n' % boundary + return boundary, buffer + multipart_encode = Callable(multipart_encode) + + https_request = http_request + +## def main(): +## import tempfile, sys + +## validatorURL = "http://validator.w3.org/check" +## opener = urllib2.build_opener(MultipartPostHandler) + +## def validateFile(url): +## temp = tempfile.mkstemp(suffix=".html") +## os.write(temp[0], opener.open(url).read()) +## params = { "ss" : "0", # show source +## "doctype" : "Inline", +## "uploaded_file" : open(temp[1], "rb") } +## print opener.open(validatorURL, params).read() +## os.remove(temp[1]) + +## if len(sys.argv[1:]) > 0: +## for arg in sys.argv[1:]: +## validateFile(arg) +## else: +## validateFile("http://www.google.com") + +## if __name__=="__main__": +## main() diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv index 09922e0..f60c5db 100644 --- a/python_scrapers/OtherFilesToCopy.csv +++ b/python_scrapers/OtherFilesToCopy.csv @@ -4,4 +4,5 @@ "SouthOxfordshireParser.py", "420" "SouthOxfordshire.cgi", "493" "ApplicationSearchServletParser.py", "420" - +"AcolnetParser.py", "420" +"MultipartPostHandler.py", "420" diff --git a/python_scrapers/PublicAccessSites.csv b/python_scrapers/PublicAccessSites.csv index 5db8d69..70c20f3 100644 --- a/python_scrapers/PublicAccessSites.csv +++ b/python_scrapers/PublicAccessSites.csv @@ -82,11 +82,14 @@ "Dundee City Council", "Dundee", "http://bwarrant.dundeecity.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/dc/", "PublicAccess", "PublicAccessParser" "East Dorset District Council", "East Dorset", "http://193.243.228.16/PublicAccess/dc/", 
"PublicAccess", "PublicAccessParser" -"Epsom and Ewell Borough Council", "Epsom and Ewell", "http://eplanning.epsom-ewell.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Gateshead Metropolitan Borough Council", "Gateshead", "http://planning.gateshead.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" -"Gedling Borough Council", "Gedling", "http://publicaccess.gedling.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Gloucestershire County Council", "Gloucestershire", "http://planning.gloucestershire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Newcastle-under-Lyme Borough Council", "Newcastle-under-Lyme", "http://publicaccess.newcastle-staffs.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" "Vale Royal Borough Council", "Vale Royal", "http://pa.valeroyal.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Winchester City Council", "Winchester", "http://win2padmz.winchester.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Wolverhampton City Council", "Wolverhampton", "http://planningonline.wolverhampton.gov.uk/PublicAccess/dc/", "PublicAccess", "PublicAccessParser" +"Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BaberghParser" +"Basingstoke and Deane Borough Council", "Basingstoke and Deane", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BasingstokeParser" +"Bassetlaw District Council", "Bassetlaw","http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "BassetlawParser" +"Bury Metropolitan Borough Council", "Bury", "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "BuryParser" +"Derby City Council", "Derby", "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "DerbyParser" \ No newline at end of file