From 7df8e7ea93aa206faf87c6cd893f53ffbad0e798 Mon Sep 17 00:00:00 2001
From: "duncan.parkes" <duncan.parkes@cf68f80a-222c-0410-aabe-5f79d0504a29>
Date: Tue, 11 Sep 2007 11:40:10 +0000
Subject: [PATCH] add the generated scrapers for fastweb sites

---
 cgi-bin/Craven.cgi          |  29 +++++
 cgi-bin/Eastleigh.cgi       |  29 +++++
 cgi-bin/Eden.cgi            |  29 +++++
 cgi-bin/FastWeb.py          | 207 ++++++++++++++++++++++++++++++++++++
 cgi-bin/Mansfield.cgi       |  29 +++++
 cgi-bin/PlanningUtils.py    |   1 +
 cgi-bin/South Lakeland.cgi  |  29 +++++
 cgi-bin/Sutton.cgi          |  29 +++++
 cgi-bin/Welwyn-Hatfield.cgi |  29 +++++
 cgi-bin/Wyre Forest.cgi     |  29 +++++
 python_scrapers/FastWeb.py  |  14 +--
 11 files changed, 441 insertions(+), 13 deletions(-)
 create mode 100755 cgi-bin/Craven.cgi
 create mode 100755 cgi-bin/Eastleigh.cgi
 create mode 100755 cgi-bin/Eden.cgi
 create mode 100644 cgi-bin/FastWeb.py
 create mode 100755 cgi-bin/Mansfield.cgi
 create mode 100755 cgi-bin/South Lakeland.cgi
 create mode 100755 cgi-bin/Sutton.cgi
 create mode 100755 cgi-bin/Welwyn-Hatfield.cgi
 create mode 100755 cgi-bin/Wyre Forest.cgi

diff --git a/cgi-bin/Craven.cgi b/cgi-bin/Craven.cgi
new file mode 100755
index 0000000..7f16621
--- /dev/null
+++ b/cgi-bin/Craven.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Craven District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Craven District Council"
+authority_short_name = "Craven"
+base_url = "http://www.planning.cravendc.gov.uk/fastweb/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/Eastleigh.cgi b/cgi-bin/Eastleigh.cgi
new file mode 100755
index 0000000..f2453c4
--- /dev/null
+++ b/cgi-bin/Eastleigh.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Eastleigh Borough Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Eastleigh Borough Council"
+authority_short_name = "Eastleigh"
+base_url = "http://www.eastleigh.gov.uk/FastWEB/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/Eden.cgi b/cgi-bin/Eden.cgi
new file mode 100755
index 0000000..4c289b9
--- /dev/null
+++ b/cgi-bin/Eden.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Eden District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Eden District Council"
+authority_short_name = "Eden"
+base_url = "http://eforms.eden.gov.uk/fastweb/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/FastWeb.py b/cgi-bin/FastWeb.py
new file mode 100644
index 0000000..508d893
--- /dev/null
+++ b/cgi-bin/FastWeb.py
@@ -0,0 +1,207 @@
+
+import urllib2
+import HTMLParser
+import urlparse
+import datetime
+
+from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
+
+# example url
+# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007
+
+search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
+
+# for testing paging
+#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
+
+comment_url_end = "comment.asp?AltRef=%s"
+info_url_end = "detail.asp?AltRef=%s"
+
+class FastWeb:
+    """Scraper for planning applications published by a FastWeb site."""
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+        """Remember the authority's names and the base URL of its FastWeb site."""
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+        self.debug = debug
+
+        # The object which stores our set of planning application results
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Collect the applications received on the given date (ints).
+
+        Result pages are requested with increasing Scroll numbers until the
+        total reported by the first page is covered; if the first response
+        comes back from search.asp there were no results for that date.
+        Returns this scraper's PlanningAuthorityResults object.
+        """
+        requested_date = datetime.date(year, month, day)
+
+        scroll = 0
+        first_time = True
+        number_of_results = 0
+
+        # The paging test assumes 20 results per page.
+        while first_time or scroll * 20 < number_of_results:
+            scroll += 1
+            this_search_url = search_form_url_end %{"scroll":scroll, "day":day, "month":month, "year":year}
+            url = urlparse.urljoin(self.base_url, this_search_url)
+            response = urllib2.urlopen(url)
+            try:
+                contents = response.read()
+                returned_url = response.geturl()
+            finally:
+                response.close()
+
+            # example URL of no results page
+            # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
+            if first_time and returned_url.count("search.asp"):
+                # We got back the search page, there were no results for this date
+                break
+
+            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
+            results_page_parser.feed(contents)
+
+            if first_time and results_page_parser.number_of_results is not None:
+                # The parser leaves the count as None when it finds no RecCount
+                # input; adding None here would raise a TypeError.
+                number_of_results += results_page_parser.number_of_results
+            first_time = False
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        """Return the day's applications as XML; arguments may be numeric strings."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+
+# States of the FastWebResultsPageParser state machine
+
+STARTING = 1
+GOT_RESULTS_COUNT = 2
+IN_RESULTS_TABLE = 3
+IN_RESULTS_TABLE_TD = 4
+IN_INNER_TABLE = 5
+FINISHED = -1
+
+
+class FastWebResultsPageParser(HTMLParser.HTMLParser):
+    """State-machine parser which pulls applications out of one results page."""
+
+    def __init__(self, results, requested_date, base_url):
+        """Prepare to add the applications found on a page to results."""
+        HTMLParser.HTMLParser.__init__(self)
+
+        self.results = results
+        self.requested_date = requested_date
+        self.base_url = base_url
+
+        # We'll use this to store the number of results returned for this search
+        self.number_of_results = None
+
+        self._state = STARTING
+        self._td_count = None
+
+        self._data_list = []
+
+        # This will store the planning application we are currently working on.
+        self._current_application = None
+
+    def get_data(self, flush=True):
+        """Return the text collected so far, joined by spaces; clear it if flush."""
+        data = " ".join(self._data_list)
+        if flush:
+            self.flush_data()
+
+        return data
+
+    def flush_data(self):
+        """Throw away any text collected so far."""
+        self._data_list = []
+
+    def handle_starttag(self, tag, attrs):
+        """Advance the state machine on opening tags."""
+        if self._state == STARTING and tag == "input":
+            # The input with id "RecCount" is where the number of results
+            # returned is stored. Keep looking until we find it: moving on at
+            # the first input of any kind could leave number_of_results None.
+            attr_dict = dict(attrs)
+            if attr_dict.get("id") == "RecCount":
+                self.number_of_results = int(attr_dict.get("value"))
+                self._state = GOT_RESULTS_COUNT
+
+        elif self._state == GOT_RESULTS_COUNT and tag == "table":
+            self._state = IN_RESULTS_TABLE
+
+        elif self._state == IN_RESULTS_TABLE and tag == "td":
+            self._state = IN_RESULTS_TABLE_TD
+        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
+            self._state = IN_INNER_TABLE
+            self._td_count = 0
+            self._current_application = PlanningApplication()
+            self._current_application.date_received = self.requested_date
+
+        elif self._state == IN_INNER_TABLE and tag == "td":
+            self._td_count += 1
+            self.flush_data()
+
+    def handle_endtag(self, tag):
+        """Store the cell just finished, or the whole application, on closing tags."""
+        if self._state == IN_INNER_TABLE and tag == "table":
+            # The next if should never be false, but it pays to be careful :-)
+            if self._current_application.council_reference is not None:
+                self.results.addApplication(self._current_application)
+            self._state = IN_RESULTS_TABLE_TD
+
+        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
+            self._state = FINISHED
+
+        elif self._state == IN_INNER_TABLE and tag == "td":
+            if self._td_count == 2:
+                # This data is the App No.
+                council_reference = self.get_data().strip()
+                self._current_application.council_reference = council_reference
+
+                # This also gives us everything we need for the info and comment urls
+                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end %(council_reference))
+                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %(council_reference))
+
+            elif self._td_count == 4:
+                # This data is the address
+                self._current_application.address = self.get_data().strip()
+                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
+            elif self._td_count == 7:
+                # This data is the description
+                self._current_application.description = self.get_data().strip()
+
+    def handle_data(self, data):
+        """Collect text until the next get_data or flush_data call."""
+        self._data_list.append(data)
+
+        
+    
+# for debug purposes
+
+#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")
+
+#eastleighparser = FastWeb("Eastleigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")
+
+
+#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")
+
+#print eastleighparser.getResults(10,8,2007)
+#print cravenparser.getResults(25,12,2006)
+#print suttonparser.getResults(10,8,2007)
+
+#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")
+
+#print south_lakeland_parser.getResults(27,11,2006)
+
diff --git a/cgi-bin/Mansfield.cgi b/cgi-bin/Mansfield.cgi
new file mode 100755
index 0000000..d9bc0f6
--- /dev/null
+++ b/cgi-bin/Mansfield.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Mansfield District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Mansfield District Council"
+authority_short_name = "Mansfield"
+base_url = "http://www.mansfield.gov.uk/Fastweb23/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/PlanningUtils.py b/cgi-bin/PlanningUtils.py
index 8e12412..9210446 100644
--- a/cgi-bin/PlanningUtils.py
+++ b/cgi-bin/PlanningUtils.py
@@ -90,6 +90,7 @@ class PlanningApplication:
 	return self.displayXML()
         
     def displayXML(self):
+        #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
 	return "<application>\n" +\
 	"<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
         "<address>%s</address>\n" %xmlQuote(self.address) +\
diff --git a/cgi-bin/South Lakeland.cgi b/cgi-bin/South Lakeland.cgi
new file mode 100755
index 0000000..36461e4
--- /dev/null
+++ b/cgi-bin/South Lakeland.cgi	
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for South Lakeland District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "South Lakeland District Council"
+authority_short_name = "South Lakeland"
+base_url = "http://www.southlakeland.gov.uk/fastweb/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/Sutton.cgi b/cgi-bin/Sutton.cgi
new file mode 100755
index 0000000..b763f3e
--- /dev/null
+++ b/cgi-bin/Sutton.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for London Borough of Sutton.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "London Borough of Sutton"
+authority_short_name = "Sutton"
+base_url = "http://82.43.4.135/FASTWEB/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/Welwyn-Hatfield.cgi b/cgi-bin/Welwyn-Hatfield.cgi
new file mode 100755
index 0000000..2babb8c
--- /dev/null
+++ b/cgi-bin/Welwyn-Hatfield.cgi
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Welwyn-Hatfield District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Welwyn-Hatfield District Council"
+authority_short_name = "Welwyn-Hatfield"
+base_url = "https://fastweb.welhat.gov.uk/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/cgi-bin/Wyre Forest.cgi b/cgi-bin/Wyre Forest.cgi
new file mode 100755
index 0000000..9b8816d
--- /dev/null
+++ b/cgi-bin/Wyre Forest.cgi	
@@ -0,0 +1,29 @@
+#!/usr/local/bin/python
+
+# This is the parser for Wyre Forest District Council.
+# it is generated from the file CGITemplate
+
+import cgi
+import cgitb
+#cgitb.enable(display=0, logdir="/tmp")
+
+
+form = cgi.FieldStorage()
+day = form.getfirst('day')
+month = form.getfirst('month')
+year = form.getfirst('year')
+
+
+authority_name = "Wyre Forest District Council"
+authority_short_name = "Wyre Forest"
+base_url = "http://www.wyreforest.gov.uk/fastweb/"
+
+import FastWeb
+
+parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
+
+xml = parser.getResults(day, month, year)
+
+print "Content-Type: text/xml"     # XML is following
+print
+print xml                          # print the xml
diff --git a/python_scrapers/FastWeb.py b/python_scrapers/FastWeb.py
index 4386ce4..508d893 100644
--- a/python_scrapers/FastWeb.py
+++ b/python_scrapers/FastWeb.py
@@ -56,22 +56,15 @@ class FastWeb:
             url = urlparse.urljoin(self.base_url, this_search_url)
             response = urllib2.urlopen(url)
 
-            #print response.info()
-            #print response.geturl()
-
             contents = response.read()
-            #print contents
 
             if first_time:
                 # We can now use the returned URL to tell us if there were no results.
                 returned_url = response.geturl()
-                #parsed_returned_url = urlparse.urlparse(returned_url)
 
                 # example URL of no results page
                 # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
-                #print parsed_returned_url
                 if returned_url.count("search.asp"):
-                #if parsed_returned_url[4] == "search.asp?Results=none&":
                     # We got back the search page, there were no results for this date
                     break
             
@@ -136,7 +129,7 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):
     def handle_starttag(self, tag, attrs):
         if self._state == STARTING and tag == "input":
             self._state = GOT_RESULTS_COUNT
-            #print attrs
+
             # This is where the number of results returned is stored
             attr_dict = {}
             
@@ -145,7 +138,6 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):
                 
             if attr_dict.get("id") == "RecCount":
                 self.number_of_results = int(attr_dict.get("value"))
-                #print self.number_of_results
 
         elif self._state == GOT_RESULTS_COUNT and tag == "table":
             self._state = IN_RESULTS_TABLE
@@ -213,7 +205,3 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser):
 
 #print south_lakeland_parser.getResults(27,11,2006)
 
-# To do
-
-# 3) integrate with other scrapers
-# 4) other fastweb sites