| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Craven District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Craven District Council"
authority_short_name = "Craven"
base_url = "http://www.planning.cravendc.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Eastleigh Borough Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Eastleigh Borough Council"
authority_short_name = "Eastleigh"
base_url = "http://www.eastleigh.gov.uk/FastWEB/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Eden District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Eden District Council"
authority_short_name = "Eden"
base_url = "http://eforms.eden.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,207 @@ | |||||
| import urllib2 | |||||
| import HTMLParser | |||||
| import urlparse | |||||
| import datetime | |||||
| from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication | |||||
# URL fragments, relative to an authority's FastWeb base URL.
#
# Example of a complete search URL:
# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007

# Search results for a single received date. Filled in with a mapping that
# supplies integer "scroll", "day", "month" and "year" values; "%%2F" yields
# the URL-encoded "/" separating the parts of each date.
search_form_url_end = (
    "results.asp?Scroll=%(scroll)d"
    "&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d"
    "&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
)

# Alternative template with a fixed start date, handy for testing paging:
#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

# Per-application pages; both are filled in with the council reference.
info_url_end = "detail.asp?AltRef=%s"
comment_url_end = "comment.asp?AltRef=%s"
class FastWeb:
    """Scraper for the planning applications listed on a FastWeb site.

    One instance is bound to one authority. Applications found by
    getResultsByDayMonthYear() are added to a single
    PlanningAuthorityResults object created in the constructor, so repeated
    searches on the same instance keep adding to the same result set.
    """

    # The paging loop assumes each "scroll" page of results holds this many
    # applications.
    RESULTS_PER_PAGE = 20

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Remember the authority details and create the empty result set.

        authority_name       -- full name of the authority
        authority_short_name -- short name of the authority
        base_url             -- URL of the FastWeb directory; the search,
                                detail and comment URLs are joined onto it
        debug                -- stored on the instance; not otherwise used
                                by this class
        """
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every application received on the given date.

        day, month and year are integers; datetime.date raises ValueError
        if they do not form a valid date. Returns the
        PlanningAuthorityResults object holding the applications found.
        """
        requested_date = datetime.date(year, month, day)

        # The first request tells us whether there are any results at all
        # and, if so, how many. After that we keep asking for the next
        # scroll page until all the reported results have been covered.
        scroll = 0
        first_time = True
        number_of_results = 0

        while first_time or scroll * self.RESULTS_PER_PAGE < number_of_results:
            scroll += 1

            this_search_url = search_form_url_end % {"scroll": scroll,
                                                     "day": day,
                                                     "month": month,
                                                     "year": year}
            url = urlparse.urljoin(self.base_url, this_search_url)

            response = urllib2.urlopen(url)
            try:
                contents = response.read()
                returned_url = response.geturl()
            finally:
                # Release the connection even if reading fails.
                response.close()

            # When there are no results the site sends us back to the
            # search page, e.g.
            # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
            if first_time and "search.asp" in returned_url:
                break

            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
            results_page_parser.feed(contents)

            if first_time:
                # The parser leaves the count as None when the page did not
                # contain it; treat that as "no further pages" rather than
                # failing on arithmetic with None.
                number_of_results = results_page_parser.number_of_results or 0
                first_time = False

        return self._results

    def getResults(self, day, month, year):
        """Return the applications received on the given date as XML.

        day, month and year may be anything int() accepts (for example the
        strings taken from a CGI query); int() raises TypeError or
        ValueError for values it cannot convert.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
| # States | |||||
| STARTING = 1 | |||||
| GOT_RESULTS_COUNT = 2 | |||||
| IN_RESULTS_TABLE = 3 | |||||
| IN_RESULTS_TABLE_TD = 4 | |||||
| IN_INNER_TABLE = 5 | |||||
| FINISHED = -1 | |||||
| class FastWebResultsPageParser(HTMLParser.HTMLParser): | |||||
| def __init__(self, results, requested_date, base_url): | |||||
| self.results = results | |||||
| self.requested_date = requested_date | |||||
| self.base_url = base_url | |||||
| HTMLParser.HTMLParser.__init__(self) | |||||
| # We'll use this to store the number of results returned for this search | |||||
| self.number_of_results = None | |||||
| self._state = STARTING | |||||
| self._td_count = None | |||||
| self._data_list = [] | |||||
| # This will store the planning application we are currently working on. | |||||
| self._current_application = None | |||||
| def get_data(self, flush=True): | |||||
| data = " ".join(self._data_list) | |||||
| if flush: | |||||
| self.flush_data() | |||||
| return data | |||||
| def flush_data(self): | |||||
| self._data_list = [] | |||||
| def handle_starttag(self, tag, attrs): | |||||
| if self._state == STARTING and tag == "input": | |||||
| self._state = GOT_RESULTS_COUNT | |||||
| # This is where the number of results returned is stored | |||||
| attr_dict = {} | |||||
| for attr_name, attr_value in attrs: | |||||
| attr_dict[attr_name] = attr_value | |||||
| if attr_dict.get("id") == "RecCount": | |||||
| self.number_of_results = int(attr_dict.get("value")) | |||||
| elif self._state == GOT_RESULTS_COUNT and tag == "table": | |||||
| self._state = IN_RESULTS_TABLE | |||||
| elif self._state == IN_RESULTS_TABLE and tag == "td": | |||||
| self._state = IN_RESULTS_TABLE_TD | |||||
| elif self._state == IN_RESULTS_TABLE_TD and tag == "table": | |||||
| self._state = IN_INNER_TABLE | |||||
| self._td_count = 0 | |||||
| self._current_application = PlanningApplication() | |||||
| self._current_application.date_received = self.requested_date | |||||
| elif self._state == IN_INNER_TABLE and tag == "td": | |||||
| self._td_count += 1 | |||||
| self.flush_data() | |||||
| def handle_endtag(self, tag): | |||||
| if self._state == IN_INNER_TABLE and tag == "table": | |||||
| # The next if should never be false, but it pays to be careful :-) | |||||
| if self._current_application.council_reference is not None: | |||||
| self.results.addApplication(self._current_application) | |||||
| self._state = IN_RESULTS_TABLE_TD | |||||
| elif self._state == IN_RESULTS_TABLE_TD and tag == "td": | |||||
| self._state = FINISHED | |||||
| elif self._state == IN_INNER_TABLE and tag == "td": | |||||
| if self._td_count == 2: | |||||
| # This data is the App No. | |||||
| council_reference = self.get_data().strip() | |||||
| self._current_application.council_reference = council_reference | |||||
| # This also gives us everything we need for the info and comment urls | |||||
| self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end %(council_reference)) | |||||
| self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %(council_reference)) | |||||
| elif self._td_count == 4: | |||||
| # This data is the address | |||||
| self._current_application.address = self.get_data().strip() | |||||
| self._current_application.postcode = getPostcodeFromText(self._current_application.address) | |||||
| elif self._td_count == 7: | |||||
| # This data is the description | |||||
| self._current_application.description = self.get_data().strip() | |||||
| def handle_data(self, data): | |||||
| self._data_list.append(data) | |||||
| # for debug purposes | |||||
| #cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/") | |||||
| #eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/") | |||||
| #suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/") | |||||
| #print eastleighparser.getResults(10,8,2007) | |||||
| #print cravenparser.getResults(25,12,2006) | |||||
| #print suttonparser.getResults(10,8,2007) | |||||
| #south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/") | |||||
| #print south_lakeland_parser.getResults(27,11,2006) | |||||
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Mansfield District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Mansfield District Council"
authority_short_name = "Mansfield"
base_url = "http://www.mansfield.gov.uk/Fastweb23/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -90,6 +90,7 @@ class PlanningApplication: | |||||
| return self.displayXML() | return self.displayXML() | ||||
| def displayXML(self): | def displayXML(self): | ||||
| #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | |||||
| return "<application>\n" +\ | return "<application>\n" +\ | ||||
| "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\ | "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\ | ||||
| "<address>%s</address>\n" %xmlQuote(self.address) +\ | "<address>%s</address>\n" %xmlQuote(self.address) +\ | ||||
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for South Lakeland District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "South Lakeland District Council"
authority_short_name = "South Lakeland"
base_url = "http://www.southlakeland.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for London Borough of Sutton. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "London Borough of Sutton"
authority_short_name = "Sutton"
base_url = "http://82.43.4.135/FASTWEB/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Welwyn-Hatfield District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Welwyn-Hatfield District Council"
authority_short_name = "Welwyn-Hatfield"
base_url = "https://fastweb.welhat.gov.uk/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -0,0 +1,29 @@ | |||||
| #!/usr/local/bin/python | |||||
| # This is the parser for Wyre Forest District Council. | |||||
| # it is generated from the file CGITemplate | |||||
| import cgi | |||||
| import cgitb | |||||
| #cgitb.enable(display=0, logdir="/tmp") | |||||
# Read the requested date from the CGI query parameters.
form = cgi.FieldStorage()
day = form.getfirst('day')
month = form.getfirst('month')
year = form.getfirst('year')

# Details of the authority whose FastWeb site is scraped.
authority_name = "Wyre Forest District Council"
authority_short_name = "Wyre Forest"
base_url = "http://www.wyreforest.gov.uk/fastweb/"

import FastWeb

parser = FastWeb.FastWeb(authority_name, authority_short_name, base_url)
xml = parser.getResults(day, month, year)

# A CGI response must separate its headers from the body with an empty
# line; without it the web server treats the XML as (malformed) headers.
# Single-argument parenthesised print behaves identically on Python 2.
print("Content-Type: text/xml")     # XML is following
print("")                           # blank line ends the header section
print(xml)                          # print the xml
| @@ -56,22 +56,15 @@ class FastWeb: | |||||
| url = urlparse.urljoin(self.base_url, this_search_url) | url = urlparse.urljoin(self.base_url, this_search_url) | ||||
| response = urllib2.urlopen(url) | response = urllib2.urlopen(url) | ||||
| #print response.info() | |||||
| #print response.geturl() | |||||
| contents = response.read() | contents = response.read() | ||||
| #print contents | |||||
| if first_time: | if first_time: | ||||
| # We can now use the returned URL to tell us if there were no results. | # We can now use the returned URL to tell us if there were no results. | ||||
| returned_url = response.geturl() | returned_url = response.geturl() | ||||
| #parsed_returned_url = urlparse.urlparse(returned_url) | |||||
| # example URL of no results page | # example URL of no results page | ||||
| # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none& | # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none& | ||||
| #print parsed_returned_url | |||||
| if returned_url.count("search.asp"): | if returned_url.count("search.asp"): | ||||
| #if parsed_returned_url[4] == "search.asp?Results=none&": | |||||
| # We got back the search page, there were no results for this date | # We got back the search page, there were no results for this date | ||||
| break | break | ||||
| @@ -136,7 +129,7 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser): | |||||
| def handle_starttag(self, tag, attrs): | def handle_starttag(self, tag, attrs): | ||||
| if self._state == STARTING and tag == "input": | if self._state == STARTING and tag == "input": | ||||
| self._state = GOT_RESULTS_COUNT | self._state = GOT_RESULTS_COUNT | ||||
| #print attrs | |||||
| # This is where the number of results returned is stored | # This is where the number of results returned is stored | ||||
| attr_dict = {} | attr_dict = {} | ||||
| @@ -145,7 +138,6 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser): | |||||
| if attr_dict.get("id") == "RecCount": | if attr_dict.get("id") == "RecCount": | ||||
| self.number_of_results = int(attr_dict.get("value")) | self.number_of_results = int(attr_dict.get("value")) | ||||
| #print self.number_of_results | |||||
| elif self._state == GOT_RESULTS_COUNT and tag == "table": | elif self._state == GOT_RESULTS_COUNT and tag == "table": | ||||
| self._state = IN_RESULTS_TABLE | self._state = IN_RESULTS_TABLE | ||||
| @@ -213,7 +205,3 @@ class FastWebResultsPageParser(HTMLParser.HTMLParser): | |||||
| #print south_lakeland_parser.getResults(27,11,2006) | #print south_lakeland_parser.getResults(27,11,2006) | ||||
| # To do | |||||
| # 3) integrate with other scrapers | |||||
| # 4) other fastweb sites | |||||