|
|
@@ -0,0 +1,219 @@ |
|
|
|
|
|
|
|
import urllib2 |
|
|
|
import HTMLParser |
|
|
|
import urlparse |
|
|
|
import datetime |
|
|
|
|
|
|
|
from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication |
|
|
|
|
|
|
|
# URL fragments that are joined onto an authority's FastWeb base URL.
#
# Example of a complete search URL:
# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007
#
# The search template takes a mapping with the keys "scroll", "day", "month"
# and "year".  The same day/month/year is used for both ends of the date
# range, and "%%2F" renders as "%2F" (a URL-encoded "/").
search_form_url_end = (
    "results.asp?Scroll=%(scroll)d"
    "&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d"
    "&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
)

# Alternative template with a fixed start date, kept for testing paging:
#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

# Both of these take the application reference as their single "%s" argument.
comment_url_end = "comment.asp?AltRef=%s"
info_url_end = "detail.asp?AltRef=%s"
|
|
|
|
|
|
|
class FastWeb:
    """Scraper for planning applications listed on a FastWeb site.

    One instance is bound to a single authority.  Every application that is
    scraped is added to the same results object (created once in __init__),
    so repeated calls on one instance accumulate applications.
    """

    # The paging loop requests another page while ``scroll * 20`` is below the
    # reported result count, i.e. the site presumably serves 20 results per
    # page.  NOTE(review): confirm the page size against a live site.
    RESULTS_PER_PAGE = 20

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Remember the authority details and create the results container.

        authority_name / authority_short_name are passed straight to
        PlanningAuthorityResults.  base_url is the URL that the search,
        detail and comment URL fragments are joined onto.  debug is stored
        but not otherwise used by this class.
        """
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url

        self.debug = debug

        # The object which stores our set of planning application results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch every results page for one date and return the results object.

        day, month and year are integers.  datetime.date raises ValueError
        for an impossible date, and urllib2.urlopen may raise on network
        failure; neither is caught here.

        How it works:
          1) Request the first page.  If the URL we end up on contains
             "search.asp" we were sent back to the search form, which means
             there are no results for this date.
          2) Otherwise read the total number of results from the first page.
          3) Keep requesting increasing scroll numbers until that many
             results have been covered.
        """
        requested_date = datetime.date(year, month, day)

        scroll = 0
        first_time = True
        number_of_results = 0

        while first_time or scroll * self.RESULTS_PER_PAGE < number_of_results:
            scroll += 1

            this_search_url = search_form_url_end % {"scroll": scroll,
                                                     "day": day,
                                                     "month": month,
                                                     "year": year}
            url = urlparse.urljoin(self.base_url, this_search_url)

            # Always release the connection, even if reading fails part way.
            response = urllib2.urlopen(url)
            try:
                contents = response.read()
                returned_url = response.geturl()
            finally:
                response.close()

            # example URL of no results page
            # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
            if first_time and returned_url.count("search.asp"):
                # We got back the search page, there were no results for this date
                break

            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
            results_page_parser.feed(contents)

            if first_time:
                # The parser leaves number_of_results as None when it could
                # not find a count; treat that as "no further pages" rather
                # than failing with a TypeError on the addition.
                number_of_results += results_page_parser.number_of_results or 0

            first_time = False

        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: accept anything int() can convert for the date
        parts and return the result of displayXML() on the results object."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# States of the results page parser, in the order a page normally visits them:
#   STARTING            - nothing of interest seen yet
#   GOT_RESULTS_COUNT   - the <input> tag holding the result count was handled
#   IN_RESULTS_TABLE    - inside the first <table> after that input
#   IN_RESULTS_TABLE_TD - inside a <td> of the results table
#   IN_INNER_TABLE      - inside a nested <table>, one per planning application
#   FINISHED            - the results <td> has closed; everything else is ignored
(STARTING,
 GOT_RESULTS_COUNT,
 IN_RESULTS_TABLE,
 IN_RESULTS_TABLE_TD,
 IN_INNER_TABLE) = range(1, 6)

FINISHED = -1
|
|
|
|
|
|
|
|
|
|
|
class FastWebResultsPageParser(HTMLParser.HTMLParser):
    """State-machine parser for a single FastWeb results page.

    Feed it the HTML of one results page.  It records the total number of
    results advertised by the page in ``number_of_results`` and calls
    ``results.addApplication`` once for every application table it finds.
    """

    def __init__(self, results, requested_date, base_url):
        """Set up the parser.

        results        -- object whose addApplication() receives each
                          completed application
        requested_date -- stored as date_received on every application
        base_url       -- base that the detail and comment URLs are joined to
        """
        self.results = results

        self.requested_date = requested_date
        self.base_url = base_url

        HTMLParser.HTMLParser.__init__(self)

        # We'll use this to store the number of results returned for this
        # search.  It stays None if the page has no RecCount input.
        self.number_of_results = None

        self._state = STARTING
        # Number of <td> tags opened so far inside the current inner table.
        self._td_count = None

        # Text fragments collected since the last flush.
        self._data_list = []

        # This will store the planning application we are currently working on.
        self._current_application = None

    def get_data(self, flush=True):
        """Return the collected text fragments joined with single spaces.

        The collected fragments are discarded afterwards unless flush is
        false.
        """
        data = " ".join(self._data_list)

        if flush:
            self.flush_data()

        return data

    def flush_data(self):
        """Discard any text collected so far."""
        self._data_list = []

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on opening tags."""
        if self._state == STARTING and tag == "input":
            # This is where the number of results returned is stored.  Only
            # leave STARTING once the RecCount input itself has been seen, so
            # that an unrelated <input> earlier in the page cannot make us
            # skip the count.
            attr_dict = dict(attrs)

            if attr_dict.get("id") == "RecCount":
                self.number_of_results = int(attr_dict.get("value"))
                self._state = GOT_RESULTS_COUNT

        elif self._state == GOT_RESULTS_COUNT and tag == "table":
            self._state = IN_RESULTS_TABLE

        elif self._state == IN_RESULTS_TABLE and tag == "td":
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
            # Each nested table describes one planning application.
            self._state = IN_INNER_TABLE
            self._td_count = 0
            self._current_application = PlanningApplication()
            self._current_application.date_received = self.requested_date

        elif self._state == IN_INNER_TABLE and tag == "td":
            # Start each cell with an empty buffer so that get_data() at the
            # closing tag returns this cell's text only.
            self._td_count += 1
            self.flush_data()

    def handle_endtag(self, tag):
        """Store cell contents and finished applications on closing tags."""
        if self._state == IN_INNER_TABLE and tag == "table":
            # The next if should never be false, but it pays to be careful :-)
            if self._current_application.council_reference is not None:
                self.results.addApplication(self._current_application)
            self._state = IN_RESULTS_TABLE_TD

        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
            # The cell holding all the application tables has closed.
            self._state = FINISHED

        elif self._state == IN_INNER_TABLE and tag == "td":
            if self._td_count == 2:
                # This data is the App No.
                council_reference = self.get_data().strip()
                self._current_application.council_reference = council_reference

                # This also gives us everything we need for the info and comment urls
                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end % (council_reference))
                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end % (council_reference))

            elif self._td_count == 4:
                # This data is the address
                self._current_application.address = self.get_data().strip()
                self._current_application.postcode = getPostcodeFromText(self._current_application.address)

            elif self._td_count == 7:
                # This data is the description
                self._current_application.description = self.get_data().strip()

    def handle_data(self, data):
        """Collect text; it is read back with get_data()."""
        self._data_list.append(data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# for debug purposes |
|
|
|
|
|
|
|
#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/") |
|
|
|
|
|
|
|
#eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/") |
|
|
|
|
|
|
|
|
|
|
|
#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/") |
|
|
|
|
|
|
|
#print eastleighparser.getResults(10,8,2007) |
|
|
|
#print cravenparser.getResults(25,12,2006) |
|
|
|
#print suttonparser.getResults(10,8,2007) |
|
|
|
|
|
|
|
#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/") |
|
|
|
|
|
|
|
#print south_lakeland_parser.getResults(27,11,2006) |
|
|
|
|
|
|
|
# To do |
|
|
|
|
|
|
|
# 3) integrate with other scrapers |
|
|
|
# 4) other fastweb sites |