diff --git a/python_scrapers/FastWeb.py b/python_scrapers/FastWeb.py
new file mode 100644
index 0000000..4386ce4
--- /dev/null
+++ b/python_scrapers/FastWeb.py
@@ -0,0 +1,219 @@
+
+import urllib2
+import HTMLParser
+import urlparse
+import datetime
+
+from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
+
+# example url
+# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007
+
+search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
+
+# for testing paging
+#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
+
+comment_url_end = "comment.asp?AltRef=%s"
+info_url_end = "detail.asp?AltRef=%s"
+
+class FastWeb:
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.debug = debug
+
+        # The object which stores our set of planning application results
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        requested_date = datetime.date(year, month, day)
+
+        # The plan:
+        # 1) Work out whether the page we get back is a results page or
+        #    the search page again - getting the search page back means
+        #    there were no results for this day.
+        # 2) If it is a results page, read the total number of results
+        #    out of it. That tells us how many times we need to request
+        #    the page, and with which scroll numbers.
+        # 3) Iterate over the scroll numbers (20 results per page).
+
+        scroll = 0
+        first_time = True
+        number_of_results = 0
+
+        while first_time or scroll * 20 < number_of_results:
+            scroll += 1
+
+            this_search_url = search_form_url_end %{"scroll": scroll, "day": day, "month": month, "year": year}
+            url = urlparse.urljoin(self.base_url, this_search_url)
+            response = urllib2.urlopen(url)
+
+            contents = response.read()
+
+            if first_time:
+                # We can now use the returned URL to tell us if there
+                # were no results.
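+                # (Assumption, based on the example URL below: a search
+                #  with no results redirects us to search.asp, while a
+                #  results page stays on results.asp, so the returned URL
+                #  alone is enough to tell the two cases apart.)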
+                returned_url = response.geturl()
+
+                # example URL of no results page:
+                # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
+                if returned_url.count("search.asp"):
+                    # We got back the search page: there were no results
+                    # for this date.
+                    break
+
+            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
+            results_page_parser.feed(contents)
+
+            if first_time:
+                number_of_results = results_page_parser.number_of_results
+
+            first_time = False
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+# States for the results page parser
+
+STARTING = 1
+GOT_RESULTS_COUNT = 2
+IN_RESULTS_TABLE = 3
+IN_RESULTS_TABLE_TD = 4
+IN_INNER_TABLE = 5
+FINISHED = -1
+
+
+class FastWebResultsPageParser(HTMLParser.HTMLParser):
+    def __init__(self, results, requested_date, base_url):
+        self.results = results
+        self.requested_date = requested_date
+        self.base_url = base_url
+
+        HTMLParser.HTMLParser.__init__(self)
+
+        # We'll use this to store the number of results returned for this search
+        self.number_of_results = None
+
+        self._state = STARTING
+        self._td_count = None
+
+        self._data_list = []
+
+        # This will store the planning application we are currently working on.
+        self._current_application = None
+
+    def get_data(self, flush=True):
+        data = " ".join(self._data_list)
+
+        if flush:
+            self.flush_data()
+
+        return data
+
+    def flush_data(self):
+        self._data_list = []
+
+    def handle_starttag(self, tag, attrs):
+        if self._state == STARTING and tag == "input":
+            # The number of results returned is stored in a hidden input
+            # with the id "RecCount". Only change state once we have
+            # actually found it, in case some other input comes first.
+            attr_dict = dict(attrs)
+
+            if attr_dict.get("id") == "RecCount":
+                self.number_of_results = int(attr_dict.get("value"))
+                self._state = GOT_RESULTS_COUNT
+
+        elif self._state == GOT_RESULTS_COUNT and tag == "table":
+            self._state = IN_RESULTS_TABLE
+
+        elif self._state == IN_RESULTS_TABLE and tag == "td":
+            self._state = IN_RESULTS_TABLE_TD
+
+        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
+            # Each inner table holds one planning application.
+            self._state = IN_INNER_TABLE
+            self._td_count = 0
+            self._current_application = PlanningApplication()
+            self._current_application.date_received = self.requested_date
+
+        elif self._state == IN_INNER_TABLE and tag == "td":
+            self._td_count += 1
+            self.flush_data()
+
+    def handle_endtag(self, tag):
+        if self._state == IN_INNER_TABLE and tag == "table":
+            # The next if should never be false, but it pays to be careful :-)
+            if self._current_application.council_reference is not None:
+                self.results.addApplication(self._current_application)
+            self._state = IN_RESULTS_TABLE_TD
+
+        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
+            self._state = FINISHED
+
+        elif self._state == IN_INNER_TABLE and tag == "td":
+            if self._td_count == 2:
+                # This data is the App No.
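+                # (The App No. is the application's AltRef, which is also
+                #  the key that detail.asp and comment.asp expect, so it
+                #  is reused verbatim in the URLs built below.)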
+                council_reference = self.get_data().strip()
+                self._current_application.council_reference = council_reference
+
+                # This also gives us everything we need for the info and comment urls
+                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end %(council_reference))
+                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %(council_reference))
+
+            elif self._td_count == 4:
+                # This data is the address
+                self._current_application.address = self.get_data().strip()
+                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
+
+            elif self._td_count == 7:
+                # This data is the description
+                self._current_application.description = self.get_data().strip()
+
+    def handle_data(self, data):
+        self._data_list.append(data)
+
+
+# for debug purposes
+
+#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")
+#eastleighparser = FastWeb("Eastleigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")
+#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")
+#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")
+
+#print eastleighparser.getResults(10,8,2007)
+#print cravenparser.getResults(25,12,2006)
+#print suttonparser.getResults(10,8,2007)
+#print south_lakeland_parser.getResults(27,11,2006)
+
+# To do
+
+# 3) integrate with other scrapers
+# 4) other fastweb sites
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index f60c5db..8ba04d7 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -6,3 +6,4 @@
 "ApplicationSearchServletParser.py", "420"
 "AcolnetParser.py", "420"
 "MultipartPostHandler.py", "420"
+"FastWeb.py", "420"
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index 8e12412..9210446 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -90,6 +90,7 @@ class PlanningApplication:
         return self.displayXML()
 
     def displayXML(self):
+        #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
         return "<application>\n" +\
             "<council_reference>%s</council_reference>\n" %xmlQuote(self.council_reference) +\
             "<address>%s</address>\n" %xmlQuote(self.address) +\
diff --git a/python_scrapers/PublicAccessSites.csv b/python_scrapers/PublicAccessSites.csv
index cfbba04..120c4af 100644
--- a/python_scrapers/PublicAccessSites.csv
+++ b/python_scrapers/PublicAccessSites.csv
@@ -112,4 +112,12 @@
 "South Bedfordshire District Council", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch", "AcolnetParser", "SouthBedfordshireParser"
 "Suffolk Coastal District Council", "Suffolk Coastal", "https://apps3.suffolkcoastal.gov.uk/planningonline/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SuffolkCoastalParser"
 "Surrey Heath Borough Council", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "SurreyHeathParser"
-"New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestDCParser"
\ No newline at end of file
+"New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "NewForestDCParser"
+"Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/", "FastWeb", "FastWeb"
+"Eastleigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/", "FastWeb", "FastWeb"
+"Eden District Council", "Eden", "http://eforms.eden.gov.uk/fastweb/", "FastWeb", "FastWeb"
+"Mansfield District Council", "Mansfield", "http://www.mansfield.gov.uk/Fastweb23/", "FastWeb", "FastWeb"
+"South Lakeland District Council", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/", "FastWeb", "FastWeb"
+"London Borough of Sutton", "Sutton", "http://82.43.4.135/FASTWEB/", "FastWeb", "FastWeb"
+"Welwyn-Hatfield District Council", "Welwyn-Hatfield", "https://fastweb.welhat.gov.uk/", "FastWeb", "FastWeb"
+"Wyre Forest District Council", "Wyre Forest", "http://www.wyreforest.gov.uk/fastweb/", "FastWeb", "FastWeb"
\ No newline at end of file
diff --git a/python_scrapers/README b/python_scrapers/README
index 477fdbc..bc4bbba 100644
--- a/python_scrapers/README
+++ b/python_scrapers/README
@@ -1,10 +1,9 @@
-In order to generate the contents of the CGI directory (../CGI)
+In order to generate the contents of the CGI directory (../cgi-bin/)
 run
 
 ./createCGI.sh
 
-This script svn deletes the old CGI directory's contents,
-generates new cgi files in the CGI directory,
+This script generates new cgi files in the CGI directory,
 copies in some other files that are needed,
 and commits all these changes to svn.
diff --git a/python_scrapers/generateCGIScripts.py b/python_scrapers/generateCGIScripts.py
index b597630..f29e65a 100755
--- a/python_scrapers/generateCGIScripts.py
+++ b/python_scrapers/generateCGIScripts.py
@@ -1,6 +1,6 @@
 #!/usr/local/bin/python
 
-list_of_sites_filename = "PublicAccessSites.csv"
+list_of_sites_filename = "SitesToGenerate.csv"
 other_files_to_copy_filename = "OtherFilesToCopy.csv"
 template_filename = "CGITemplate"
 python_location = "/usr/local/bin/python"