Automatically exported from code.google.com/p/planningalerts
import urllib2
import HTMLParser
import urlparse
import datetime

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

# Example URL:
# http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007
search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
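# How the template expands: the doubled "%%2F" survives %-interpolation as
# the URL-encoded "/" separating day, month and year in the dates. For example:
#
#   >>> search_form_url_end % {"scroll": 1, "day": 1, "month": 7, "year": 2007}
#   'results.asp?Scroll=1&DateReceivedStart=1%2F7%2F2007&DateReceivedEnd=1%2F7%2F2007'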
# For testing paging:
#search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"

comment_url_end = "comment.asp?AltRef=%s"
info_url_end = "detail.asp?AltRef=%s"
# A scraper for councils running the FastWeb planning system.
class FastWeb:
    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
    def getResultsByDayMonthYear(self, day, month, year):
        requested_date = datetime.date(year, month, day)

        # What we should do:
        # 1) Work out if the page we get back is a results page or the search
        #    page again. Getting the search page back indicates there were no
        #    results for this day.
        # Assuming we have a results page:
        # 2) Get the total number of results out of it. We can use this to
        #    work out how many times we need to request the page, and with
        #    what scroll numbers.
        # 3) Iterate over scroll numbers.
        scroll = 0
        first_time = True
        number_of_results = 0
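        # Results come back 20 to a page (that figure is read off the loop
        # condition below, not documented by FastWeb), so e.g. 45 results
        # means fetching with scroll values 1, 2 and 3.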
        while first_time or scroll * 20 < number_of_results:
            scroll += 1
            this_search_url = search_form_url_end % {"scroll": scroll, "day": day, "month": month, "year": year}
            url = urlparse.urljoin(self.base_url, this_search_url)
            response = urllib2.urlopen(url)
            #print response.info()
            #print response.geturl()
            contents = response.read()
            #print contents

            if first_time:
                # We can now use the returned URL to tell us if there were no results.
                returned_url = response.geturl()
                #parsed_returned_url = urlparse.urlparse(returned_url)
                # Example URL of a no-results page:
                # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
                #print parsed_returned_url
                if returned_url.count("search.asp"):
                    #if parsed_returned_url[4] == "search.asp?Results=none&":
                    # We got back the search page: there were no results for this date.
                    break

            results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
            results_page_parser.feed(contents)

            if first_time:
                number_of_results += results_page_parser.number_of_results
                first_time = False

        return self._results
    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
# States
STARTING = 1
GOT_RESULTS_COUNT = 2
IN_RESULTS_TABLE = 3
IN_RESULTS_TABLE_TD = 4
IN_INNER_TABLE = 5
FINISHED = -1
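# A sketch of the transitions the handlers below implement, inferred from the
# code rather than from any FastWeb documentation:
#
#   STARTING            --<input id="RecCount">--> GOT_RESULTS_COUNT
#   GOT_RESULTS_COUNT   --<table>-->               IN_RESULTS_TABLE
#   IN_RESULTS_TABLE    --<td>-->                  IN_RESULTS_TABLE_TD
#   IN_RESULTS_TABLE_TD --<table>-->               IN_INNER_TABLE  (one application)
#   IN_INNER_TABLE      --</table>-->              IN_RESULTS_TABLE_TD
#   IN_RESULTS_TABLE_TD --</td>-->                 FINISHED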
# Parses a FastWeb results page, pulling each planning application out of
# the nested results tables.
class FastWebResultsPageParser(HTMLParser.HTMLParser):
    def __init__(self, results, requested_date, base_url):
        self.results = results
        self.requested_date = requested_date
        self.base_url = base_url

        HTMLParser.HTMLParser.__init__(self)

        # We'll use this to store the number of results returned for this search.
        self.number_of_results = None

        self._state = STARTING
        self._td_count = None
        self._data_list = []

        # This will store the planning application we are currently working on.
        self._current_application = None

    def get_data(self, flush=True):
        data = " ".join(self._data_list)
        if flush:
            self.flush_data()
        return data

    def flush_data(self):
        self._data_list = []
    def handle_starttag(self, tag, attrs):
        if self._state == STARTING and tag == "input":
            self._state = GOT_RESULTS_COUNT
            #print attrs

            # This is where the number of results returned is stored.
            attr_dict = {}
            for attr_name, attr_value in attrs:
                attr_dict[attr_name] = attr_value

            if attr_dict.get("id") == "RecCount":
                self.number_of_results = int(attr_dict.get("value"))
                #print self.number_of_results
        elif self._state == GOT_RESULTS_COUNT and tag == "table":
            self._state = IN_RESULTS_TABLE
        elif self._state == IN_RESULTS_TABLE and tag == "td":
            self._state = IN_RESULTS_TABLE_TD
        elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
            self._state = IN_INNER_TABLE
            self._td_count = 0
            self._current_application = PlanningApplication()
            self._current_application.date_received = self.requested_date
        elif self._state == IN_INNER_TABLE and tag == "td":
            self._td_count += 1
            self.flush_data()
    def handle_endtag(self, tag):
        if self._state == IN_INNER_TABLE and tag == "table":
            # The next if should never be false, but it pays to be careful :-)
            if self._current_application.council_reference is not None:
                self.results.addApplication(self._current_application)
            self._state = IN_RESULTS_TABLE_TD
        elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
            self._state = FINISHED
        elif self._state == IN_INNER_TABLE and tag == "td":
            if self._td_count == 2:
                # This data is the App No.
                council_reference = self.get_data().strip()
                self._current_application.council_reference = council_reference

                # This also gives us everything we need for the info and comment URLs.
                self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end % council_reference)
                self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end % council_reference)
            elif self._td_count == 4:
                # This data is the address.
                self._current_application.address = self.get_data().strip()
                self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            elif self._td_count == 7:
                # This data is the description.
                self._current_application.description = self.get_data().strip()
    def handle_data(self, data):
        self._data_list.append(data)
# For debug purposes:
#cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")
#eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")
#suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")
#print eastleighparser.getResults(10,8,2007)
#print cravenparser.getResults(25,12,2006)
#print suttonparser.getResults(10,8,2007)
#south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")
#print south_lakeland_parser.getResults(27,11,2006)
# To do:
# 3) Integrate with other scrapers.
# 4) Other fastweb sites.
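# A minimal sketch of how one of these scrapers might be run directly, based
# on the debug calls above (assumes PlanningUtils is importable and the
# Craven site is still serving FastWeb):
if __name__ == '__main__':
    craven_parser = FastWeb("Craven District Council", "Craven",
                            "http://www.planning.cravendc.gov.uk/fastweb/")
    print craven_parser.getResults(25, 12, 2006)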