Automatically exported from code.google.com/p/planningalerts
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 
 

208 linhas
7.3 KiB

  1. import urllib2
  2. import HTMLParser
  3. import urlparse
  4. import datetime
  5. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  6. # example url
  7. # http://www.planning.cravendc.gov.uk/fastweb/results.asp?Scroll=1&DateReceivedStart=1%2F1%2F2007&DateReceivedEnd=1%2F7%2F2007
  8. search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=%(day)d%%2F%(month)d%%2F%(year)d&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
  9. # for testing paging
  10. #search_form_url_end = "results.asp?Scroll=%(scroll)d&DateReceivedStart=10%%2F7%%2F2007&DateReceivedEnd=%(day)d%%2F%(month)d%%2F%(year)d"
  11. comment_url_end = "comment.asp?AltRef=%s"
  12. info_url_end = "detail.asp?AltRef=%s"
  13. class FastWeb:
  14. def __init__(self,
  15. authority_name,
  16. authority_short_name,
  17. base_url,
  18. debug=False):
  19. self.authority_name = authority_name
  20. self.authority_short_name = authority_short_name
  21. self.base_url = base_url
  22. self.debug = debug
  23. # The object which stores our set of planning application results
  24. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  25. def getResultsByDayMonthYear(self, day, month, year):
  26. requested_date = datetime.date(year, month, day)
  27. # What we should do:
  28. #1) Work out if the page we get back is a results page or the search page again. The search page indicates no results for this day.
  29. # Assuming we have a results page:
  30. #2) Get the total number of results out of it. We can use this to work out how many times we need to request the page, and with what scroll numbers
  31. #3) Iterate over scroll numbers.
  32. scroll = 0
  33. first_time = True
  34. number_of_results = 0
  35. while first_time or scroll * 20 < number_of_results:
  36. scroll += 1
  37. this_search_url = search_form_url_end %{"scroll":scroll, "day":day, "month":month, "year":year}
  38. url = urlparse.urljoin(self.base_url, this_search_url)
  39. response = urllib2.urlopen(url)
  40. contents = response.read()
  41. if first_time:
  42. # We can now use the returned URL to tell us if there were no results.
  43. returned_url = response.geturl()
  44. # example URL of no results page
  45. # http://www.planning.cravendc.gov.uk/fastweb/search.asp?Results=none&
  46. if returned_url.count("search.asp"):
  47. # We got back the search page, there were no results for this date
  48. break
  49. results_page_parser = FastWebResultsPageParser(self._results, requested_date, self.base_url)
  50. results_page_parser.feed(contents)
  51. if first_time:
  52. number_of_results += results_page_parser.number_of_results
  53. first_time = False
  54. return self._results
  55. def getResults(self, day, month, year):
  56. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  57. # States
  58. STARTING = 1
  59. GOT_RESULTS_COUNT = 2
  60. IN_RESULTS_TABLE = 3
  61. IN_RESULTS_TABLE_TD = 4
  62. IN_INNER_TABLE = 5
  63. FINISHED = -1
  64. class FastWebResultsPageParser(HTMLParser.HTMLParser):
  65. def __init__(self, results, requested_date, base_url):
  66. self.results = results
  67. self.requested_date = requested_date
  68. self.base_url = base_url
  69. HTMLParser.HTMLParser.__init__(self)
  70. # We'll use this to store the number of results returned for this search
  71. self.number_of_results = None
  72. self._state = STARTING
  73. self._td_count = None
  74. self._data_list = []
  75. # This will store the planning application we are currently working on.
  76. self._current_application = None
  77. def get_data(self, flush=True):
  78. data = " ".join(self._data_list)
  79. if flush:
  80. self.flush_data()
  81. return data
  82. def flush_data(self):
  83. self._data_list = []
  84. def handle_starttag(self, tag, attrs):
  85. if self._state == STARTING and tag == "input":
  86. self._state = GOT_RESULTS_COUNT
  87. # This is where the number of results returned is stored
  88. attr_dict = {}
  89. for attr_name, attr_value in attrs:
  90. attr_dict[attr_name] = attr_value
  91. if attr_dict.get("id") == "RecCount":
  92. self.number_of_results = int(attr_dict.get("value"))
  93. elif self._state == GOT_RESULTS_COUNT and tag == "table":
  94. self._state = IN_RESULTS_TABLE
  95. elif self._state == IN_RESULTS_TABLE and tag == "td":
  96. self._state = IN_RESULTS_TABLE_TD
  97. elif self._state == IN_RESULTS_TABLE_TD and tag == "table":
  98. self._state = IN_INNER_TABLE
  99. self._td_count = 0
  100. self._current_application = PlanningApplication()
  101. self._current_application.date_received = self.requested_date
  102. elif self._state == IN_INNER_TABLE and tag == "td":
  103. self._td_count += 1
  104. self.flush_data()
  105. def handle_endtag(self, tag):
  106. if self._state == IN_INNER_TABLE and tag == "table":
  107. # The next if should never be false, but it pays to be careful :-)
  108. if self._current_application.council_reference is not None:
  109. self.results.addApplication(self._current_application)
  110. self._state = IN_RESULTS_TABLE_TD
  111. elif self._state == IN_RESULTS_TABLE_TD and tag == "td":
  112. self._state = FINISHED
  113. elif self._state == IN_INNER_TABLE and tag == "td":
  114. if self._td_count == 2:
  115. # This data is the App No.
  116. council_reference = self.get_data().strip()
  117. self._current_application.council_reference = council_reference
  118. # This also gives us everything we need for the info and comment urls
  119. self._current_application.info_url = urlparse.urljoin(self.base_url, info_url_end %(council_reference))
  120. self._current_application.comment_url = urlparse.urljoin(self.base_url, comment_url_end %(council_reference))
  121. elif self._td_count == 4:
  122. # This data is the address
  123. self._current_application.address = self.get_data().strip()
  124. self._current_application.postcode = getPostcodeFromText(self._current_application.address)
  125. elif self._td_count == 7:
  126. # This data is the description
  127. self._current_application.description = self.get_data().strip()
  128. def handle_data(self, data):
  129. self._data_list.append(data)
  130. # for debug purposes
  131. #cravenparser = FastWeb("Craven District Council", "Craven", "http://www.planning.cravendc.gov.uk/fastweb/")
  132. #eastleighparser = FastWeb("EastLeigh Borough Council", "Eastleigh", "http://www.eastleigh.gov.uk/FastWEB/")
  133. #suttonparser = FastWeb("Sutton", "Sutton", "http://82.43.4.135/FASTWEB/")
  134. #print eastleighparser.getResults(10,8,2007)
  135. #print cravenparser.getResults(25,12,2006)
  136. #print suttonparser.getResults(10,8,2007)
  137. #south_lakeland_parser = FastWeb("South Lakeland", "South Lakeland", "http://www.southlakeland.gov.uk/fastweb/")
  138. #print south_lakeland_parser.getResults(27,11,2006)