Automatically exported from code.google.com/p/planningalerts
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

PublicAccess.py 13 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. #!/usr/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. import urlparse
  5. import datetime, time
  6. import cookielib
# Module-level cookie jar shared by all parser instances in this module.
# PublicAccessParser.getResultsByDayMonthYear stores the cookies returned
# with the search form in it and adds them to its later search requests.
cookie_jar = cookielib.CookieJar()
  8. from PlanningUtils import fixNewlines, PlanningAuthorityResults, PlanningApplication
# Paths of the PublicAccess pages used by this module. Each one is joined
# onto an authority's base_url with urlparse.urljoin before use.
# Search form: fetched first, only to obtain the session cookie.
search_form_url_end = "tdc/DcApplication/application_searchform.aspx"
# Search results: the page PublicAccessParser actually parses.
search_results_url_end = "tdc/DcApplication/application_searchresults.aspx"
# Comment entry form: used to build each application's comment_url.
comments_url_end = "tdc/DcApplication/application_comments_entryform.aspx"
  12. class PublicAccessParser(HTMLParser.HTMLParser):
  13. """This is the class which parses the PublicAccess search results page.
  14. """
  15. def __init__(self,
  16. authority_name,
  17. authority_short_name,
  18. base_url,
  19. debug=False):
  20. HTMLParser.HTMLParser.__init__(self)
  21. self.authority_name = authority_name
  22. self.authority_short_name = authority_short_name
  23. self.base_url = base_url
  24. self.debug = debug
  25. # this will change to True when we enter the table of results
  26. self._in_results_table = False
  27. # this will be set to True when we have passed the header row
  28. # in the results table
  29. self._past_header_row = False
  30. # this will be true when we are in a <td> in the results table
  31. self._in_td = False
  32. # For each row, this will say how many tds we have seen so far
  33. self._td_count = 0
  34. # The object which stores our set of planning application results
  35. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  36. # This will store the planning application we are currently working on.
  37. self._current_application = None
  38. def handle_starttag(self, tag, attrs):
  39. if tag == "table":
  40. self.handle_start_table(attrs)
  41. # we are only interested in tr tags if we are in the results table
  42. elif self._in_results_table and tag == "tr":
  43. self.handle_start_tr(attrs)
  44. # we are only interested in td tags if we are in the results table
  45. elif self._in_results_table and tag == "td":
  46. self.handle_start_td(attrs)
  47. # we are only interested in <a> tags if we are in the 6th td in
  48. # the results table.
  49. elif self._in_td and self._td_count == 6 and tag == "a":
  50. self.handle_start_a(attrs)
  51. # If the tag is not one of these then we aren't interested
  52. def handle_endtag(self, tag):
  53. # we only need to consider end tags if we are in the results table
  54. if self._in_results_table:
  55. if tag == "table":
  56. self.handle_end_table()
  57. if tag == "tr":
  58. self.handle_end_tr()
  59. if tag == "td":
  60. self.handle_end_td()
  61. def handle_start_table(self, attrs):
  62. for attr,value in attrs:
  63. if attr == "class":
  64. if value == "cResultsForm":
  65. self._in_results_table = True
  66. break
  67. def handle_end_table(self):
  68. # If we see an end table tag, then note that we have left the
  69. # results table. This method is only called if we are in that table.
  70. self._in_results_table = False
  71. def handle_start_tr(self, attrs):
  72. # The first tr we meet in the results table is just headers
  73. # We will set a flag at the end of that tr to avoid creating
  74. # a blank PlanningApplication
  75. if self._past_header_row:
  76. # Create a candidate result object
  77. self._current_application = PlanningApplication()
  78. self._td_count = 0
  79. def handle_end_tr(self):
  80. # If we are in the results table, and not finishing the header row
  81. # append the current result to the results list.
  82. if self._past_header_row:
  83. self._results.addApplication(self._current_application)
  84. else:
  85. # The first row of the results table is headers
  86. # We want to do nothing until after it
  87. self._past_header_row = True
  88. def handle_start_td(self, attrs):
  89. # increase the td count by one
  90. self._td_count += 1
  91. # note that we are now in a td
  92. self._in_td = True
  93. def handle_end_td(self):
  94. # note that we are now not in a td
  95. self._in_td = False
  96. def handle_start_a(self, attrs):
  97. # this method is only getting called if we are in the
  98. # 6th td of a non-header row of the results table.
  99. # go through the attributes of the <a> looking for one
  100. # named 'href'
  101. for attr,value in attrs:
  102. if attr == "href":
  103. # the value of this tag is a relative url.
  104. # parse it so we can get the query string from it
  105. parsed_info_url = urlparse.urlparse(value)
  106. # the 4th part of the tuple is the query string
  107. query_string = parsed_info_url[4]
  108. # join this query string to the search URL, and store this as
  109. # the info URL of the current planning application
  110. self._current_application.info_url = urlparse.urljoin(self.base_url, value)
  111. # Join this query string to the comments URL, and store this as
  112. # the comments URL of the current planning application
  113. comments_url = urlparse.urljoin(self.base_url, comments_url_end)
  114. self._current_application.comment_url = urlparse.urljoin(comments_url, query_string)
  115. # while we're here, let's follow some links to find the postcode...
  116. # the postcode is in an input tag in the property page. This page
  117. # can be found by following the info url.
  118. # The newlines in the info page need fixing.
  119. info_file_contents = fixNewlines(urllib2.urlopen(self._current_application.info_url).read())
  120. info_file_parser = PublicAccessInfoPageParser()
  121. info_file_parser.feed(info_file_contents)
  122. property_page_url = urlparse.urljoin(self._current_application.info_url, info_file_parser.property_page_url)
  123. # the newlines in this page need fixing
  124. property_file_contents = fixNewlines(urllib2.urlopen(property_page_url).read())
  125. property_file_parser = PublicAccessPropertyPageParser()
  126. property_file_parser.feed(property_file_contents)
  127. # Set the postcode on the current planning application from the
  128. # one found on the property page
  129. if property_file_parser.postcode is not None:
  130. self._current_application.postcode = property_file_parser.postcode
  131. # There is no need for us to look at any more attributes.
  132. break
  133. def handle_data(self, data):
  134. if self._in_td:
  135. # The first td contains the reference
  136. if self._td_count == 1:
  137. self._current_application.council_reference = data
  138. # The second td contains the date the application was received
  139. elif self._td_count == 2:
  140. year, month, day = time.strptime(data, "%d/%m/%Y")[:3]
  141. received_date = datetime.date(year, month, day)
  142. self._current_application.date_received = received_date
  143. # The third td contains the address
  144. elif self._td_count == 3:
  145. #data = data.replace("^M","\n")
  146. self._current_application.address = data
  147. # The fourth td contains the description
  148. elif self._td_count == 4:
  149. self._current_application.description = data
  150. # 5 is status - we don't need it.
  151. # 6 is a button - this is where we will get our postcode,
  152. # comment_url, and info_url from (when handling the <a> tag).
  153. def getResultsByDayMonthYear(self, day, month, year):
  154. # First download the search form (in order to get a session cookie
  155. search_form_request = urllib2.Request(urlparse.urljoin(self.base_url, search_form_url_end))
  156. search_form_response = urllib2.urlopen(search_form_request)
  157. cookie_jar.extract_cookies(search_form_response, search_form_request)
  158. # We are only doing this first search in order to get a cookie
  159. # The paging on the site doesn't work with cookies turned off.
  160. search_data1 = urllib.urlencode({"searchType":"ADV",
  161. "caseNo":"",
  162. "PPReference":"",
  163. "AltReference":"",
  164. "srchtype":"",
  165. "srchstatus":"",
  166. "srchdecision":"",
  167. "srchapstatus":"",
  168. "srchappealdecision":"",
  169. "srchwardcode":"",
  170. "srchparishcode":"",
  171. "srchagentdetails":"",
  172. "srchDateReceivedStart":"%(day)02d/%(month)02d/%(year)d" %{"day":day ,"month": month ,"year": year},
  173. "srchDateReceivedEnd":"%(day)02d/%(month)02d/%(year)d" %{"day":day, "month":month, "year":year} })
  174. if self.debug:
  175. print search_data1
  176. search_url = urlparse.urljoin(self.base_url, search_results_url_end)
  177. request1 = urllib2.Request(search_url, search_data1)
  178. cookie_jar.add_cookie_header(request1)
  179. response1 = urllib2.urlopen(request1)
  180. # This search is the one we will actually use.
  181. # a maximum of 100 results are returned on this site,
  182. # hence setting "pagesize" to 100. I doubt there will ever
  183. # be more than 100 in one day in PublicAccess...
  184. # "currentpage" = 1 gets us to the first page of results
  185. # (there will only be one anyway, as we are asking for 100 results...)
  186. #http://planning.york.gov.uk/PublicAccess/tdc/DcApplication/application_searchresults.aspx?szSearchDescription=Applications%20received%20between%2022/02/2007%20and%2022/02/2007&searchType=ADV&bccaseno=&currentpage=2&pagesize=10&module=P3
  187. search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
  188. if self.debug:
  189. print search_data2
  190. # This time we want to do a get request, so add the search data into the url
  191. request2_url = urlparse.urljoin(self.base_url, search_results_url_end + "?" + search_data2)
  192. request2 = urllib2.Request(request2_url)
  193. # add the cookie we stored from our first search
  194. cookie_jar.add_cookie_header(request2)
  195. response2 = urllib2.urlopen(request2)
  196. contents = fixNewlines(response2.read())
  197. if self.debug:
  198. print contents
  199. self.feed(contents)
  200. return self._results
  201. def getResults(self, day, month, year):
  202. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  203. class PublicAccessInfoPageParser(HTMLParser.HTMLParser):
  204. """A parser to get the URL for the property details page out of the
  205. info page (this url is needed in order to get the postcode of the
  206. application.
  207. """
  208. def __init__(self):
  209. HTMLParser.HTMLParser.__init__(self)
  210. self.property_page_url = None
  211. def handle_starttag(self, tag, attrs):
  212. """The URL of the property details page is contained in an <a> tag in
  213. an attribute with key 'A_btnPropertyDetails'. There is some garbage on
  214. either side of it which we will have to clear up before storing it...
  215. We go through the <a> tags looking for one with an attribute with
  216. key 'id' and value 'A_btnPropertyDetails'. When we find it we go through
  217. its attributes looking for one with key 'href' - the value of this attribute
  218. contains the URL we want, after a bit of tidying up.
  219. Once we have got the URL, there is no need for us to look at any more <a> tags.
  220. """
  221. if tag == "a" and self.property_page_url is None:
  222. if attrs.count(("id","A_btnPropertyDetails")) > 0:
  223. for attr,value in attrs:
  224. if attr == "href":
  225. the_link = value
  226. # this has some garbage on either side of it...
  227. # let's strip that off
  228. # the garbage on the left is separated by whitespace.
  229. # the garbage on the right is separated by a "'".
  230. self.property_page_url = the_link.split()[1].split("'")[0]
  231. class PublicAccessPropertyPageParser(HTMLParser.HTMLParser):
  232. """A parser to get the postcode out of the property details page."""
  233. def __init__(self):
  234. HTMLParser.HTMLParser.__init__(self)
  235. self.postcode = None
  236. def handle_starttag(self, tag, attrs):
  237. """The postcode is contained in an <input> tag.
  238. This tag has an attribute 'name' with value postcode.
  239. It also has an attribute 'value' with value the postcode of this application.
  240. We go through the input tags looking for one with an attribute with
  241. key 'name' and value 'postcode'. When we find one,
  242. we look through its attributes for one with key 'value' - we store the value of this
  243. attribute as self.postcode.
  244. Once we have the postcode, there is no need to look at any more input tags.
  245. """
  246. if tag == "input" and self.postcode is None:
  247. if attrs.count(("name","postcode")) > 0:
  248. for attr,value in attrs:
  249. if attr == "value":
  250. self.postcode = value