# Automatically exported from code.google.com/p/planningalerts
#
# ApplicationSearchServletParser.py (17 KiB)

  1. import urllib, urllib2
  2. import HTMLParser
  3. import urlparse
  4. import datetime, time
  5. from PlanningUtils import PlanningAuthorityResults, \
  6. getPostcodeFromText, \
  7. PlanningApplication
  8. # The search results list will give us reference, location, description,
  9. # and info url of each app.
  10. # The info page gives us the received date,
  11. # and comment_url
  12. class ApplicationSearchServletParser(HTMLParser.HTMLParser):
  13. """Parser for ApplicationSearchServlet sites.
  14. """
  15. # These indicate the column of the main table containing this
  16. # piece of information.
  17. # They should be overridden in subclasses
  18. #self._rows_to_ignore_at_start = None
  19. _reference_col_no = None
  20. _location_col_no = None
  21. _description_col_no = None
  22. def __init__(self,
  23. authority_name,
  24. authority_short_name,
  25. base_url,
  26. debug=False):
  27. HTMLParser.HTMLParser.__init__(self)
  28. self.authority_name = authority_name
  29. self.authority_short_name = authority_short_name
  30. self.base_url = base_url
  31. self.debug = debug
  32. self.search_url = urlparse.urljoin(self.base_url, "portal/servlets/ApplicationSearchServlet")
  33. self._comment_url = urlparse.urljoin(self.base_url, "portal/servlets/PlanningComments?REFNO=%(council_reference)s")
  34. self._requested_date = None
  35. # 0 - no
  36. # 1 - maybe
  37. # 2 - yes
  38. # 3 - finished
  39. self._in_results_table = 0
  40. self._tr_count = 0
  41. self._td_count = 0
  42. self._data_list = []
  43. # this will hold the application we are currently working on.
  44. self._current_application = None
  45. # The object which stores our set of planning application results
  46. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  47. def _checkAttrsForResultsTable(self, attrs):
  48. raise SystemError
  49. def handle_starttag(self, tag, attrs):
  50. if self.debug:
  51. print tag, attrs
  52. if tag == "table" and self._in_results_table == 0:
  53. self._in_results_table = 1
  54. self._checkAttrsForResultsTable(attrs)
  55. elif tag == "tr" and self._in_results_table == 2:
  56. self._tr_count += 1
  57. self._td_count = 0
  58. self._data_list = []
  59. self._current_application = PlanningApplication()
  60. elif tag == "td" and self._in_results_table == 2:
  61. self._td_count += 1
  62. elif tag == "a" and self._in_results_table == 2 and self._td_count == self._reference_col_no:
  63. # The href attribute contains the link to the info page
  64. for (key, value) in attrs:
  65. if key == "href":
  66. self._current_application.info_url = urlparse.urljoin(self.search_url, value)
  67. def handle_endtag(self, tag):
  68. if self.debug:
  69. print "ending: " , tag
  70. if tag == "table" and self._in_results_table == 2:
  71. self._in_results_table = 3
  72. elif tag == "tr" and self._in_results_table == 2:
  73. if self._current_application.council_reference is not None:
  74. # get the received date
  75. #info_response = urllib2.urlopen(self._current_application.info_url)
  76. #info_page_parser = InfoPageParser()
  77. #info_page_parser.feed(info_response.read())
  78. self._current_application.date_received = self._requested_date#info_page_parser.date_received
  79. self._results.addApplication(self._current_application)
  80. elif tag == "td" and self._in_results_table == 2:
  81. if self._td_count == self._location_col_no:
  82. data = ' '.join(self._data_list).strip()
  83. self._current_application.address = data
  84. postcode = getPostcodeFromText(data)
  85. if postcode is not None:
  86. self._current_application.postcode = postcode
  87. self._data_list = []
  88. elif self._td_count == self._description_col_no:
  89. data = ' '.join(self._data_list).strip()
  90. self._current_application.description = data
  91. self._data_list = []
  92. elif tag == 'a' and self._in_results_table == 2 and self._td_count == self._reference_col_no:
  93. data = ''.join(self._data_list).strip()
  94. self._current_application.council_reference = data
  95. self._current_application.comment_url = self._comment_url %{"council_reference": data}
  96. self._data_list = []
  97. def handle_data(self, data):
  98. if self.debug:
  99. print data
  100. if self._in_results_table == 2:
  101. if self._td_count == self._reference_col_no or \
  102. self._td_count == self._location_col_no or \
  103. self._td_count == self._description_col_no:
  104. self._data_list.append(data.strip())
  105. def getResultsByDayMonthYear(self, day, month, year):
  106. """This will return an ApplicationResults object containg the
  107. applications for the date passed in."""
  108. # Were going to need a datetime object for the requested date
  109. self._requested_date = datetime.date(year, month, day)
  110. required_format = "%d-%m-%Y"
  111. search_data = urllib.urlencode({"ReceivedDateFrom":self._requested_date.strftime(required_format),
  112. "ReceivedDateTo":self._requested_date.strftime(required_format)})
  113. search_request = urllib2.Request(self.search_url, search_data)
  114. search_response = urllib2.urlopen(search_request)
  115. search_contents = search_response.read()
  116. self.feed(search_contents)
  117. return self._results
  118. def getResults(self, day, month, year):
  119. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  120. class CoventrySearchParser(ApplicationSearchServletParser):
  121. # results table spotter
  122. # width="100%" border="0"
  123. _reference_col_no = 1
  124. _location_col_no = 5
  125. _description_col_no = 8
  126. def _checkAttrsForResultsTable(self, attrs):
  127. got_width = False
  128. got_border = False
  129. for key, value in attrs:
  130. if key == 'width' and value == '100%':
  131. got_width = True
  132. elif key == 'border' and value == '0':
  133. got_border = True
  134. if got_width and got_border:
  135. self._in_results_table = 2
  136. else:
  137. self._in_results_table = 0
  138. class AllerdaleSearchParser(ApplicationSearchServletParser):
  139. # results table spotter
  140. #class="nis_table" summary="Table of planning applications that matched your query, showing reference number, received date, and address"
  141. _reference_col_no = 1
  142. _location_col_no = 3
  143. _description_col_no = 4
  144. def _checkAttrsForResultsTable(self, attrs):
  145. got_class = False
  146. got_summary = False
  147. for key, value in attrs:
  148. if key == 'class' and value == 'nis_table':
  149. got_class = True
  150. elif key == 'summary' and value == 'Table of planning applications that matched your query, showing reference number, received date, and address':
  151. got_summary = True
  152. if got_class and got_summary:
  153. self._in_results_table = 2
  154. else:
  155. self._in_results_table = 0
  156. class AlnwickSearchParser(ApplicationSearchServletParser):
  157. # results table spotter
  158. # width="100%" class="niscontent"
  159. _reference_col_no = 1
  160. _location_col_no = 2
  161. _description_col_no = 7
  162. def _checkAttrsForResultsTable(self, attrs):
  163. got_class = False
  164. for key, value in attrs:
  165. if key == 'class' and value == 'niscontent':
  166. got_class = True
  167. if got_class:
  168. self._in_results_table = 2
  169. else:
  170. self._in_results_table = 0
  171. class BarrowSearchParser(ApplicationSearchServletParser):
  172. # results table spotter
  173. # width="100%" border="0"
  174. _reference_col_no = 1
  175. _location_col_no = 3
  176. _description_col_no = 6
  177. def _checkAttrsForResultsTable(self, attrs):
  178. got_width = False
  179. got_border = False
  180. for key, value in attrs:
  181. if key == 'width' and value == '100%':
  182. got_width = True
  183. elif key == 'border' and value == '0':
  184. got_border = True
  185. if got_width and got_border:
  186. self._in_results_table = 2
  187. else:
  188. self._in_results_table = 0
  189. class HartlepoolSearchParser(ApplicationSearchServletParser):
  190. # results table spotter
  191. # summary="Table of planning applications that matched your query, showing reference number, received date, and address"
  192. _reference_col_no = 1
  193. _location_col_no = 2
  194. _description_col_no = 3
  195. def _checkAttrsForResultsTable(self, attrs):
  196. got_summary = False
  197. for key, value in attrs:
  198. if key == 'summary' and value == "Table of planning applications that matched your query, showing reference number, received date, and address":
  199. got_summary = True
  200. if got_summary:
  201. self._in_results_table = 2
  202. else:
  203. self._in_results_table = 0
  204. class NorthWarksSearchParser(ApplicationSearchServletParser):
  205. # results table spotter
  206. # table width="100%" border="0" cellspacing="0" cellpadding="0"
  207. _reference_col_no = 1
  208. _location_col_no = 3
  209. _description_col_no = 4
  210. def _checkAttrsForResultsTable(self, attrs):
  211. got_width = False
  212. got_border = False
  213. got_cellspacing = False
  214. got_cellpadding = False
  215. for key, value in attrs:
  216. if key == 'width' and value == "100%":
  217. got_width = True
  218. elif key == 'border' and value == '0':
  219. got_border = True
  220. elif key == 'cellspacing' and value == '0':
  221. got_cellspacing = True
  222. elif key == 'cellpadding' and value == '0':
  223. got_cellpadding = True
  224. if got_width and got_border and got_cellspacing and got_cellpadding:
  225. self._in_results_table = 2
  226. else:
  227. self._in_results_table = 0
  228. class StHelensSearchParser(ApplicationSearchServletParser):
  229. # results table spotter
  230. # summary="Search Results List"
  231. _reference_col_no = 1
  232. _location_col_no = 2
  233. _description_col_no = 5
  234. def _checkAttrsForResultsTable(self, attrs):
  235. got_summary = False
  236. for key, value in attrs:
  237. if key == 'summary' and value == "Search Results List":
  238. got_summary = True
  239. if got_summary:
  240. self._in_results_table = 2
  241. else:
  242. self._in_results_table = 0
  243. class EasingtonSearchParser(ApplicationSearchServletParser):
  244. # results table spotter
  245. #table width="100%" border="0" cellspacing="0" cellpadding="0"
  246. _reference_col_no = 1
  247. _location_col_no = 3
  248. _description_col_no = 6
  249. def _checkAttrsForResultsTable(self, attrs):
  250. got_width = False
  251. got_border = False
  252. got_cellspacing = False
  253. got_cellpadding = False
  254. for key, value in attrs:
  255. if key == 'width' and value == "100%":
  256. got_width = True
  257. elif key == 'border' and value == '0':
  258. got_border = True
  259. elif key == 'cellspacing' and value == '0':
  260. got_cellspacing = True
  261. elif key == 'cellpadding' and value == '0':
  262. got_cellpadding = True
  263. if got_width and got_border and got_cellspacing and got_cellpadding:
  264. self._in_results_table = 2
  265. else:
  266. self._in_results_table = 0
  267. class HighPeakSearchParser(ApplicationSearchServletParser):
  268. # results table spotter
  269. # table class="data" width="95%"
  270. _reference_col_no = 1
  271. _location_col_no = 2
  272. _description_col_no = 5
  273. def _checkAttrsForResultsTable(self, attrs):
  274. got_class = False
  275. got_width = False
  276. for key, value in attrs:
  277. if key == 'class' and value == "data":
  278. got_class = True
  279. if key == 'width' and value == "95%":
  280. got_width = True
  281. if got_class and got_width:
  282. self._in_results_table = 2
  283. else:
  284. self._in_results_table = 0
  285. class WearValleySearchParser(ApplicationSearchServletParser):
  286. # results table spotter
  287. # table summary="Table of planning applications that matched your query, showing reference number, received date, and address"
  288. _reference_col_no = 1
  289. _location_col_no = 3
  290. _description_col_no = 4
  291. def _checkAttrsForResultsTable(self, attrs):
  292. got_summary= False
  293. for key, value in attrs:
  294. if key == 'summary' and value == "Table of planning applications that matched your query, showing reference number, received date, and address":
  295. got_summary = True
  296. if got_summary:
  297. self._in_results_table = 2
  298. else:
  299. self._in_results_table = 0
  300. class WellingboroughSearchParser(ApplicationSearchServletParser):
  301. # results table spotter
  302. #table width="100%" border="0"
  303. _reference_col_no = 1
  304. _location_col_no = 3
  305. _description_col_no = 6
  306. def _checkAttrsForResultsTable(self, attrs):
  307. got_width = False
  308. got_border = False
  309. for key, value in attrs:
  310. if key == 'width' and value == "100%":
  311. got_width = True
  312. elif key == 'border' and value == "0":
  313. got_border = True
  314. if got_width and got_border:
  315. self._in_results_table = 2
  316. else:
  317. self._in_results_table = 0
  318. class EalingSearchParser(ApplicationSearchServletParser):
  319. # results table spotter
  320. # table width="100%" cellspacing="0px" border="1px" cellpadding="2px" bordercolor="#FFFFFF"
  321. _reference_col_no = 1
  322. _location_col_no = 3
  323. _description_col_no = 4
  324. def _checkAttrsForResultsTable(self, attrs):
  325. got_width = False
  326. got_cellspacing = False
  327. got_border = False
  328. got_cellpadding = False
  329. got_bordercolor = False
  330. for key, value in attrs:
  331. if key == 'width' and value == "100%":
  332. got_width = True
  333. elif key == 'cellspacing' and value == "0px":
  334. got_cellspacing = True
  335. elif key == 'border' and value == "1px":
  336. got_border = True
  337. elif key == 'cellpadding' and value == "2px":
  338. got_cellpadding = True
  339. elif key == 'bordercolor' and value == "#FFFFFF":
  340. got_bordercolor = True
  341. if got_width and got_cellspacing and got_border and got_cellpadding and got_bordercolor:
  342. self._in_results_table = 2
  343. else:
  344. self._in_results_table = 0
  345. class HaringeySearchParser(ApplicationSearchServletParser):
  346. # results table spotter
  347. # summary="Application Results"
  348. _reference_col_no = 1
  349. _location_col_no = 2
  350. _description_col_no = 5
  351. def _checkAttrsForResultsTable(self, attrs):
  352. got_summary= False
  353. for key, value in attrs:
  354. if key == 'summary' and value == "Application Results":
  355. got_summary = True
  356. if got_summary:
  357. self._in_results_table = 2
  358. else:
  359. self._in_results_table = 0
  360. class DenbighshireSearchParser(ApplicationSearchServletParser):
  361. # results table spotter
  362. #table width="100%" border="0"
  363. _reference_col_no = 1
  364. _location_col_no = 3
  365. _description_col_no = 5
  366. def _checkAttrsForResultsTable(self, attrs):
  367. got_width = False
  368. got_border = False
  369. for key, value in attrs:
  370. if key == 'width' and value == "100%":
  371. got_width = True
  372. elif key == 'border' and value == "0":
  373. got_border = True
  374. if got_width and got_border:
  375. self._in_results_table = 2
  376. else:
  377. self._in_results_table = 0
  378. if __name__ == "__main__":
  379. #parser = CoventrySearchParser("Coventry", "Coventry", "http://planning.coventry.gov.uk")
  380. #parser = AllerdaleSearchParser("Allerdale", "Allerdale", "http://planning.allerdale.gov.uk")
  381. #parser = AlnwickSearchParser("Alnwick", "Alnwick", "http://services.castlemorpeth.gov.uk:7777")
  382. #parser = BarrowSearchParser("Barrow", "Barrow", "http://localportal.barrowbc.gov.uk")
  383. #parser = HartlepoolSearchParser("Hartlepool", "Hartlepool", "http://eforms.hartlepool.gov.uk:7777")
  384. #parser = NorthWarksSearchParser("North Warwickshire", "North Warks", "http://planning.northwarks.gov.uk")
  385. #parser = StHelensSearchParser("St Helens", "St Helens", "http://212.248.225.150:8080")
  386. #parser = EasingtonSearchParser("Easington", "Easington", "http://planning.easington.gov.uk")
  387. #parser = HighPeakSearchParser("High Peak", "High Peak", "http://planning.highpeak.gov.uk")
  388. #parser = WearValleySearchParser("Wear Valley", "Wear Valley", "http://planning.wearvalley.gov.uk")
  389. #parser = WellingboroughSearchParser("Wellingborough", "Wellingborough", "http://planning.wellingborough.gov.uk")
  390. #parser = EalingSearchParser("Ealing", "Ealing", "http://www.pam.ealing.gov.uk")
  391. parser = HaringeySearchParser("Haringey", "Haringey", "http://www.planningservices.haringey.gov.uk")
  392. #parser = DenbighshireSearchParser("Denbighshire", "Denbighshire", "http://planning.denbighshire.gov.uk")
  393. print parser.getResults(1,8,2008)