Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

AcolnetParser.py 14 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. import urlparse
  6. import re
  7. end_head_regex = re.compile("</head", re.IGNORECASE)
  8. import MultipartPostHandler
  9. # this is not mine, or part of standard python (though it should be!)
  10. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  11. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  12. from datetime import date
  13. from time import strptime
  14. date_format = "%d/%m/%Y"
  15. our_date = date(2007,4,25)
  16. #This is to get the system key out of the info url
  17. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  18. class AcolnetParser(HTMLParser.HTMLParser):
  19. case_number_tr = None # this one can be got by the td class attribute
  20. reg_date_tr = None
  21. location_tr = None
  22. proposal_tr = None
  23. # There is no online comment facility in these, so we provide an
  24. # appropriate email address instead
  25. comments_email_address = None
  26. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  27. def __init__(self,
  28. authority_name,
  29. authority_short_name,
  30. base_url,
  31. debug=False):
  32. HTMLParser.HTMLParser.__init__(self)
  33. self.authority_name = authority_name
  34. self.authority_short_name = authority_short_name
  35. self.base_url = base_url
  36. self.debug = debug
  37. self._tr_number = 0
  38. # This will be used to track the subtable depth
  39. # when we are in a results-table, in order to
  40. # avoid adding an application before we have got to
  41. # the end of the results-table
  42. self._subtable_depth = None
  43. self._in_td = False
  44. # This in where we store the results
  45. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  46. # This will store the planning application we are currently working on.
  47. self._current_application = None
  48. def _cleanupHTML(self, html):
  49. """This method should be overridden in subclasses to perform site specific
  50. HTML cleanup."""
  51. return html
  52. def handle_starttag(self, tag, attrs):
  53. #print tag, attrs
  54. if tag == "table":
  55. if self._current_application is None:
  56. # Each application is in a separate table with class "results-table"
  57. for key, value in attrs:
  58. if key == "class" and value == "results-table":
  59. #print "found results-table"
  60. self._current_application = PlanningApplication()
  61. self._tr_number = 0
  62. self._subtable_depth = 0
  63. self._current_application.comment_url = self.comments_email_address
  64. break
  65. else:
  66. # We are already in a results-table, and this is the start of a subtable,
  67. # so increment the subtable depth.
  68. self._subtable_depth += 1
  69. elif self._current_application is not None:
  70. if tag == "tr" and self._subtable_depth == 0:
  71. self._tr_number += 1
  72. if tag == "td":
  73. self._in_td = True
  74. if tag == "a" and self._tr_number == self.case_number_tr:
  75. # this is where we get the info link and the case number
  76. for key, value in attrs:
  77. if key == "href":
  78. self._current_application.info_url = value
  79. system_key = system_key_regex.search(value).groups()[0]
  80. if self.comments_email_address is not None:
  81. self._current_application.comment_url = self.comments_email_address
  82. else:
  83. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  84. def handle_data(self, data):
  85. # If we are in the tr which contains the case number,
  86. # then data is the council reference, so
  87. # add it to self._current_application.
  88. if self._in_td:
  89. if self._tr_number == self.case_number_tr:
  90. self._current_application.council_reference = data.strip()
  91. elif self._tr_number == self.reg_date_tr:
  92. # we need to make a date object out of data
  93. date_as_str = ''.join(data.strip().split())
  94. received_date = date(*strptime(date_as_str, date_format)[0:3])
  95. #print received_date
  96. self._current_application.date_received = received_date
  97. elif self._tr_number == self.location_tr:
  98. location = data.strip()
  99. self._current_application.address = location
  100. self._current_application.postcode = getPostcodeFromText(location)
  101. elif self._tr_number == self.proposal_tr:
  102. self._current_application.description = data.strip()
  103. def handle_endtag(self, tag):
  104. #print "ending: ", tag
  105. if tag == "table" and self._current_application is not None:
  106. if self._subtable_depth > 0:
  107. self._subtable_depth -= 1
  108. else:
  109. # We need to add the last application in the table
  110. if self._current_application is not None:
  111. #print "adding application"
  112. self._results.addApplication(self._current_application)
  113. #print self._current_application
  114. self._current_application = None
  115. self._tr_number = None
  116. self._subtable_depth = None
  117. elif tag == "td":
  118. self._in_td = False
  119. def getResultsByDayMonthYear(self, day, month, year):
  120. # first we fetch the search page to get ourselves some session info...
  121. search_form_response = urllib2.urlopen(self.base_url)
  122. search_form_contents = search_form_response.read()
  123. outfile = open("tmpfile", "w")
  124. outfile.write(search_form_contents)
  125. # This sometimes causes a problem in HTMLParser, so let's just get the link
  126. # out with a regex...
  127. groups = self.action_regex.search(search_form_contents).groups()
  128. action = groups[0]
  129. #print action
  130. action_url = urlparse.urljoin(self.base_url, action)
  131. #print action_url
  132. our_date = date(year, month, day)
  133. search_data = {"regdate1": our_date.strftime(date_format),
  134. "regdate2": our_date.strftime(date_format),
  135. }
  136. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  137. response = opener.open(action_url, search_data)
  138. results_html = response.read()
  139. # This is for doing site specific html cleanup
  140. results_html = self._cleanupHTML(results_html)
  141. #some javascript garbage in the header upsets HTMLParser,
  142. #so we'll just have the body
  143. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  144. #outfile = open(self.authority_short_name + ".debug", "w")
  145. #outfile.write(just_body)
  146. self.feed(just_body)
  147. return self._results
  148. def getResults(self, day, month, year):
  149. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  150. class BaberghParser(AcolnetParser):
  151. case_number_tr = 1 # this one can be got by the td class attribute
  152. reg_date_tr = 2
  153. location_tr = 4
  154. proposal_tr = 5
  155. # It would be nice to scrape this...
  156. comments_email_address = "planning.reception@babergh.gov.uk"
  157. class BasingstokeParser(AcolnetParser):
  158. case_number_tr = 1 # this one can be got by the td class attribute
  159. reg_date_tr = 3
  160. location_tr = 6
  161. proposal_tr = 8
  162. # It would be nice to scrape this...
  163. comments_email_address = "development.control@basingstoke.gov.uk"
  164. class BassetlawParser(AcolnetParser):
  165. case_number_tr = 1 # this one can be got by the td class attribute
  166. reg_date_tr = 2
  167. location_tr = 5
  168. proposal_tr = 6
  169. comments_email_address = "planning@bassetlaw.gov.uk"
  170. def _cleanupHTML(self, html):
  171. """There is a broken div in this page. We don't need any divs, so
  172. let's get rid of them all."""
  173. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  174. return div_regex.sub('', html)
  175. class BridgnorthParser(AcolnetParser):
  176. # This site is currently down...
  177. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  178. #authority_name = "Bridgenorth District Council"
  179. #authority_short_name = "Bridgenorth"
  180. case_number_tr = 1 # this one can be got by the td class attribute
  181. reg_date_tr = 2
  182. location_tr = 4
  183. proposal_tr = 5
  184. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  185. class BuryParser(AcolnetParser):
  186. case_number_tr = 1 # this one can be got by the td class attribute
  187. reg_date_tr = 2
  188. location_tr = 4
  189. proposal_tr = 5
  190. comments_email_address = "development.control@bury.gov.uk"
  191. ## class CanterburyParser(AcolnetParser):
  192. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  193. ## case_number_tr = 1 # this one can be got by the td class attribute
  194. ## reg_date_tr = 2
  195. ## location_tr = 4
  196. ## proposal_tr = 5
  197. ## authority_name = "Canterbury City Council"
  198. ## authority_short_name = "Canterbury"
  199. ## comments_email_address = ""
  200. class CarlisleParser(AcolnetParser):
  201. case_number_tr = 1 # this one can be got by the td class attribute
  202. reg_date_tr = 2
  203. location_tr = 5
  204. proposal_tr = 6
  205. comments_email_address = "dc@carlisle.gov.uk"
  206. class DerbyParser(AcolnetParser):
  207. case_number_tr = 1 # this one can be got by the td class attribute
  208. reg_date_tr = 3
  209. location_tr = 4
  210. proposal_tr = 5
  211. comments_email_address = "developmentcontrol@derby.gov.uk"
  212. class CroydonParser(AcolnetParser):
  213. case_number_tr = 1 # this one can be got by the td class attribute
  214. reg_date_tr = 3
  215. location_tr = 5
  216. proposal_tr = 6
  217. comments_email_address = "planning.control@croydon.gov.uk"
  218. class EastLindseyParser(AcolnetParser):
  219. case_number_tr = 1 # this one can be got by the td class attribute
  220. reg_date_tr = 3
  221. location_tr = 5
  222. proposal_tr = 6
  223. comments_email_address = "development.control@e-lindsey.gov.uk"
  224. class FyldeParser(AcolnetParser):
  225. case_number_tr = 1 # this one can be got by the td class attribute
  226. reg_date_tr = 2
  227. location_tr = 4
  228. proposal_tr = 5
  229. comments_email_address = "planning@fylde.gov.uk"
  230. class HarlowParser(AcolnetParser):
  231. case_number_tr = 1 # this one can be got by the td class attribute
  232. reg_date_tr = 2
  233. location_tr = 4
  234. proposal_tr = 5
  235. comments_email_address = "Planning.services@harlow.gov.uk"
  236. class HavantParser(AcolnetParser):
  237. case_number_tr = 1 # this one can be got by the td class attribute
  238. reg_date_tr = 2
  239. location_tr = 4
  240. proposal_tr = 5
  241. comments_email_address = "representations@havant.gov.uk"
  242. class HertsmereParser(AcolnetParser):
  243. case_number_tr = 1 # this one can be got by the td class attribute
  244. reg_date_tr = 2
  245. location_tr = 4
  246. proposal_tr = 5
  247. comments_email_address = "planning@hertsmere.gov.uk"
  248. class LewishamParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 2
  251. location_tr = 4
  252. proposal_tr = 5
  253. comments_email_address = "planning@lewisham.gov.uk"
  254. ## class NorthHertfordshireParser(AcolnetParser):
  255. ## case_number_tr = 1 # this one can be got by the td class attribute
  256. ## reg_date_tr = 2
  257. ## location_tr = 4
  258. ## proposal_tr = 5
  259. ## comments_email_address = "planning@lewisham.gov.uk"
  260. ## class MidSuffolkParser(AcolnetParser):
  261. ## case_number_tr = 1 # this one can be got by the td class attribute
  262. ## reg_date_tr = 2
  263. ## location_tr = 4
  264. ## proposal_tr = 5
  265. ## comments_email_address = "planning@lewisham.gov.uk"
  266. ## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  267. class NewForestParser(AcolnetParser):
  268. # In this case there is an online comment facility at the
  269. # bottom of each view app page...
  270. case_number_tr = 1 # this one can be got by the td class attribute
  271. reg_date_tr = 2
  272. location_tr = 4
  273. proposal_tr = 5
  274. class NorthWiltshireParser(AcolnetParser):
  275. case_number_tr = 1 # this one can be got by the td class attribute
  276. reg_date_tr = 3
  277. location_tr = 6
  278. proposal_tr = 7
  279. class OldhamParser(AcolnetParser):
  280. case_number_tr = 1 # this one can be got by the td class attribute
  281. reg_date_tr = 3
  282. location_tr = 6
  283. proposal_tr = 7
  284. def _cleanupHTML(self, html):
  285. """There is a bad table end tag in this one.
  286. Fix it before we start"""
  287. bad_table_end = '</table summary="Copyright">'
  288. good_table_end = '</table>'
  289. return html.replace(bad_table_end, good_table_end)
  290. class RenfrewshireParser(AcolnetParser):
  291. case_number_tr = 1 # this one can be got by the td class attribute
  292. reg_date_tr = 2
  293. location_tr = 4
  294. proposal_tr = 5
  295. comments_email_address = "pt@renfrewshire.gov.uk"
  296. class SouthBedfordshireParser(AcolnetParser):
  297. case_number_tr = 1 # this one can be got by the td class attribute
  298. reg_date_tr = 3
  299. location_tr = 5
  300. proposal_tr = 6
  301. class SuffolkCoastalParser(AcolnetParser):
  302. case_number_tr = 1 # this one can be got by the td class attribute
  303. reg_date_tr = 2
  304. location_tr = 4
  305. proposal_tr = 5
  306. comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
  307. class SurreyHeathParser(AcolnetParser):
  308. case_number_tr = 1 # this one can be got by the td class attribute
  309. reg_date_tr = 2
  310. location_tr = 4
  311. proposal_tr = 5
  312. comments_email_address = "development-control@surreyheath.gov.uk"
  313. if __name__ == '__main__':
  314. day = 15
  315. month = 3
  316. year = 2007
  317. # returns error 400 - bad request
  318. #parser = BridgenorthParser()
  319. # cambridgeshire is a bit different...
  320. # no advanced search page
  321. # canterbury
  322. # results as columns of one table
  323. parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  324. print parser.getResults(day, month, year)