Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

AcolnetParser.py 17 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  6. import cookielib
  7. cookie_jar = cookielib.CookieJar()
  8. ################
  9. import urlparse
  10. import re
  11. end_head_regex = re.compile("</head", re.IGNORECASE)
  12. import MultipartPostHandler
  13. # this is not mine, or part of standard python (though it should be!)
  14. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  15. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  16. from datetime import date
  17. from time import strptime
  18. date_format = "%d/%m/%Y"
  19. our_date = date(2007,4,25)
  20. #This is to get the system key out of the info url
  21. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  22. class AcolnetParser(HTMLParser.HTMLParser):
  23. case_number_tr = None # this one can be got by the td class attribute
  24. reg_date_tr = None
  25. location_tr = None
  26. proposal_tr = None
  27. # There is no online comment facility in these, so we provide an
  28. # appropriate email address instead
  29. comments_email_address = None
  30. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  31. def __init__(self,
  32. authority_name,
  33. authority_short_name,
  34. base_url,
  35. debug=False):
  36. HTMLParser.HTMLParser.__init__(self)
  37. self.authority_name = authority_name
  38. self.authority_short_name = authority_short_name
  39. self.base_url = base_url
  40. self.debug = debug
  41. self._tr_number = 0
  42. # This will be used to track the subtable depth
  43. # when we are in a results-table, in order to
  44. # avoid adding an application before we have got to
  45. # the end of the results-table
  46. self._subtable_depth = None
  47. self._in_td = False
  48. # This in where we store the results
  49. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  50. # This will store the planning application we are currently working on.
  51. self._current_application = None
  52. def _cleanupHTML(self, html):
  53. """This method should be overridden in subclasses to perform site specific
  54. HTML cleanup."""
  55. return html
  56. def handle_starttag(self, tag, attrs):
  57. #print tag, attrs
  58. if tag == "table":
  59. if self._current_application is None:
  60. # Each application is in a separate table with class "results-table"
  61. for key, value in attrs:
  62. if key == "class" and value == "results-table":
  63. #print "found results-table"
  64. self._current_application = PlanningApplication()
  65. self._tr_number = 0
  66. self._subtable_depth = 0
  67. self._current_application.comment_url = self.comments_email_address
  68. break
  69. else:
  70. # We are already in a results-table, and this is the start of a subtable,
  71. # so increment the subtable depth.
  72. self._subtable_depth += 1
  73. elif self._current_application is not None:
  74. if tag == "tr" and self._subtable_depth == 0:
  75. self._tr_number += 1
  76. if tag == "td":
  77. self._in_td = True
  78. if tag == "a" and self._tr_number == self.case_number_tr:
  79. # this is where we get the info link and the case number
  80. for key, value in attrs:
  81. if key == "href":
  82. self._current_application.info_url = value
  83. system_key = system_key_regex.search(value).groups()[0]
  84. if self.comments_email_address is not None:
  85. self._current_application.comment_url = self.comments_email_address
  86. else:
  87. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def _getSearchResponse(self):
  124. # It looks like we sometimes need to do some stuff to get around a
  125. # javascript redirect and cookies.
  126. search_form_request = urllib2.Request(self.base_url)
  127. search_form_response = urllib2.urlopen(search_form_request)
  128. return search_form_response
  129. def getResultsByDayMonthYear(self, day, month, year):
  130. # first we fetch the search page to get ourselves some session info...
  131. search_form_response = self._getSearchResponse()
  132. search_form_contents = search_form_response.read()
  133. #outfile = open("tmpfile", "w")
  134. #outfile.write(search_form_contents)
  135. # This sometimes causes a problem in HTMLParser, so let's just get the link
  136. # out with a regex...
  137. groups = self.action_regex.search(search_form_contents).groups()
  138. action = groups[0]
  139. #print action
  140. action_url = urlparse.urljoin(self.base_url, action)
  141. #print action_url
  142. our_date = date(year, month, day)
  143. search_data = {"regdate1": our_date.strftime(date_format),
  144. "regdate2": our_date.strftime(date_format),
  145. }
  146. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  147. response = opener.open(action_url, search_data)
  148. results_html = response.read()
  149. # This is for doing site specific html cleanup
  150. results_html = self._cleanupHTML(results_html)
  151. #some javascript garbage in the header upsets HTMLParser,
  152. #so we'll just have the body
  153. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  154. #outfile = open(self.authority_short_name + ".debug", "w")
  155. #outfile.write(just_body)
  156. self.feed(just_body)
  157. return self._results
  158. def getResults(self, day, month, year):
  159. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
## # Babergh up to 21/06/2007
## class BaberghParser(AcolnetParser):
##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5
##     # It would be nice to scrape this...
##     comments_email_address = "planning.reception@babergh.gov.uk"
# Site changes to here from 22/06/2007
  169. class BaberghParser(AcolnetParser):
  170. case_number_tr = 1 # this one can be got by the td class attribute
  171. reg_date_tr = 3
  172. location_tr = 5
  173. proposal_tr = 6
  174. # It would be nice to scrape this...
  175. comments_email_address = "planning.reception@babergh.gov.uk"
  176. class BasingstokeParser(AcolnetParser):
  177. case_number_tr = 1 # this one can be got by the td class attribute
  178. reg_date_tr = 3
  179. location_tr = 6
  180. proposal_tr = 8
  181. # It would be nice to scrape this...
  182. comments_email_address = "development.control@basingstoke.gov.uk"
  183. class BassetlawParser(AcolnetParser):
  184. case_number_tr = 1 # this one can be got by the td class attribute
  185. reg_date_tr = 2
  186. location_tr = 4
  187. proposal_tr = 5
  188. comments_email_address = "planning@bassetlaw.gov.uk"
  189. def _cleanupHTML(self, html):
  190. """There is a broken div in this page. We don't need any divs, so
  191. let's get rid of them all."""
  192. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  193. return div_regex.sub('', html)
  194. class BridgnorthParser(AcolnetParser):
  195. # This site is currently down...
  196. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  197. #authority_name = "Bridgenorth District Council"
  198. #authority_short_name = "Bridgenorth"
  199. case_number_tr = 1 # this one can be got by the td class attribute
  200. reg_date_tr = 2
  201. location_tr = 4
  202. proposal_tr = 5
  203. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  204. class BuryParser(AcolnetParser):
  205. case_number_tr = 1 # this one can be got by the td class attribute
  206. reg_date_tr = 3
  207. location_tr = 5
  208. proposal_tr = 6
  209. #comments_email_address = "development.control@bury.gov.uk"
## class CanterburyParser(AcolnetParser):
##     search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5
##     authority_name = "Canterbury City Council"
##     authority_short_name = "Canterbury"
##     comments_email_address = ""
  219. class CarlisleParser(AcolnetParser):
  220. case_number_tr = 1 # this one can be got by the td class attribute
  221. reg_date_tr = 2
  222. location_tr = 5
  223. proposal_tr = 6
  224. comments_email_address = "dc@carlisle.gov.uk"
  225. class DerbyParser(AcolnetParser):
  226. case_number_tr = 1 # this one can be got by the td class attribute
  227. reg_date_tr = 3
  228. location_tr = 4
  229. proposal_tr = 5
  230. comments_email_address = "developmentcontrol@derby.gov.uk"
  231. class CroydonParser(AcolnetParser):
  232. case_number_tr = 1 # this one can be got by the td class attribute
  233. reg_date_tr = 3
  234. location_tr = 5
  235. proposal_tr = 6
  236. comments_email_address = "planning.control@croydon.gov.uk"
  237. class EastLindseyParser(AcolnetParser):
  238. case_number_tr = 1 # this one can be got by the td class attribute
  239. reg_date_tr = 3
  240. location_tr = 5
  241. proposal_tr = 6
  242. comments_email_address = "development.control@e-lindsey.gov.uk"
  243. class FyldeParser(AcolnetParser):
  244. case_number_tr = 1 # this one can be got by the td class attribute
  245. reg_date_tr = 2
  246. location_tr = 4
  247. proposal_tr = 5
  248. comments_email_address = "planning@fylde.gov.uk"
  249. class HarlowParser(AcolnetParser):
  250. case_number_tr = 1 # this one can be got by the td class attribute
  251. reg_date_tr = 2
  252. location_tr = 4
  253. proposal_tr = 5
  254. comments_email_address = "Planning.services@harlow.gov.uk"
  255. class HavantParser(AcolnetParser):
  256. case_number_tr = 1 # this one can be got by the td class attribute
  257. reg_date_tr = 3
  258. location_tr = 6
  259. proposal_tr = 8
  260. comments_email_address = "representations@havant.gov.uk"
  261. class HertsmereParser(AcolnetParser):
  262. case_number_tr = 1 # this one can be got by the td class attribute
  263. reg_date_tr = 2
  264. location_tr = 4
  265. proposal_tr = 5
  266. comments_email_address = "planning@hertsmere.gov.uk"
  267. class LewishamParser(AcolnetParser):
  268. case_number_tr = 1 # this one can be got by the td class attribute
  269. reg_date_tr = 2
  270. location_tr = 4
  271. proposal_tr = 5
  272. comments_email_address = "planning@lewisham.gov.uk"
  273. class NorthHertfordshireParser(AcolnetParser):
  274. case_number_tr = 1 # this one can be got by the td class attribute
  275. reg_date_tr = 2
  276. location_tr = 4
  277. proposal_tr = 5
## class MidSuffolkParser(AcolnetParser):
##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5
##     comments_email_address = "planning@lewisham.gov.uk"
##     #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  285. class NewForestNPParser(AcolnetParser):
  286. # In this case there is an online comment facility at the
  287. # bottom of each view app page...
  288. case_number_tr = 1 # this one can be got by the td class attribute
  289. reg_date_tr = 2
  290. location_tr = 4
  291. proposal_tr = 5
  292. class NewForestDCParser(AcolnetParser):
  293. # In this case there is an online comment facility at the
  294. # bottom of each view app page...
  295. case_number_tr = 1 # this one can be got by the td class attribute
  296. reg_date_tr = 3
  297. location_tr = 6
  298. proposal_tr = 7
  299. class NorthWiltshireParser(AcolnetParser):
  300. case_number_tr = 1 # this one can be got by the td class attribute
  301. reg_date_tr = 3
  302. location_tr = 6
  303. proposal_tr = 7
  304. class OldhamParser(AcolnetParser):
  305. case_number_tr = 1 # this one can be got by the td class attribute
  306. reg_date_tr = 3
  307. location_tr = 6
  308. proposal_tr = 7
  309. def _cleanupHTML(self, html):
  310. """There is a bad table end tag in this one.
  311. Fix it before we start"""
  312. bad_table_end = '</table summary="Copyright">'
  313. good_table_end = '</table>'
  314. return html.replace(bad_table_end, good_table_end)
  315. class RenfrewshireParser(AcolnetParser):
  316. case_number_tr = 1 # this one can be got by the td class attribute
  317. reg_date_tr = 2
  318. location_tr = 4
  319. proposal_tr = 5
  320. comments_email_address = "pt@renfrewshire.gov.uk"
  321. class SouthBedfordshireParser(AcolnetParser):
  322. case_number_tr = 1 # this one can be got by the td class attribute
  323. reg_date_tr = 3
  324. location_tr = 5
  325. proposal_tr = 6
  326. class SuffolkCoastalParser(AcolnetParser):
  327. case_number_tr = 1 # this one can be got by the td class attribute
  328. reg_date_tr = 2
  329. location_tr = 4
  330. proposal_tr = 5
  331. comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
  332. class GuildfordParser(AcolnetParser):
  333. case_number_tr = 1
  334. reg_date_tr = 7
  335. location_tr = 2
  336. proposal_tr = 3
  337. #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch
  338. class SurreyHeathParser(AcolnetParser):
  339. # This is not working yet.
  340. # _getSearchResponse is an attempt to work around
  341. # cookies and a javascript redirect.
  342. # I may have a bit more of a go at this at some point if I have time.
  343. case_number_tr = 1 # this one can be got by the td class attribute
  344. reg_date_tr = 2
  345. location_tr = 4
  346. proposal_tr = 5
  347. comments_email_address = "development-control@surreyheath.gov.uk"
  348. def _getSearchResponse(self):
  349. # It looks like we sometimes need to do some stuff to get around a
  350. # javascript redirect and cookies.
  351. search_form_request = urllib2.Request(self.base_url)
  352. # Lying about the user-agent doesn't seem to help.
  353. #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
  354. search_form_response = urllib2.urlopen(search_form_request)
  355. cookie_jar.extract_cookies(search_form_response, search_form_request)
  356. print search_form_response.geturl()
  357. print search_form_response.info()
  358. print search_form_response.read()
  359. # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
  360. # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
  361. # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
  362. # javascript_redirect_request.add_header('Referer', validate_url)
  363. # cookie_jar.add_cookie_header(javascript_redirect_request)
  364. # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
  365. # return javascript_redirect_response
  366. if __name__ == '__main__':
  367. day = 22
  368. month = 2
  369. year = 2005
  370. # returns error 400 - bad request
  371. #parser = BridgenorthParser()
  372. # cambridgeshire is a bit different...
  373. # no advanced search page
  374. # canterbury
  375. # results as columns of one table
  376. #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  377. parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch")
  378. print parser.getResults(day, month, year)