Automatically exported from code.google.com/p/planningalerts
You can select up to 25 topics. Each topic must start with a letter or number, may contain letters, numbers and dashes ('-'), and must be 35 characters or fewer.

AcolnetParser_HTMLParser.py 18 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
#!/usr/local/bin/python
import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup
# Adding this to try to help Surrey Heath - Duncan 14/9/2007
import cookielib
# Shared cookie jar; used by SurreyHeathParser._getSearchResponse below.
cookie_jar = cookielib.CookieJar()
################
import urlparse
import re
# We allow the optional > for Bridgenorth, which doesn't have broken html
end_head_regex = re.compile("</head>?", re.IGNORECASE)
import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
from datetime import date
from time import strptime
# Date format used both when parsing registration dates out of the results
# pages and when building the search POST data.
date_format = "%d/%m/%Y"
# NOTE(review): this module-level date appears unused — getResultsByDayMonthYear
# builds its own local 'our_date'. Left in place to preserve behaviour.
our_date = date(2007,4,25)
# This is to get the system key out of the info url
system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
class AcolnetParser(HTMLParser.HTMLParser):
    """Base scraper for councils running the "Acolnet" planning system.

    Each application on a results page sits in its own <table> with
    class "results-table".  Subclasses say which 1-based <tr> of that
    table carries which field by overriding the *_tr class attributes,
    and the HTMLParser callbacks below collect the data as the results
    HTML is fed through self.feed().
    """
    # Row numbers (1-based, within each results-table) for each field.
    # None here; every concrete subclass overrides them.
    case_number_tr = None # this one can be got by the td class attribute
    reg_date_tr = None
    location_tr = None
    proposal_tr = None

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    # The optional amp; is to cope with Oldham, which seems to have started
    # quoting this url.
    action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        """Set up parser state for one authority.

        authority_name       -- full council name (used in the results object)
        authority_short_name -- short identifier for the council
        base_url             -- URL of the Acolnet search form page
        debug                -- stored but not otherwise read in this class
        """
        HTMLParser.HTMLParser.__init__(self)
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug
        # 1-based index of the current top-level <tr> in the current
        # results-table; compared against the *_tr attributes above.
        self._tr_number = 0
        # This will be used to track the subtable depth
        # when we are in a results-table, in order to
        # avoid adding an application before we have got to
        # the end of the results-table
        self._subtable_depth = None
        self._in_td = False
        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def handle_starttag(self, tag, attrs):
        """HTMLParser callback: open a new application on each
        results-table, track subtable depth / row number / td state,
        and pick the info url off the case-number link."""
        #print tag, attrs
        if tag == "table":
            if self._current_application is None:
                # Each application is in a separate table with class "results-table"
                for key, value in attrs:
                    if key == "class" and value == "results-table":
                        #print "found results-table"
                        self._current_application = PlanningApplication()
                        self._tr_number = 0
                        self._subtable_depth = 0
                        self._current_application.comment_url = self.comments_email_address
                        break
            else:
                # We are already in a results-table, and this is the start of a subtable,
                # so increment the subtable depth.
                self._subtable_depth += 1
        elif self._current_application is not None:
            if tag == "tr" and self._subtable_depth == 0:
                self._tr_number += 1
            if tag == "td":
                self._in_td = True
            if tag == "a" and self._tr_number == self.case_number_tr:
                # this is where we get the info link and the case number
                for key, value in attrs:
                    if key == "href":
                        self._current_application.info_url = value
                        # NOTE(review): system_key is extracted but never
                        # used afterwards.
                        system_key = system_key_regex.search(value).groups()[0]
                        if self.comments_email_address is not None:
                            self._current_application.comment_url = self.comments_email_address
                        else:
                            self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")

    def handle_data(self, data):
        """HTMLParser callback: when inside a <td>, store the text on the
        current application according to which row we are in."""
        # If we are in the tr which contains the case number,
        # then data is the council reference, so
        # add it to self._current_application.
        if self._in_td:
            if self._tr_number == self.case_number_tr:
                self._current_application.council_reference = data.strip()
            elif self._tr_number == self.reg_date_tr:
                # we need to make a date object out of data
                date_as_str = ''.join(data.strip().split())
                received_date = date(*strptime(date_as_str, date_format)[0:3])
                #print received_date
                self._current_application.date_received = received_date
            elif self._tr_number == self.location_tr:
                location = data.strip()
                self._current_application.address = location
                self._current_application.postcode = getPostcodeFromText(location)
            elif self._tr_number == self.proposal_tr:
                self._current_application.description = data.strip()

    def handle_endtag(self, tag):
        """HTMLParser callback: on the closing tag of a results-table,
        commit the finished application; otherwise unwind subtable depth
        and td state."""
        #print "ending: ", tag
        if tag == "table" and self._current_application is not None:
            if self._subtable_depth > 0:
                self._subtable_depth -= 1
            else:
                # We need to add the last application in the table
                if self._current_application is not None:
                    #print "adding application"
                    self._results.addApplication(self._current_application)
                    #print self._current_application
                    self._current_application = None
                    self._tr_number = None
                    self._subtable_depth = None
        elif tag == "td":
            self._in_td = False

    def _getSearchResponse(self):
        """Fetch the search form page; overridden by sites (e.g. Surrey
        Heath) that need cookie/redirect workarounds."""
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        search_form_response = urllib2.urlopen(search_form_request)
        return search_form_response

    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch, POST and parse all applications registered on the given
        date; returns the accumulated PlanningAuthorityResults."""
        # first we fetch the search page to get ourselves some session info...
        search_form_response = self._getSearchResponse()
        search_form_contents = search_form_response.read()
        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action
        # This is to handle the amp; which seems to have appeared in this
        # url on the Oldham site
        action = ''.join(action.split('amp;'))
        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url
        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }
        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()
        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)
        #some javascript garbage in the header upsets HTMLParser,
        #so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]
        #outfile = open(self.authority_short_name + ".debug", "w")
        #outfile.write(just_body)
        #print just_body
        self.feed(just_body)
        return self._results

    def getResults(self, day, month, year):
        """String-argument convenience wrapper returning the results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  165. ## # Babergh up to 21/06/2007
  166. ## class BaberghParser(AcolnetParser):
  167. ## case_number_tr = 1 # this one can be got by the td class attribute
  168. ## reg_date_tr = 2
  169. ## location_tr = 4
  170. ## proposal_tr = 5
  171. ## # It would be nice to scrape this...
  172. ## comments_email_address = "planning.reception@babergh.gov.uk"
  173. # Site changes to here from 22/06/2007
class BaberghParser(AcolnetParser):
    """Row layout for the Babergh Acolnet site (changed 22/06/2007)."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    # It would be nice to scrape this...
    comments_email_address = "planning.reception@babergh.gov.uk"
class BasingstokeParser(AcolnetParser):
    """Row layout for the Basingstoke Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8
    # It would be nice to scrape this...
    comments_email_address = "development.control@basingstoke.gov.uk"
  188. class BassetlawParser(AcolnetParser):
  189. case_number_tr = 1 # this one can be got by the td class attribute
  190. reg_date_tr = 2
  191. location_tr = 4
  192. proposal_tr = 5
  193. comments_email_address = "planning@bassetlaw.gov.uk"
  194. def _cleanupHTML(self, html):
  195. """There is a broken div in this page. We don't need any divs, so
  196. let's get rid of them all."""
  197. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  198. return div_regex.sub('', html)
class BridgnorthParser(AcolnetParser):
    """Row layout for the Bridgnorth Acolnet site."""
    # This site is currently down...
    #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
    #authority_name = "Bridgenorth District Council"
    #authority_short_name = "Bridgenorth"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "contactus@bridgnorth-dc.gov.uk"
class BuryParser(AcolnetParser):
    """Row layout for the Bury Acolnet site (no comment address set, so
    the comment url is derived from the info url by the base class)."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    #comments_email_address = "development.control@bury.gov.uk"
  215. ## class CanterburyParser(AcolnetParser):
  216. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  217. ## case_number_tr = 1 # this one can be got by the td class attribute
  218. ## reg_date_tr = 2
  219. ## location_tr = 4
  220. ## proposal_tr = 5
  221. ## authority_name = "Canterbury City Council"
  222. ## authority_short_name = "Canterbury"
  223. ## comments_email_address = ""
class CarlisleParser(AcolnetParser):
    """Row layout for the Carlisle Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 5
    proposal_tr = 6
    comments_email_address = "dc@carlisle.gov.uk"
class DerbyParser(AcolnetParser):
    """Row layout for the Derby Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "developmentcontrol@derby.gov.uk"
class CroydonParser(AcolnetParser):
    """Row layout for the Croydon Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    comments_email_address = "planning.control@croydon.gov.uk"
class EastLindseyParser(AcolnetParser):
    """Row layout for the East Lindsey Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    comments_email_address = "development.control@e-lindsey.gov.uk"
class FyldeParser(AcolnetParser):
    """Row layout for the Fylde Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "planning@fylde.gov.uk"
class HarlowParser(AcolnetParser):
    """Row layout for the Harlow Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "Planning.services@harlow.gov.uk"
class HavantParser(AcolnetParser):
    """Row layout for the Havant Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 8
    comments_email_address = "representations@havant.gov.uk"
class HertsmereParser(AcolnetParser):
    """Row layout for the Hertsmere Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "planning@hertsmere.gov.uk"
class LewishamParser(AcolnetParser):
    """Row layout for the Lewisham Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "planning@lewisham.gov.uk"
class NorthHertfordshireParser(AcolnetParser):
    """Row layout for the North Hertfordshire Acolnet site (comment url
    is derived from the info url by the base class)."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
  283. ## class MidSuffolkParser(AcolnetParser):
  284. ## case_number_tr = 1 # this one can be got by the td class attribute
  285. ## reg_date_tr = 2
  286. ## location_tr = 4
  287. ## proposal_tr = 5
  288. ## comments_email_address = "planning@lewisham.gov.uk"
  289. ## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
class NewForestNPParser(AcolnetParser):
    """Row layout for the New Forest National Park Acolnet site."""
    # In this case there is an online comment facility at the
    # bottom of each view app page...
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
class NewForestDCParser(AcolnetParser):
    """Row layout for the New Forest District Council Acolnet site."""
    # In this case there is an online comment facility at the
    # bottom of each view app page...
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 7
class NorthWiltshireParser(AcolnetParser):
    """Row layout for the North Wiltshire Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 6
    proposal_tr = 7
  309. class OldhamParser(AcolnetParser):
  310. case_number_tr = 1 # this one can be got by the td class attribute
  311. reg_date_tr = 3
  312. location_tr = 6
  313. proposal_tr = 7
  314. def _cleanupHTML(self, html):
  315. """There is a bad table end tag in this one.
  316. Fix it before we start"""
  317. bad_table_end = '</table summary="Copyright">'
  318. good_table_end = '</table>'
  319. return html.replace(bad_table_end, good_table_end)
class RenfrewshireParser(AcolnetParser):
    """Row layout for the Renfrewshire Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "pt@renfrewshire.gov.uk"
class SouthBedfordshireParser(AcolnetParser):
    """Row layout for the South Bedfordshire Acolnet site."""
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
class SuffolkCoastalParser(AcolnetParser):
    """Row layout for the Suffolk Coastal Acolnet site (updated after a
    site redesign)."""
    # case_number_tr = 1 # this one can be got by the td class attribute
    # reg_date_tr = 2
    # location_tr = 4
    # proposal_tr = 5
    # New URL with different layout
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
class GuildfordParser(AcolnetParser):
    """Row layout for the Guildford Acolnet site. Note the unusual order:
    the registration date row comes after location and proposal."""
    case_number_tr = 1
    reg_date_tr = 7
    location_tr = 2
    proposal_tr = 3
    #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch
class BoltonParser(AcolnetParser):
    """Row layout for the Bolton Acolnet site."""
    case_number_tr = 1
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "Planning.control@bolton.gov.uk"
class ExeterParser(AcolnetParser):
    """Row layout for the Exeter Acolnet site."""
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
class SurreyHeathParser(AcolnetParser):
    """Experimental scraper for Surrey Heath's Acolnet site.

    NOTE(review): explicitly not working yet — _getSearchResponse below
    is a work-in-progress attempt to get past the site's cookie handling
    and javascript redirect, and currently just prints the first
    response for debugging.
    """
    # This is not working yet.
    # _getSearchResponse is an attempt to work around
    # cookies and a javascript redirect.
    # I may have a bit more of a go at this at some point if I have time.
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "development-control@surreyheath.gov.uk"

    def _getSearchResponse(self):
        """Fetch the search page, capturing cookies into the module-level
        cookie_jar and dumping the response for debugging."""
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        # Lying about the user-agent doesn't seem to help.
        #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)
        print search_form_response.geturl()
        print search_form_response.info()
        print search_form_response.read()
        # NOTE(review): no return statement — callers receive None, which
        # is why this parser does not work yet.
        # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
        # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
        # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
        # javascript_redirect_request.add_header('Referer', validate_url)
        # cookie_jar.add_cookie_header(javascript_redirect_request)
        # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
        # return javascript_redirect_response
if __name__ == '__main__':
    # Ad-hoc manual test: fetch one day's applications from one site and
    # print the XML. Performs live HTTP requests.
    day = 4
    month = 12
    year = 2007
    # returns error 400 - bad request
    #parser = BridgenorthParser()
    # cambridgeshire is a bit different...
    # no advanced search page
    # canterbury
    # results as columns of one table
    #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    # parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
    print parser.getResults(day, month, year)