Automatically exported from code.google.com/p/planningalerts
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

AcolnetParser.py 18 KiB

há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
há 17 anos
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
#!/usr/local/bin/python
# Scraper for councils running the Acolnet planning system (Python 2).
import urllib, urllib2
import HTMLParser
#from BeautifulSoup import BeautifulSoup
# Adding this to try to help Surrey Heath - Duncan 14/9/2007
import cookielib
# Shared cookie jar, used by SurreyHeathParser._getSearchResponse below.
cookie_jar = cookielib.CookieJar()
################
import urlparse
import re
# Splits a results page at the closing head tag so the (javascript-laden)
# header can be discarded before feeding HTMLParser.
end_head_regex = re.compile("</head", re.IGNORECASE)
import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
from datetime import date
from time import strptime
# Registration-date format used by the Acolnet search forms (dd/mm/yyyy).
date_format = "%d/%m/%Y"
# NOTE(review): this module-level date appears unused -- the name is shadowed
# by a local in getResultsByDayMonthYear. Presumably left over from testing.
our_date = date(2007,4,25)
#This is to get the system key out of the info url
system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  22. class AcolnetParser(HTMLParser.HTMLParser):
  23. case_number_tr = None # this one can be got by the td class attribute
  24. reg_date_tr = None
  25. location_tr = None
  26. proposal_tr = None
  27. # There is no online comment facility in these, so we provide an
  28. # appropriate email address instead
  29. comments_email_address = None
  30. # The optional amp; is to cope with Oldham, which seems to have started
  31. # quoting this url.
  32. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  33. def __init__(self,
  34. authority_name,
  35. authority_short_name,
  36. base_url,
  37. debug=False):
  38. HTMLParser.HTMLParser.__init__(self)
  39. self.authority_name = authority_name
  40. self.authority_short_name = authority_short_name
  41. self.base_url = base_url
  42. self.debug = debug
  43. self._tr_number = 0
  44. # This will be used to track the subtable depth
  45. # when we are in a results-table, in order to
  46. # avoid adding an application before we have got to
  47. # the end of the results-table
  48. self._subtable_depth = None
  49. self._in_td = False
  50. # This in where we store the results
  51. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  52. # This will store the planning application we are currently working on.
  53. self._current_application = None
  54. def _cleanupHTML(self, html):
  55. """This method should be overridden in subclasses to perform site specific
  56. HTML cleanup."""
  57. return html
  58. def handle_starttag(self, tag, attrs):
  59. #print tag, attrs
  60. if tag == "table":
  61. if self._current_application is None:
  62. # Each application is in a separate table with class "results-table"
  63. for key, value in attrs:
  64. if key == "class" and value == "results-table":
  65. #print "found results-table"
  66. self._current_application = PlanningApplication()
  67. self._tr_number = 0
  68. self._subtable_depth = 0
  69. self._current_application.comment_url = self.comments_email_address
  70. break
  71. else:
  72. # We are already in a results-table, and this is the start of a subtable,
  73. # so increment the subtable depth.
  74. self._subtable_depth += 1
  75. elif self._current_application is not None:
  76. if tag == "tr" and self._subtable_depth == 0:
  77. self._tr_number += 1
  78. if tag == "td":
  79. self._in_td = True
  80. if tag == "a" and self._tr_number == self.case_number_tr:
  81. # this is where we get the info link and the case number
  82. for key, value in attrs:
  83. if key == "href":
  84. self._current_application.info_url = value
  85. system_key = system_key_regex.search(value).groups()[0]
  86. if self.comments_email_address is not None:
  87. self._current_application.comment_url = self.comments_email_address
  88. else:
  89. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  90. def handle_data(self, data):
  91. # If we are in the tr which contains the case number,
  92. # then data is the council reference, so
  93. # add it to self._current_application.
  94. if self._in_td:
  95. if self._tr_number == self.case_number_tr:
  96. self._current_application.council_reference = data.strip()
  97. elif self._tr_number == self.reg_date_tr:
  98. # we need to make a date object out of data
  99. date_as_str = ''.join(data.strip().split())
  100. received_date = date(*strptime(date_as_str, date_format)[0:3])
  101. #print received_date
  102. self._current_application.date_received = received_date
  103. elif self._tr_number == self.location_tr:
  104. location = data.strip()
  105. self._current_application.address = location
  106. self._current_application.postcode = getPostcodeFromText(location)
  107. elif self._tr_number == self.proposal_tr:
  108. self._current_application.description = data.strip()
  109. def handle_endtag(self, tag):
  110. #print "ending: ", tag
  111. if tag == "table" and self._current_application is not None:
  112. if self._subtable_depth > 0:
  113. self._subtable_depth -= 1
  114. else:
  115. # We need to add the last application in the table
  116. if self._current_application is not None:
  117. #print "adding application"
  118. self._results.addApplication(self._current_application)
  119. #print self._current_application
  120. self._current_application = None
  121. self._tr_number = None
  122. self._subtable_depth = None
  123. elif tag == "td":
  124. self._in_td = False
  125. def _getSearchResponse(self):
  126. # It looks like we sometimes need to do some stuff to get around a
  127. # javascript redirect and cookies.
  128. search_form_request = urllib2.Request(self.base_url)
  129. search_form_response = urllib2.urlopen(search_form_request)
  130. return search_form_response
  131. def getResultsByDayMonthYear(self, day, month, year):
  132. # first we fetch the search page to get ourselves some session info...
  133. search_form_response = self._getSearchResponse()
  134. search_form_contents = search_form_response.read()
  135. # This sometimes causes a problem in HTMLParser, so let's just get the link
  136. # out with a regex...
  137. groups = self.action_regex.search(search_form_contents).groups()
  138. action = groups[0]
  139. #print action
  140. # This is to handle the amp; which seems to have appeared in this
  141. # url on the Oldham site
  142. action = ''.join(action.split('amp;'))
  143. action_url = urlparse.urljoin(self.base_url, action)
  144. print action_url
  145. our_date = date(year, month, day)
  146. search_data = {"regdate1": our_date.strftime(date_format),
  147. "regdate2": our_date.strftime(date_format),
  148. }
  149. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  150. response = opener.open(action_url, search_data)
  151. results_html = response.read()
  152. #outfile = open("tmpfile", "w")
  153. #outfile.write(results_html)
  154. # This is for doing site specific html cleanup
  155. results_html = self._cleanupHTML(results_html)
  156. #some javascript garbage in the header upsets HTMLParser,
  157. #so we'll just have the body
  158. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  159. #outfile = open(self.authority_short_name + ".debug", "w")
  160. #outfile.write(just_body)
  161. self.feed(just_body)
  162. return self._results
  163. def getResults(self, day, month, year):
  164. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  165. ## # Babergh up to 21/06/2007
  166. ## class BaberghParser(AcolnetParser):
  167. ## case_number_tr = 1 # this one can be got by the td class attribute
  168. ## reg_date_tr = 2
  169. ## location_tr = 4
  170. ## proposal_tr = 5
  171. ## # It would be nice to scrape this...
  172. ## comments_email_address = "planning.reception@babergh.gov.uk"
  173. # Site changes to here from 22/06/2007
  174. class BaberghParser(AcolnetParser):
  175. case_number_tr = 1 # this one can be got by the td class attribute
  176. reg_date_tr = 3
  177. location_tr = 5
  178. proposal_tr = 6
  179. # It would be nice to scrape this...
  180. comments_email_address = "planning.reception@babergh.gov.uk"
  181. class BasingstokeParser(AcolnetParser):
  182. case_number_tr = 1 # this one can be got by the td class attribute
  183. reg_date_tr = 3
  184. location_tr = 6
  185. proposal_tr = 8
  186. # It would be nice to scrape this...
  187. comments_email_address = "development.control@basingstoke.gov.uk"
  188. class BassetlawParser(AcolnetParser):
  189. case_number_tr = 1 # this one can be got by the td class attribute
  190. reg_date_tr = 2
  191. location_tr = 4
  192. proposal_tr = 5
  193. comments_email_address = "planning@bassetlaw.gov.uk"
  194. def _cleanupHTML(self, html):
  195. """There is a broken div in this page. We don't need any divs, so
  196. let's get rid of them all."""
  197. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  198. return div_regex.sub('', html)
class BridgnorthParser(AcolnetParser):
    """Bridgnorth District Council results-table layout."""
    # This site is currently down...
    #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
    #authority_name = "Bridgenorth District Council"
    #authority_short_name = "Bridgenorth"
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "contactus@bridgnorth-dc.gov.uk"
class BuryParser(AcolnetParser):
    """Bury Metropolitan Borough Council results-table layout.

    No comments_email_address is set, so the base class derives the
    comment url from the info url instead.
    """
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    #comments_email_address = "development.control@bury.gov.uk"
  215. ## class CanterburyParser(AcolnetParser):
  216. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  217. ## case_number_tr = 1 # this one can be got by the td class attribute
  218. ## reg_date_tr = 2
  219. ## location_tr = 4
  220. ## proposal_tr = 5
  221. ## authority_name = "Canterbury City Council"
  222. ## authority_short_name = "Canterbury"
  223. ## comments_email_address = ""
  224. class CarlisleParser(AcolnetParser):
  225. case_number_tr = 1 # this one can be got by the td class attribute
  226. reg_date_tr = 2
  227. location_tr = 5
  228. proposal_tr = 6
  229. comments_email_address = "dc@carlisle.gov.uk"
  230. class DerbyParser(AcolnetParser):
  231. case_number_tr = 1 # this one can be got by the td class attribute
  232. reg_date_tr = 3
  233. location_tr = 4
  234. proposal_tr = 5
  235. comments_email_address = "developmentcontrol@derby.gov.uk"
  236. class CroydonParser(AcolnetParser):
  237. case_number_tr = 1 # this one can be got by the td class attribute
  238. reg_date_tr = 3
  239. location_tr = 5
  240. proposal_tr = 6
  241. comments_email_address = "planning.control@croydon.gov.uk"
  242. class EastLindseyParser(AcolnetParser):
  243. case_number_tr = 1 # this one can be got by the td class attribute
  244. reg_date_tr = 3
  245. location_tr = 5
  246. proposal_tr = 6
  247. comments_email_address = "development.control@e-lindsey.gov.uk"
  248. class FyldeParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 2
  251. location_tr = 4
  252. proposal_tr = 5
  253. comments_email_address = "planning@fylde.gov.uk"
  254. class HarlowParser(AcolnetParser):
  255. case_number_tr = 1 # this one can be got by the td class attribute
  256. reg_date_tr = 2
  257. location_tr = 4
  258. proposal_tr = 5
  259. comments_email_address = "Planning.services@harlow.gov.uk"
  260. class HavantParser(AcolnetParser):
  261. case_number_tr = 1 # this one can be got by the td class attribute
  262. reg_date_tr = 3
  263. location_tr = 6
  264. proposal_tr = 8
  265. comments_email_address = "representations@havant.gov.uk"
  266. class HertsmereParser(AcolnetParser):
  267. case_number_tr = 1 # this one can be got by the td class attribute
  268. reg_date_tr = 2
  269. location_tr = 4
  270. proposal_tr = 5
  271. comments_email_address = "planning@hertsmere.gov.uk"
  272. class LewishamParser(AcolnetParser):
  273. case_number_tr = 1 # this one can be got by the td class attribute
  274. reg_date_tr = 2
  275. location_tr = 4
  276. proposal_tr = 5
  277. comments_email_address = "planning@lewisham.gov.uk"
  278. class NorthHertfordshireParser(AcolnetParser):
  279. case_number_tr = 1 # this one can be got by the td class attribute
  280. reg_date_tr = 2
  281. location_tr = 4
  282. proposal_tr = 5
  283. ## class MidSuffolkParser(AcolnetParser):
  284. ## case_number_tr = 1 # this one can be got by the td class attribute
  285. ## reg_date_tr = 2
  286. ## location_tr = 4
  287. ## proposal_tr = 5
  288. ## comments_email_address = "planning@lewisham.gov.uk"
  289. ## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  290. class NewForestNPParser(AcolnetParser):
  291. # In this case there is an online comment facility at the
  292. # bottom of each view app page...
  293. case_number_tr = 1 # this one can be got by the td class attribute
  294. reg_date_tr = 2
  295. location_tr = 4
  296. proposal_tr = 5
  297. class NewForestDCParser(AcolnetParser):
  298. # In this case there is an online comment facility at the
  299. # bottom of each view app page...
  300. case_number_tr = 1 # this one can be got by the td class attribute
  301. reg_date_tr = 3
  302. location_tr = 6
  303. proposal_tr = 7
  304. class NorthWiltshireParser(AcolnetParser):
  305. case_number_tr = 1 # this one can be got by the td class attribute
  306. reg_date_tr = 3
  307. location_tr = 6
  308. proposal_tr = 7
  309. class OldhamParser(AcolnetParser):
  310. case_number_tr = 1 # this one can be got by the td class attribute
  311. reg_date_tr = 3
  312. location_tr = 6
  313. proposal_tr = 7
  314. def _cleanupHTML(self, html):
  315. """There is a bad table end tag in this one.
  316. Fix it before we start"""
  317. bad_table_end = '</table summary="Copyright">'
  318. good_table_end = '</table>'
  319. return html.replace(bad_table_end, good_table_end)
  320. class RenfrewshireParser(AcolnetParser):
  321. case_number_tr = 1 # this one can be got by the td class attribute
  322. reg_date_tr = 2
  323. location_tr = 4
  324. proposal_tr = 5
  325. comments_email_address = "pt@renfrewshire.gov.uk"
  326. class SouthBedfordshireParser(AcolnetParser):
  327. case_number_tr = 1 # this one can be got by the td class attribute
  328. reg_date_tr = 3
  329. location_tr = 5
  330. proposal_tr = 6
class SuffolkCoastalParser(AcolnetParser):
    """Suffolk Coastal District Council results-table layout."""
    # Old layout, kept for reference:
    # case_number_tr = 1 # this one can be got by the td class attribute
    # reg_date_tr = 2
    # location_tr = 4
    # proposal_tr = 5
    # New URL with different layout
    case_number_tr = 1
    reg_date_tr = 3
    location_tr = 5
    proposal_tr = 6
    comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
  342. class GuildfordParser(AcolnetParser):
  343. case_number_tr = 1
  344. reg_date_tr = 7
  345. location_tr = 2
  346. proposal_tr = 3
  347. #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch
  348. class BoltonParser(AcolnetParser):
  349. case_number_tr = 1
  350. reg_date_tr = 2
  351. location_tr = 4
  352. proposal_tr = 5
  353. comments_email_address = "Planning.control@bolton.gov.uk"
  354. class ExeterParser(AcolnetParser):
  355. case_number_tr = 1
  356. reg_date_tr = 3
  357. location_tr = 5
  358. proposal_tr = 6
class SurreyHeathParser(AcolnetParser):
    # This is not working yet.
    # _getSearchResponse is an attempt to work around
    # cookies and a javascript redirect.
    # I may have a bit more of a go at this at some point if I have time.
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5
    comments_email_address = "development-control@surreyheath.gov.uk"

    def _getSearchResponse(self):
        """Work-in-progress override: fetch the search page while capturing
        cookies, trying to get past the site's cookie/javascript front door.

        NOTE(review): this currently falls off the end and implicitly
        returns None (the working return path is commented out below), so
        getResultsByDayMonthYear would fail -- consistent with the
        "not working yet" note above. The debug prints are deliberate.
        """
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        # Lying about the user-agent doesn't seem to help.
        #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)
        print search_form_response.geturl()
        print search_form_response.info()
        print search_form_response.read()
        # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
        # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
        # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
        # javascript_redirect_request.add_header('Referer', validate_url)
        # cookie_jar.add_cookie_header(javascript_redirect_request)
        # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
        # return javascript_redirect_response
# Ad-hoc manual test: scrape one council for a fixed date and print the XML.
if __name__ == '__main__':
    day = 20
    month = 11
    year = 2007
    # returns error 400 - bad request
    #parser = BridgenorthParser()
    # cambridgeshire is a bit different...
    # no advanced search page
    # canterbury
    # results as columns of one table
    #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
    print parser.getResults(day, month, year)