Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

AcolnetParser.py 17 KiB

17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
17 年之前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
# Adding this to try to help Surrey Heath - Duncan 14/9/2007
# A single module-level cookie jar.  SurreyHeathParser._getSearchResponse
# extracts the cookies from its search-form response into this jar.
import cookielib
cookie_jar = cookielib.CookieJar()
################
  9. import urlparse
  10. import re
  11. end_head_regex = re.compile("</head", re.IGNORECASE)
  12. import MultipartPostHandler
  13. # this is not mine, or part of standard python (though it should be!)
  14. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  15. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  16. from datetime import date
  17. from time import strptime
  18. date_format = "%d/%m/%Y"
  19. our_date = date(2007,4,25)
  20. #This is to get the system key out of the info url
  21. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  22. class AcolnetParser(HTMLParser.HTMLParser):
  23. case_number_tr = None # this one can be got by the td class attribute
  24. reg_date_tr = None
  25. location_tr = None
  26. proposal_tr = None
  27. # There is no online comment facility in these, so we provide an
  28. # appropriate email address instead
  29. comments_email_address = None
  30. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  31. def __init__(self,
  32. authority_name,
  33. authority_short_name,
  34. base_url,
  35. debug=False):
  36. HTMLParser.HTMLParser.__init__(self)
  37. self.authority_name = authority_name
  38. self.authority_short_name = authority_short_name
  39. self.base_url = base_url
  40. self.debug = debug
  41. self._tr_number = 0
  42. # This will be used to track the subtable depth
  43. # when we are in a results-table, in order to
  44. # avoid adding an application before we have got to
  45. # the end of the results-table
  46. self._subtable_depth = None
  47. self._in_td = False
  48. # This in where we store the results
  49. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  50. # This will store the planning application we are currently working on.
  51. self._current_application = None
  52. def _cleanupHTML(self, html):
  53. """This method should be overridden in subclasses to perform site specific
  54. HTML cleanup."""
  55. return html
  56. def handle_starttag(self, tag, attrs):
  57. #print tag, attrs
  58. if tag == "table":
  59. if self._current_application is None:
  60. # Each application is in a separate table with class "results-table"
  61. for key, value in attrs:
  62. if key == "class" and value == "results-table":
  63. #print "found results-table"
  64. self._current_application = PlanningApplication()
  65. self._tr_number = 0
  66. self._subtable_depth = 0
  67. self._current_application.comment_url = self.comments_email_address
  68. break
  69. else:
  70. # We are already in a results-table, and this is the start of a subtable,
  71. # so increment the subtable depth.
  72. self._subtable_depth += 1
  73. elif self._current_application is not None:
  74. if tag == "tr" and self._subtable_depth == 0:
  75. self._tr_number += 1
  76. if tag == "td":
  77. self._in_td = True
  78. if tag == "a" and self._tr_number == self.case_number_tr:
  79. # this is where we get the info link and the case number
  80. for key, value in attrs:
  81. if key == "href":
  82. self._current_application.info_url = value
  83. system_key = system_key_regex.search(value).groups()[0]
  84. if self.comments_email_address is not None:
  85. self._current_application.comment_url = self.comments_email_address
  86. else:
  87. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def _getSearchResponse(self):
  124. # It looks like we sometimes need to do some stuff to get around a
  125. # javascript redirect and cookies.
  126. search_form_request = urllib2.Request(self.base_url)
  127. search_form_response = urllib2.urlopen(search_form_request)
  128. return search_form_response
  129. def getResultsByDayMonthYear(self, day, month, year):
  130. # first we fetch the search page to get ourselves some session info...
  131. search_form_response = self._getSearchResponse()
  132. search_form_contents = search_form_response.read()
  133. #outfile = open("tmpfile", "w")
  134. #outfile.write(search_form_contents)
  135. # This sometimes causes a problem in HTMLParser, so let's just get the link
  136. # out with a regex...
  137. groups = self.action_regex.search(search_form_contents).groups()
  138. action = groups[0]
  139. #print action
  140. action_url = urlparse.urljoin(self.base_url, action)
  141. #print action_url
  142. our_date = date(year, month, day)
  143. search_data = {"regdate1": our_date.strftime(date_format),
  144. "regdate2": our_date.strftime(date_format),
  145. }
  146. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  147. response = opener.open(action_url, search_data)
  148. results_html = response.read()
  149. # This is for doing site specific html cleanup
  150. results_html = self._cleanupHTML(results_html)
  151. #some javascript garbage in the header upsets HTMLParser,
  152. #so we'll just have the body
  153. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  154. #outfile = open(self.authority_short_name + ".debug", "w")
  155. #outfile.write(just_body)
  156. self.feed(just_body)
  157. return self._results
  158. def getResults(self, day, month, year):
  159. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  160. class BaberghParser(AcolnetParser):
  161. case_number_tr = 1 # this one can be got by the td class attribute
  162. reg_date_tr = 2
  163. location_tr = 4
  164. proposal_tr = 5
  165. # It would be nice to scrape this...
  166. comments_email_address = "planning.reception@babergh.gov.uk"
  167. class BasingstokeParser(AcolnetParser):
  168. case_number_tr = 1 # this one can be got by the td class attribute
  169. reg_date_tr = 3
  170. location_tr = 6
  171. proposal_tr = 8
  172. # It would be nice to scrape this...
  173. comments_email_address = "development.control@basingstoke.gov.uk"
  174. class BassetlawParser(AcolnetParser):
  175. case_number_tr = 1 # this one can be got by the td class attribute
  176. reg_date_tr = 2
  177. location_tr = 5
  178. proposal_tr = 6
  179. comments_email_address = "planning@bassetlaw.gov.uk"
  180. def _cleanupHTML(self, html):
  181. """There is a broken div in this page. We don't need any divs, so
  182. let's get rid of them all."""
  183. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  184. return div_regex.sub('', html)
  185. class BridgnorthParser(AcolnetParser):
  186. # This site is currently down...
  187. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  188. #authority_name = "Bridgenorth District Council"
  189. #authority_short_name = "Bridgenorth"
  190. case_number_tr = 1 # this one can be got by the td class attribute
  191. reg_date_tr = 2
  192. location_tr = 4
  193. proposal_tr = 5
  194. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  195. class BuryParser(AcolnetParser):
  196. case_number_tr = 1 # this one can be got by the td class attribute
  197. reg_date_tr = 2
  198. location_tr = 4
  199. proposal_tr = 5
  200. comments_email_address = "development.control@bury.gov.uk"
  201. ## class CanterburyParser(AcolnetParser):
  202. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  203. ## case_number_tr = 1 # this one can be got by the td class attribute
  204. ## reg_date_tr = 2
  205. ## location_tr = 4
  206. ## proposal_tr = 5
  207. ## authority_name = "Canterbury City Council"
  208. ## authority_short_name = "Canterbury"
  209. ## comments_email_address = ""
  210. class CarlisleParser(AcolnetParser):
  211. case_number_tr = 1 # this one can be got by the td class attribute
  212. reg_date_tr = 2
  213. location_tr = 5
  214. proposal_tr = 6
  215. comments_email_address = "dc@carlisle.gov.uk"
  216. class DerbyParser(AcolnetParser):
  217. case_number_tr = 1 # this one can be got by the td class attribute
  218. reg_date_tr = 3
  219. location_tr = 4
  220. proposal_tr = 5
  221. comments_email_address = "developmentcontrol@derby.gov.uk"
  222. class CroydonParser(AcolnetParser):
  223. case_number_tr = 1 # this one can be got by the td class attribute
  224. reg_date_tr = 3
  225. location_tr = 5
  226. proposal_tr = 6
  227. comments_email_address = "planning.control@croydon.gov.uk"
  228. class EastLindseyParser(AcolnetParser):
  229. case_number_tr = 1 # this one can be got by the td class attribute
  230. reg_date_tr = 3
  231. location_tr = 5
  232. proposal_tr = 6
  233. comments_email_address = "development.control@e-lindsey.gov.uk"
  234. class FyldeParser(AcolnetParser):
  235. case_number_tr = 1 # this one can be got by the td class attribute
  236. reg_date_tr = 2
  237. location_tr = 4
  238. proposal_tr = 5
  239. comments_email_address = "planning@fylde.gov.uk"
  240. class HarlowParser(AcolnetParser):
  241. case_number_tr = 1 # this one can be got by the td class attribute
  242. reg_date_tr = 2
  243. location_tr = 4
  244. proposal_tr = 5
  245. comments_email_address = "Planning.services@harlow.gov.uk"
  246. class HavantParser(AcolnetParser):
  247. case_number_tr = 1 # this one can be got by the td class attribute
  248. reg_date_tr = 3
  249. location_tr = 6
  250. proposal_tr = 8
  251. comments_email_address = "representations@havant.gov.uk"
  252. class HertsmereParser(AcolnetParser):
  253. case_number_tr = 1 # this one can be got by the td class attribute
  254. reg_date_tr = 2
  255. location_tr = 4
  256. proposal_tr = 5
  257. comments_email_address = "planning@hertsmere.gov.uk"
  258. class LewishamParser(AcolnetParser):
  259. case_number_tr = 1 # this one can be got by the td class attribute
  260. reg_date_tr = 2
  261. location_tr = 4
  262. proposal_tr = 5
  263. comments_email_address = "planning@lewisham.gov.uk"
  264. class NorthHertfordshireParser(AcolnetParser):
  265. case_number_tr = 1 # this one can be got by the td class attribute
  266. reg_date_tr = 2
  267. location_tr = 4
  268. proposal_tr = 5
  269. ## class MidSuffolkParser(AcolnetParser):
  270. ## case_number_tr = 1 # this one can be got by the td class attribute
  271. ## reg_date_tr = 2
  272. ## location_tr = 4
  273. ## proposal_tr = 5
  274. ## comments_email_address = "planning@lewisham.gov.uk"
  275. ## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  276. class NewForestNPParser(AcolnetParser):
  277. # In this case there is an online comment facility at the
  278. # bottom of each view app page...
  279. case_number_tr = 1 # this one can be got by the td class attribute
  280. reg_date_tr = 2
  281. location_tr = 4
  282. proposal_tr = 5
  283. class NewForestDCParser(AcolnetParser):
  284. # In this case there is an online comment facility at the
  285. # bottom of each view app page...
  286. case_number_tr = 1 # this one can be got by the td class attribute
  287. reg_date_tr = 2
  288. location_tr = 5
  289. proposal_tr = 6
  290. class NorthWiltshireParser(AcolnetParser):
  291. case_number_tr = 1 # this one can be got by the td class attribute
  292. reg_date_tr = 3
  293. location_tr = 6
  294. proposal_tr = 7
  295. class OldhamParser(AcolnetParser):
  296. case_number_tr = 1 # this one can be got by the td class attribute
  297. reg_date_tr = 3
  298. location_tr = 6
  299. proposal_tr = 7
  300. def _cleanupHTML(self, html):
  301. """There is a bad table end tag in this one.
  302. Fix it before we start"""
  303. bad_table_end = '</table summary="Copyright">'
  304. good_table_end = '</table>'
  305. return html.replace(bad_table_end, good_table_end)
  306. class RenfrewshireParser(AcolnetParser):
  307. case_number_tr = 1 # this one can be got by the td class attribute
  308. reg_date_tr = 2
  309. location_tr = 4
  310. proposal_tr = 5
  311. comments_email_address = "pt@renfrewshire.gov.uk"
  312. class SouthBedfordshireParser(AcolnetParser):
  313. case_number_tr = 1 # this one can be got by the td class attribute
  314. reg_date_tr = 3
  315. location_tr = 5
  316. proposal_tr = 6
  317. class SuffolkCoastalParser(AcolnetParser):
  318. case_number_tr = 1 # this one can be got by the td class attribute
  319. reg_date_tr = 2
  320. location_tr = 4
  321. proposal_tr = 5
  322. comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
  323. class SurreyHeathParser(AcolnetParser):
  324. # This is not working yet.
  325. # _getSearchResponse is an attempt to work around
  326. # cookies and a javascript redirect.
  327. # I may have a bit more of a go at this at some point if I have time.
  328. case_number_tr = 1 # this one can be got by the td class attribute
  329. reg_date_tr = 2
  330. location_tr = 4
  331. proposal_tr = 5
  332. comments_email_address = "development-control@surreyheath.gov.uk"
  333. def _getSearchResponse(self):
  334. # It looks like we sometimes need to do some stuff to get around a
  335. # javascript redirect and cookies.
  336. search_form_request = urllib2.Request(self.base_url)
  337. # Lying about the user-agent doesn't seem to help.
  338. #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
  339. search_form_response = urllib2.urlopen(search_form_request)
  340. cookie_jar.extract_cookies(search_form_response, search_form_request)
  341. print search_form_response.geturl()
  342. print search_form_response.info()
  343. print search_form_response.read()
  344. # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
  345. # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
  346. # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
  347. # javascript_redirect_request.add_header('Referer', validate_url)
  348. # cookie_jar.add_cookie_header(javascript_redirect_request)
  349. # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
  350. # return javascript_redirect_response
  351. if __name__ == '__main__':
  352. day = 31
  353. month = 8
  354. year = 2007
  355. # returns error 400 - bad request
  356. #parser = BridgenorthParser()
  357. # cambridgeshire is a bit different...
  358. # no advanced search page
  359. # canterbury
  360. # results as columns of one table
  361. parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  362. print parser.getResults(day, month, year)