Automatically exported from code.google.com/p/planningalerts
Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
 
 
 
 
 
 

505 рядки
17 KiB

  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  6. import cookielib
  7. cookie_jar = cookielib.CookieJar()
  8. ################
  9. import urlparse
  10. import re
  11. end_head_regex = re.compile("</head", re.IGNORECASE)
  12. import MultipartPostHandler
  13. # this is not mine, or part of standard python (though it should be!)
  14. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  15. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  16. from datetime import date
  17. from time import strptime
  18. date_format = "%d/%m/%Y"
  19. our_date = date(2007,4,25)
  20. #This is to get the system key out of the info url
  21. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  22. class AcolnetParser(HTMLParser.HTMLParser):
  23. case_number_tr = None # this one can be got by the td class attribute
  24. reg_date_tr = None
  25. location_tr = None
  26. proposal_tr = None
  27. # There is no online comment facility in these, so we provide an
  28. # appropriate email address instead
  29. comments_email_address = None
  30. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  31. def __init__(self,
  32. authority_name,
  33. authority_short_name,
  34. base_url,
  35. debug=False):
  36. HTMLParser.HTMLParser.__init__(self)
  37. self.authority_name = authority_name
  38. self.authority_short_name = authority_short_name
  39. self.base_url = base_url
  40. self.debug = debug
  41. self._tr_number = 0
  42. # This will be used to track the subtable depth
  43. # when we are in a results-table, in order to
  44. # avoid adding an application before we have got to
  45. # the end of the results-table
  46. self._subtable_depth = None
  47. self._in_td = False
  48. # This in where we store the results
  49. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  50. # This will store the planning application we are currently working on.
  51. self._current_application = None
  52. def _cleanupHTML(self, html):
  53. """This method should be overridden in subclasses to perform site specific
  54. HTML cleanup."""
  55. return html
  56. def handle_starttag(self, tag, attrs):
  57. #print tag, attrs
  58. if tag == "table":
  59. if self._current_application is None:
  60. # Each application is in a separate table with class "results-table"
  61. for key, value in attrs:
  62. if key == "class" and value == "results-table":
  63. #print "found results-table"
  64. self._current_application = PlanningApplication()
  65. self._tr_number = 0
  66. self._subtable_depth = 0
  67. self._current_application.comment_url = self.comments_email_address
  68. break
  69. else:
  70. # We are already in a results-table, and this is the start of a subtable,
  71. # so increment the subtable depth.
  72. self._subtable_depth += 1
  73. elif self._current_application is not None:
  74. if tag == "tr" and self._subtable_depth == 0:
  75. self._tr_number += 1
  76. if tag == "td":
  77. self._in_td = True
  78. if tag == "a" and self._tr_number == self.case_number_tr:
  79. # this is where we get the info link and the case number
  80. for key, value in attrs:
  81. if key == "href":
  82. self._current_application.info_url = value
  83. system_key = system_key_regex.search(value).groups()[0]
  84. if self.comments_email_address is not None:
  85. self._current_application.comment_url = self.comments_email_address
  86. else:
  87. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def _getSearchResponse(self):
  124. # It looks like we sometimes need to do some stuff to get around a
  125. # javascript redirect and cookies.
  126. search_form_request = urllib2.Request(self.base_url)
  127. search_form_response = urllib2.urlopen(search_form_request)
  128. return search_form_response
  129. def getResultsByDayMonthYear(self, day, month, year):
  130. # first we fetch the search page to get ourselves some session info...
  131. search_form_response = self._getSearchResponse()
  132. search_form_contents = search_form_response.read()
  133. #outfile = open("tmpfile", "w")
  134. #outfile.write(search_form_contents)
  135. # This sometimes causes a problem in HTMLParser, so let's just get the link
  136. # out with a regex...
  137. groups = self.action_regex.search(search_form_contents).groups()
  138. action = groups[0]
  139. #print action
  140. action_url = urlparse.urljoin(self.base_url, action)
  141. #print action_url
  142. our_date = date(year, month, day)
  143. search_data = {"regdate1": our_date.strftime(date_format),
  144. "regdate2": our_date.strftime(date_format),
  145. }
  146. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  147. response = opener.open(action_url, search_data)
  148. results_html = response.read()
  149. # This is for doing site specific html cleanup
  150. results_html = self._cleanupHTML(results_html)
  151. #some javascript garbage in the header upsets HTMLParser,
  152. #so we'll just have the body
  153. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  154. #outfile = open(self.authority_short_name + ".debug", "w")
  155. #outfile.write(just_body)
  156. self.feed(just_body)
  157. return self._results
  158. def getResults(self, day, month, year):
  159. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  160. class BaberghParser(AcolnetParser):
  161. case_number_tr = 1 # this one can be got by the td class attribute
  162. reg_date_tr = 2
  163. location_tr = 4
  164. proposal_tr = 5
  165. # It would be nice to scrape this...
  166. comments_email_address = "planning.reception@babergh.gov.uk"
  167. class BasingstokeParser(AcolnetParser):
  168. case_number_tr = 1 # this one can be got by the td class attribute
  169. reg_date_tr = 3
  170. location_tr = 6
  171. proposal_tr = 8
  172. # It would be nice to scrape this...
  173. comments_email_address = "development.control@basingstoke.gov.uk"
  174. class BassetlawParser(AcolnetParser):
  175. case_number_tr = 1 # this one can be got by the td class attribute
  176. reg_date_tr = 2
  177. location_tr = 5
  178. proposal_tr = 6
  179. comments_email_address = "planning@bassetlaw.gov.uk"
  180. def _cleanupHTML(self, html):
  181. """There is a broken div in this page. We don't need any divs, so
  182. let's get rid of them all."""
  183. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  184. return div_regex.sub('', html)
  185. class BridgnorthParser(AcolnetParser):
  186. # This site is currently down...
  187. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  188. #authority_name = "Bridgenorth District Council"
  189. #authority_short_name = "Bridgenorth"
  190. case_number_tr = 1 # this one can be got by the td class attribute
  191. reg_date_tr = 2
  192. location_tr = 4
  193. proposal_tr = 5
  194. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  195. class BuryParser(AcolnetParser):
  196. case_number_tr = 1 # this one can be got by the td class attribute
  197. reg_date_tr = 2
  198. location_tr = 4
  199. proposal_tr = 5
  200. comments_email_address = "development.control@bury.gov.uk"
  201. ## class CanterburyParser(AcolnetParser):
  202. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  203. ## case_number_tr = 1 # this one can be got by the td class attribute
  204. ## reg_date_tr = 2
  205. ## location_tr = 4
  206. ## proposal_tr = 5
  207. ## authority_name = "Canterbury City Council"
  208. ## authority_short_name = "Canterbury"
  209. ## comments_email_address = ""
  210. class CarlisleParser(AcolnetParser):
  211. case_number_tr = 1 # this one can be got by the td class attribute
  212. reg_date_tr = 2
  213. location_tr = 5
  214. proposal_tr = 6
  215. comments_email_address = "dc@carlisle.gov.uk"
  216. class DerbyParser(AcolnetParser):
  217. case_number_tr = 1 # this one can be got by the td class attribute
  218. reg_date_tr = 3
  219. location_tr = 4
  220. proposal_tr = 5
  221. comments_email_address = "developmentcontrol@derby.gov.uk"
  222. class CroydonParser(AcolnetParser):
  223. case_number_tr = 1 # this one can be got by the td class attribute
  224. reg_date_tr = 3
  225. location_tr = 5
  226. proposal_tr = 6
  227. comments_email_address = "planning.control@croydon.gov.uk"
  228. class EastLindseyParser(AcolnetParser):
  229. case_number_tr = 1 # this one can be got by the td class attribute
  230. reg_date_tr = 3
  231. location_tr = 5
  232. proposal_tr = 6
  233. comments_email_address = "development.control@e-lindsey.gov.uk"
  234. class FyldeParser(AcolnetParser):
  235. case_number_tr = 1 # this one can be got by the td class attribute
  236. reg_date_tr = 2
  237. location_tr = 4
  238. proposal_tr = 5
  239. comments_email_address = "planning@fylde.gov.uk"
  240. class HarlowParser(AcolnetParser):
  241. case_number_tr = 1 # this one can be got by the td class attribute
  242. reg_date_tr = 2
  243. location_tr = 4
  244. proposal_tr = 5
  245. comments_email_address = "Planning.services@harlow.gov.uk"
  246. class HavantParser(AcolnetParser):
  247. case_number_tr = 1 # this one can be got by the td class attribute
  248. reg_date_tr = 3
  249. location_tr = 6
  250. proposal_tr = 8
  251. comments_email_address = "representations@havant.gov.uk"
  252. class HertsmereParser(AcolnetParser):
  253. case_number_tr = 1 # this one can be got by the td class attribute
  254. reg_date_tr = 2
  255. location_tr = 4
  256. proposal_tr = 5
  257. comments_email_address = "planning@hertsmere.gov.uk"
  258. class LewishamParser(AcolnetParser):
  259. case_number_tr = 1 # this one can be got by the td class attribute
  260. reg_date_tr = 2
  261. location_tr = 4
  262. proposal_tr = 5
  263. comments_email_address = "planning@lewisham.gov.uk"
  264. class NorthHertfordshireParser(AcolnetParser):
  265. case_number_tr = 1 # this one can be got by the td class attribute
  266. reg_date_tr = 2
  267. location_tr = 4
  268. proposal_tr = 5
  269. ## class MidSuffolkParser(AcolnetParser):
  270. ## case_number_tr = 1 # this one can be got by the td class attribute
  271. ## reg_date_tr = 2
  272. ## location_tr = 4
  273. ## proposal_tr = 5
  274. ## comments_email_address = "planning@lewisham.gov.uk"
  275. ## #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  276. class NewForestNPParser(AcolnetParser):
  277. # In this case there is an online comment facility at the
  278. # bottom of each view app page...
  279. case_number_tr = 1 # this one can be got by the td class attribute
  280. reg_date_tr = 2
  281. location_tr = 4
  282. proposal_tr = 5
  283. class NewForestDCParser(AcolnetParser):
  284. # In this case there is an online comment facility at the
  285. # bottom of each view app page...
  286. case_number_tr = 1 # this one can be got by the td class attribute
  287. reg_date_tr = 2
  288. location_tr = 5
  289. proposal_tr = 6
  290. class NorthWiltshireParser(AcolnetParser):
  291. case_number_tr = 1 # this one can be got by the td class attribute
  292. reg_date_tr = 3
  293. location_tr = 6
  294. proposal_tr = 7
  295. class OldhamParser(AcolnetParser):
  296. case_number_tr = 1 # this one can be got by the td class attribute
  297. reg_date_tr = 3
  298. location_tr = 6
  299. proposal_tr = 7
  300. def _cleanupHTML(self, html):
  301. """There is a bad table end tag in this one.
  302. Fix it before we start"""
  303. bad_table_end = '</table summary="Copyright">'
  304. good_table_end = '</table>'
  305. return html.replace(bad_table_end, good_table_end)
  306. class RenfrewshireParser(AcolnetParser):
  307. case_number_tr = 1 # this one can be got by the td class attribute
  308. reg_date_tr = 2
  309. location_tr = 4
  310. proposal_tr = 5
  311. comments_email_address = "pt@renfrewshire.gov.uk"
  312. class SouthBedfordshireParser(AcolnetParser):
  313. case_number_tr = 1 # this one can be got by the td class attribute
  314. reg_date_tr = 3
  315. location_tr = 5
  316. proposal_tr = 6
  317. class SuffolkCoastalParser(AcolnetParser):
  318. case_number_tr = 1 # this one can be got by the td class attribute
  319. reg_date_tr = 2
  320. location_tr = 4
  321. proposal_tr = 5
  322. comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"
  323. class GuildfordParser(AcolnetParser):
  324. case_number_tr = 1
  325. reg_date_tr = 7
  326. location_tr = 2
  327. proposal_tr = 3
  328. #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch
  329. class SurreyHeathParser(AcolnetParser):
  330. # This is not working yet.
  331. # _getSearchResponse is an attempt to work around
  332. # cookies and a javascript redirect.
  333. # I may have a bit more of a go at this at some point if I have time.
  334. case_number_tr = 1 # this one can be got by the td class attribute
  335. reg_date_tr = 2
  336. location_tr = 4
  337. proposal_tr = 5
  338. comments_email_address = "development-control@surreyheath.gov.uk"
  339. def _getSearchResponse(self):
  340. # It looks like we sometimes need to do some stuff to get around a
  341. # javascript redirect and cookies.
  342. search_form_request = urllib2.Request(self.base_url)
  343. # Lying about the user-agent doesn't seem to help.
  344. #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
  345. search_form_response = urllib2.urlopen(search_form_request)
  346. cookie_jar.extract_cookies(search_form_response, search_form_request)
  347. print search_form_response.geturl()
  348. print search_form_response.info()
  349. print search_form_response.read()
  350. # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
  351. # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
  352. # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
  353. # javascript_redirect_request.add_header('Referer', validate_url)
  354. # cookie_jar.add_cookie_header(javascript_redirect_request)
  355. # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
  356. # return javascript_redirect_response
  357. if __name__ == '__main__':
  358. day = 22
  359. month = 2
  360. year = 2005
  361. # returns error 400 - bad request
  362. #parser = BridgenorthParser()
  363. # cambridgeshire is a bit different...
  364. # no advanced search page
  365. # canterbury
  366. # results as columns of one table
  367. #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  368. parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch")
  369. print parser.getResults(day, month, year)