Automatically exported from code.google.com/p/planningalerts
選択できるのは25トピックまでです。 トピックは、先頭が英数字で、英数字とダッシュ('-')を使用した35文字以内のものにしてください。

AcolnetParser.py 14 KiB

17年前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. import urlparse
  6. import re
  7. end_head_regex = re.compile("</head", re.IGNORECASE)
  8. import MultipartPostHandler
  9. # this is not mine, or part of standard python (though it should be!)
  10. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  11. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  12. from datetime import date
  13. from time import strptime
  14. date_format = "%d/%m/%Y"
  15. our_date = date(2007,4,25)
  16. #This is to get the system key out of the info url
  17. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  18. class AcolnetParser(HTMLParser.HTMLParser):
  19. case_number_tr = None # this one can be got by the td class attribute
  20. reg_date_tr = None
  21. location_tr = None
  22. proposal_tr = None
  23. # There is no online comment facility in these, so we provide an
  24. # appropriate email address instead
  25. comments_email_address = None
  26. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  27. def __init__(self,
  28. authority_name,
  29. authority_short_name,
  30. base_url,
  31. debug=False):
  32. HTMLParser.HTMLParser.__init__(self)
  33. self.authority_name = authority_name
  34. self.authority_short_name = authority_short_name
  35. self.base_url = base_url
  36. self.debug = debug
  37. self._tr_number = 0
  38. # This will be used to track the subtable depth
  39. # when we are in a results-table, in order to
  40. # avoid adding an application before we have got to
  41. # the end of the results-table
  42. self._subtable_depth = None
  43. self._in_td = False
  44. # This in where we store the results
  45. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  46. # This will store the planning application we are currently working on.
  47. self._current_application = None
  48. def _cleanupHTML(self, html):
  49. """This method should be overridden in subclasses to perform site specific
  50. HTML cleanup."""
  51. return html
  52. def handle_starttag(self, tag, attrs):
  53. #print tag, attrs
  54. if tag == "table":
  55. if self._current_application is None:
  56. # Each application is in a separate table with class "results-table"
  57. for key, value in attrs:
  58. if key == "class" and value == "results-table":
  59. #print "found results-table"
  60. self._current_application = PlanningApplication()
  61. self._tr_number = 0
  62. self._subtable_depth = 0
  63. self._current_application.comment_url = self.comments_email_address
  64. break
  65. else:
  66. # We are already in a results-table, and this is the start of a subtable,
  67. # so increment the subtable depth.
  68. self._subtable_depth += 1
  69. elif self._current_application is not None:
  70. if tag == "tr" and self._subtable_depth == 0:
  71. self._tr_number += 1
  72. if tag == "td":
  73. self._in_td = True
  74. if tag == "a" and self._tr_number == self.case_number_tr:
  75. # this is where we get the info link and the case number
  76. for key, value in attrs:
  77. if key == "href":
  78. self._current_application.info_url = value
  79. system_key = system_key_regex.search(value).groups()[0]
  80. if self.comments_email_address is not None:
  81. self._current_application.comment_url = self.comments_email_address
  82. else:
  83. self._current_application.comment_url = value.replace("PgeResultDetail", "PgeCommentForm")
  84. def handle_data(self, data):
  85. # If we are in the tr which contains the case number,
  86. # then data is the council reference, so
  87. # add it to self._current_application.
  88. if self._in_td:
  89. if self._tr_number == self.case_number_tr:
  90. self._current_application.council_reference = data.strip()
  91. elif self._tr_number == self.reg_date_tr:
  92. # we need to make a date object out of data
  93. date_as_str = ''.join(data.strip().split())
  94. received_date = date(*strptime(date_as_str, date_format)[0:3])
  95. #print received_date
  96. self._current_application.date_received = received_date
  97. elif self._tr_number == self.location_tr:
  98. location = data.strip()
  99. self._current_application.address = location
  100. self._current_application.postcode = getPostcodeFromText(location)
  101. elif self._tr_number == self.proposal_tr:
  102. self._current_application.description = data.strip()
  103. def handle_endtag(self, tag):
  104. #print "ending: ", tag
  105. if tag == "table" and self._current_application is not None:
  106. if self._subtable_depth > 0:
  107. self._subtable_depth -= 1
  108. else:
  109. # We need to add the last application in the table
  110. if self._current_application is not None:
  111. #print "adding application"
  112. self._results.addApplication(self._current_application)
  113. #print self._current_application
  114. self._current_application = None
  115. self._tr_number = None
  116. self._subtable_depth = None
  117. elif tag == "td":
  118. self._in_td = False
  119. def getResultsByDayMonthYear(self, day, month, year):
  120. # first we fetch the search page to get ourselves some session info...
  121. search_form_response = urllib2.urlopen(self.base_url)
  122. search_form_contents = search_form_response.read()
  123. #outfile = open("tmpfile", "w")
  124. #outfile.write(search_form_contents)
  125. # This sometimes causes a problem in HTMLParser, so let's just get the link
  126. # out with a regex...
  127. groups = self.action_regex.search(search_form_contents).groups()
  128. action = groups[0]
  129. #print action
  130. action_url = urlparse.urljoin(self.base_url, action)
  131. #print action_url
  132. our_date = date(year, month, day)
  133. search_data = {"regdate1": our_date.strftime(date_format),
  134. "regdate2": our_date.strftime(date_format),
  135. }
  136. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  137. response = opener.open(action_url, search_data)
  138. results_html = response.read()
  139. # This is for doing site specific html cleanup
  140. results_html = self._cleanupHTML(results_html)
  141. #some javascript garbage in the header upsets HTMLParser,
  142. #so we'll just have the body
  143. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  144. #outfile = open(self.authority_short_name + ".debug", "w")
  145. #outfile.write(just_body)
  146. self.feed(just_body)
  147. return self._results
  148. def getResults(self, day, month, year):
  149. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  150. class BaberghParser(AcolnetParser):
  151. case_number_tr = 1 # this one can be got by the td class attribute
  152. reg_date_tr = 2
  153. location_tr = 4
  154. proposal_tr = 5
  155. # It would be nice to scrape this...
  156. comments_email_address = "planning.reception@babergh.gov.uk"
  157. class BasingstokeParser(AcolnetParser):
  158. case_number_tr = 1 # this one can be got by the td class attribute
  159. reg_date_tr = 3
  160. location_tr = 6
  161. proposal_tr = 8
  162. # It would be nice to scrape this...
  163. comments_email_address = "development.control@basingstoke.gov.uk"
  164. class BassetlawParser(AcolnetParser):
  165. case_number_tr = 1 # this one can be got by the td class attribute
  166. reg_date_tr = 2
  167. location_tr = 5
  168. proposal_tr = 6
  169. comments_email_address = "planning@bassetlaw.gov.uk"
  170. def _cleanupHTML(self, html):
  171. """There is a broken div in this page. We don't need any divs, so
  172. let's get rid of them all."""
  173. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  174. return div_regex.sub('', html)
  175. class BridgnorthParser(AcolnetParser):
  176. # This site is currently down...
  177. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  178. #authority_name = "Bridgenorth District Council"
  179. #authority_short_name = "Bridgenorth"
  180. case_number_tr = 1 # this one can be got by the td class attribute
  181. reg_date_tr = 2
  182. location_tr = 4
  183. proposal_tr = 5
  184. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  185. class BuryParser(AcolnetParser):
  186. case_number_tr = 1 # this one can be got by the td class attribute
  187. reg_date_tr = 2
  188. location_tr = 4
  189. proposal_tr = 5
  190. comments_email_address = "development.control@bury.gov.uk"
## class CanterburyParser(AcolnetParser):
##     search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"

##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5

##     authority_name = "Canterbury City Council"
##     authority_short_name = "Canterbury"

##     comments_email_address = ""
  200. class CarlisleParser(AcolnetParser):
  201. case_number_tr = 1 # this one can be got by the td class attribute
  202. reg_date_tr = 2
  203. location_tr = 5
  204. proposal_tr = 6
  205. comments_email_address = "dc@carlisle.gov.uk"
  206. class DerbyParser(AcolnetParser):
  207. case_number_tr = 1 # this one can be got by the td class attribute
  208. reg_date_tr = 3
  209. location_tr = 4
  210. proposal_tr = 5
  211. comments_email_address = "developmentcontrol@derby.gov.uk"
  212. class CroydonParser(AcolnetParser):
  213. case_number_tr = 1 # this one can be got by the td class attribute
  214. reg_date_tr = 3
  215. location_tr = 5
  216. proposal_tr = 6
  217. comments_email_address = "planning.control@croydon.gov.uk"
  218. class EastLindseyParser(AcolnetParser):
  219. case_number_tr = 1 # this one can be got by the td class attribute
  220. reg_date_tr = 3
  221. location_tr = 5
  222. proposal_tr = 6
  223. comments_email_address = "development.control@e-lindsey.gov.uk"
  224. class FyldeParser(AcolnetParser):
  225. case_number_tr = 1 # this one can be got by the td class attribute
  226. reg_date_tr = 2
  227. location_tr = 4
  228. proposal_tr = 5
  229. comments_email_address = "planning@fylde.gov.uk"
  230. class HarlowParser(AcolnetParser):
  231. case_number_tr = 1 # this one can be got by the td class attribute
  232. reg_date_tr = 2
  233. location_tr = 4
  234. proposal_tr = 5
  235. comments_email_address = "Planning.services@harlow.gov.uk"
  236. class HavantParser(AcolnetParser):
  237. case_number_tr = 1 # this one can be got by the td class attribute
  238. reg_date_tr = 2
  239. location_tr = 4
  240. proposal_tr = 5
  241. comments_email_address = "representations@havant.gov.uk"
  242. class HertsmereParser(AcolnetParser):
  243. case_number_tr = 1 # this one can be got by the td class attribute
  244. reg_date_tr = 2
  245. location_tr = 4
  246. proposal_tr = 5
  247. comments_email_address = "planning@hertsmere.gov.uk"
  248. class LewishamParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 2
  251. location_tr = 4
  252. proposal_tr = 5
  253. comments_email_address = "planning@lewisham.gov.uk"
## class NorthHertfordshireParser(AcolnetParser):
##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5
##     comments_email_address = "planning@lewisham.gov.uk"

## class MidSuffolkParser(AcolnetParser):
##     case_number_tr = 1 # this one can be got by the td class attribute
##     reg_date_tr = 2
##     location_tr = 4
##     proposal_tr = 5
##     comments_email_address = "planning@lewisham.gov.uk"
##     #action_regex = re.compile("<FORM .*action=\"(.*ACTION=UNWRAP&RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  267. class NewForestParser(AcolnetParser):
  268. # In this case there is an online comment facility at the
  269. # bottom of each view app page...
  270. case_number_tr = 1 # this one can be got by the td class attribute
  271. reg_date_tr = 2
  272. location_tr = 4
  273. proposal_tr = 5
  274. class NorthWiltshireParser(AcolnetParser):
  275. case_number_tr = 1 # this one can be got by the td class attribute
  276. reg_date_tr = 3
  277. location_tr = 6
  278. proposal_tr = 7
  279. class OldhamParser(AcolnetParser):
  280. case_number_tr = 1 # this one can be got by the td class attribute
  281. reg_date_tr = 3
  282. location_tr = 6
  283. proposal_tr = 7
  284. def _cleanupHTML(self, html):
  285. """There is a bad table end tag in this one.
  286. Fix it before we start"""
  287. bad_table_end = '</table summary="Copyright">'
  288. good_table_end = '</table>'
  289. return html.replace(bad_table_end, good_table_end)
  290. class RenfrewshireParser(AcolnetParser):
  291. case_number_tr = 1 # this one can be got by the td class attribute
  292. reg_date_tr = 2
  293. location_tr = 4
  294. proposal_tr = 5
  295. comments_email_address = "pt@renfrewshire.gov.uk"
  296. if __name__ == '__main__':
  297. day = 15
  298. month = 3
  299. year = 2007
  300. # returns error 400 - bad request
  301. #parser = BridgenorthParser()
  302. # cambridgeshire is a bit different...
  303. # no advanced search page
  304. # canterbury
  305. # results as columns of one table
  306. parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  307. print parser.getResults(day, month, year)