Automatically exported from code.google.com/p/planningalerts
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

AcolnetParser.py 16 KiB

před 17 roky
před 17 roky
před 17 roky
před 17 roky
před 17 roky
před 17 roky
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. import urlparse
  6. import re
  7. end_head_regex = re.compile("</head", re.IGNORECASE)
  8. import MultipartPostHandler
  9. # this is not mine, or part of standard python (though it should be!)
  10. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  11. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  12. from datetime import date
  13. from time import strptime
  14. date_format = "%d/%m/%Y"
  15. our_date = date(2007,4,25)
  16. class AcolnetParser(HTMLParser.HTMLParser):
  17. case_number_tr = None # this one can be got by the td class attribute
  18. reg_date_tr = None
  19. location_tr = None
  20. proposal_tr = None
  21. # There is no online comment facility in these, so we provide an
  22. # appropriate email address instead
  23. comments_email_address = None
  24. def __init__(self,
  25. authority_name,
  26. authority_short_name,
  27. base_url,
  28. debug=False):
  29. HTMLParser.HTMLParser.__init__(self)
  30. self.authority_name = authority_name
  31. self.authority_short_name = authority_short_name
  32. self.base_url = base_url
  33. self.debug = debug
  34. self._tr_number = 0
  35. # This will be used to track the subtable depth
  36. # when we are in a results-table, in order to
  37. # avoid adding an application before we have got to
  38. # the end of the results-table
  39. self._subtable_depth = None
  40. self._in_td = False
  41. # This in where we store the results
  42. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  43. # This will store the planning application we are currently working on.
  44. self._current_application = None
  45. def _cleanupHTML(self, html):
  46. """This method should be overridden in subclasses to perform site specific
  47. HTML cleanup."""
  48. return html
  49. def handle_starttag(self, tag, attrs):
  50. #print tag, attrs
  51. if tag == "table":
  52. if self._current_application is None:
  53. # Each application is in a separate table with class "results-table"
  54. for key, value in attrs:
  55. if key == "class" and value == "results-table":
  56. #print "found results-table"
  57. self._current_application = PlanningApplication()
  58. self._tr_number = 0
  59. self._subtable_depth = 0
  60. self._current_application.comment_url = self.comments_email_address
  61. break
  62. else:
  63. # We are already in a results-table, and this is the start of a subtable,
  64. # so increment the subtable depth.
  65. self._subtable_depth += 1
  66. elif self._current_application is not None:
  67. if tag == "tr" and self._subtable_depth == 0:
  68. self._tr_number += 1
  69. if tag == "td":
  70. self._in_td = True
  71. if self._tr_number == self.case_number_tr:
  72. #get the reference and the info link here
  73. pass
  74. elif self._tr_number == self.reg_date_tr:
  75. #get the registration date here
  76. pass
  77. elif self._tr_number == self.location_tr:
  78. #get the address and postcode here
  79. pass
  80. elif self._tr_number == self.proposal_tr:
  81. #get the description here
  82. pass
  83. if tag == "a" and self._tr_number == self.case_number_tr:
  84. # this is where we get the info link and the case number
  85. for key, value in attrs:
  86. if key == "href":
  87. self._current_application.info_url = value
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def getResultsByDayMonthYear(self, day, month, year):
  124. # first we fetch the search page to get ourselves some session info...
  125. search_form_response = urllib2.urlopen(self.base_url)
  126. search_form_contents = search_form_response.read()
  127. #outfile = open("tmpfile", "w")
  128. #outfile.write(search_form_contents)
  129. # This sometimes causes a problem in HTMLParser, so let's just get the link
  130. # out with a regex...
  131. groups = self.action_regex.search(search_form_contents).groups()
  132. action = groups[0]
  133. #print action
  134. action_url = urlparse.urljoin(self.base_url, action)
  135. #print action_url
  136. our_date = date(year, month, day)
  137. search_data = {"regdate1": our_date.strftime(date_format),
  138. "regdate2": our_date.strftime(date_format),
  139. }
  140. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  141. response = opener.open(action_url, search_data)
  142. results_html = response.read()
  143. # This is for doing site specific html cleanup
  144. results_html = self._cleanupHTML(results_html)
  145. #some javascript garbage in the header upsets HTMLParser,
  146. #so we'll just have the body
  147. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  148. #outfile = open(self.authority_short_name + ".debug", "w")
  149. #outfile.write(just_body)
  150. self.feed(just_body)
  151. return self._results
  152. def getResults(self, day, month, year):
  153. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  154. class BaberghParser(AcolnetParser):
  155. #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  156. case_number_tr = 1 # this one can be got by the td class attribute
  157. reg_date_tr = 2
  158. location_tr = 4
  159. proposal_tr = 5
  160. #authority_name = "Babergh District Council"
  161. #authority_short_name = "Babergh"
  162. # It would be nice to scrape this...
  163. comments_email_address = "planning.reception@babergh.gov.uk"
  164. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  165. class BasingstokeParser(AcolnetParser):
  166. #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  167. case_number_tr = 1 # this one can be got by the td class attribute
  168. reg_date_tr = 3
  169. location_tr = 6
  170. proposal_tr = 8
  171. #authority_name = "Basingstoke and Deane Borough Council"
  172. #authority_short_name = "Basingstoke and Deane"
  173. # It would be nice to scrape this...
  174. comments_email_address = "development.control@basingstoke.gov.uk"
  175. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  176. class BassetlawParser(AcolnetParser):
  177. #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  178. case_number_tr = 1 # this one can be got by the td class attribute
  179. reg_date_tr = 2
  180. location_tr = 5
  181. proposal_tr = 6
  182. #authority_name = "Bassetlaw District Council"
  183. #authority_short_name = "Bassetlaw"
  184. comments_email_address = "planning@bassetlaw.gov.uk"
  185. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  186. def _cleanupHTML(self, html):
  187. """There is a broken div in this page. We don't need any divs, so
  188. let's get rid of them all."""
  189. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  190. return div_regex.sub('', html)
  191. class BridgenorthParser(AcolnetParser):
  192. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  193. case_number_tr = 1 # this one can be got by the td class attribute
  194. reg_date_tr = 2
  195. location_tr = 4
  196. proposal_tr = 5
  197. #authority_name = "Bridgenorth District Council"
  198. #authority_short_name = "Bridgenorth"
  199. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  200. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  201. class BuryParser(AcolnetParser):
  202. #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  203. case_number_tr = 1 # this one can be got by the td class attribute
  204. reg_date_tr = 2
  205. location_tr = 4
  206. proposal_tr = 5
  207. #authority_name = "Bury Metropolitan Borough Council"
  208. #authority_short_name = "Bury"
  209. comments_email_address = "development.control@bury.gov.uk"
  210. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  211. ## class CanterburyParser(AcolnetParser):
  212. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  213. ## case_number_tr = 1 # this one can be got by the td class attribute
  214. ## reg_date_tr = 2
  215. ## location_tr = 4
  216. ## proposal_tr = 5
  217. ## authority_name = "Canterbury City Council"
  218. ## authority_short_name = "Canterbury"
  219. ## comments_email_address = ""
  220. ## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  221. class CarlisleParser(AcolnetParser):
  222. #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  223. case_number_tr = 1 # this one can be got by the td class attribute
  224. reg_date_tr = 2
  225. location_tr = 5
  226. proposal_tr = 6
  227. #authority_name = "Carlisle City Council"
  228. #authority_short_name = "Carlisle"
  229. comments_email_address = "dc@carlisle.gov.uk"
  230. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  231. class DerbyParser(AcolnetParser):
  232. #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  233. case_number_tr = 1 # this one can be got by the td class attribute
  234. reg_date_tr = 3
  235. location_tr = 4
  236. proposal_tr = 5
  237. #authority_name = "Derby City Council"
  238. #authority_short_name = "Derby"
  239. comments_email_address = "developmentcontrol@derby.gov.uk"
  240. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  241. class CroydonParser(AcolnetParser):
  242. case_number_tr = 1 # this one can be got by the td class attribute
  243. reg_date_tr = 3
  244. location_tr = 5
  245. proposal_tr = 6
  246. comments_email_address = "planning.control@croydon.gov.uk"
  247. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  248. class EastLindseyParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 3
  251. location_tr = 5
  252. proposal_tr = 6
  253. comments_email_address = "development.control@e-lindsey.gov.uk"
  254. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"return ValidateSearch\(\)\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  255. class FyldeParser(AcolnetParser):
  256. case_number_tr = 1 # this one can be got by the td class attribute
  257. reg_date_tr = 2
  258. location_tr = 4
  259. proposal_tr = 5
  260. comments_email_address = "planning@fylde.gov.uk"
  261. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  262. class HarlowParser(AcolnetParser):
  263. case_number_tr = 1 # this one can be got by the td class attribute
  264. reg_date_tr = 2
  265. location_tr = 4
  266. proposal_tr = 5
  267. comments_email_address = "Planning.services@harlow.gov.uk"
  268. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  269. class HavantParser(AcolnetParser):
  270. case_number_tr = 1 # this one can be got by the td class attribute
  271. reg_date_tr = 2
  272. location_tr = 4
  273. proposal_tr = 5
  274. comments_email_address = "representations@havant.gov.uk"
  275. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" theme=\"\"[theme]\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  276. class NorthHertfordshireParser(AcolnetParser):
  277. case_number_tr = 1 # this one can be got by the td class attribute
  278. reg_date_tr = 2
  279. location_tr = 4
  280. proposal_tr = 5
  281. comments_email_address = "planningcontrol@north-herts.gov.uk"
  282. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  283. if __name__ == '__main__':
  284. day = 15
  285. month = 3
  286. year = 2007
  287. # working
  288. # parser = BasingstokeParser()
  289. #parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  290. # works with the divs stripped out
  291. #parser = BassetlawParser()
  292. # returns error 400 - bad request
  293. #parser = BridgenorthParser()
  294. # working
  295. #parser = BuryParser()
  296. # cambridgeshire is a bit different...
  297. # no advanced search page
  298. # canterbury
  299. # results as columns of one table
  300. # returns error 400 - bad request
  301. #parser = CarlisleParser()
  302. # working
  303. #parser = DerbyParser()
  304. parser = HavantParser("HavantBC", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  305. print parser.getResults(day, month, year)