Automatically exported from code.google.com/p/planningalerts
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

AcolnetParser.py 17 KiB

hace 17 años
hace 17 años
hace 17 años
hace 17 años
hace 17 años
hace 17 años
hace 17 años
hace 17 años
hace 17 años
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. import urlparse
  6. import re
  7. end_head_regex = re.compile("</head", re.IGNORECASE)
  8. import MultipartPostHandler
  9. # this is not mine, or part of standard python (though it should be!)
  10. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  11. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  12. from datetime import date
  13. from time import strptime
  14. date_format = "%d/%m/%Y"
  15. our_date = date(2007,4,25)
  16. class AcolnetParser(HTMLParser.HTMLParser):
  17. case_number_tr = None # this one can be got by the td class attribute
  18. reg_date_tr = None
  19. location_tr = None
  20. proposal_tr = None
  21. # There is no online comment facility in these, so we provide an
  22. # appropriate email address instead
  23. comments_email_address = None
  24. def __init__(self,
  25. authority_name,
  26. authority_short_name,
  27. base_url,
  28. debug=False):
  29. HTMLParser.HTMLParser.__init__(self)
  30. self.authority_name = authority_name
  31. self.authority_short_name = authority_short_name
  32. self.base_url = base_url
  33. self.debug = debug
  34. self._tr_number = 0
  35. # This will be used to track the subtable depth
  36. # when we are in a results-table, in order to
  37. # avoid adding an application before we have got to
  38. # the end of the results-table
  39. self._subtable_depth = None
  40. self._in_td = False
  41. # This in where we store the results
  42. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  43. # This will store the planning application we are currently working on.
  44. self._current_application = None
  45. def _cleanupHTML(self, html):
  46. """This method should be overridden in subclasses to perform site specific
  47. HTML cleanup."""
  48. return html
  49. def handle_starttag(self, tag, attrs):
  50. #print tag, attrs
  51. if tag == "table":
  52. if self._current_application is None:
  53. # Each application is in a separate table with class "results-table"
  54. for key, value in attrs:
  55. if key == "class" and value == "results-table":
  56. #print "found results-table"
  57. self._current_application = PlanningApplication()
  58. self._tr_number = 0
  59. self._subtable_depth = 0
  60. self._current_application.comment_url = self.comments_email_address
  61. break
  62. else:
  63. # We are already in a results-table, and this is the start of a subtable,
  64. # so increment the subtable depth.
  65. self._subtable_depth += 1
  66. elif self._current_application is not None:
  67. if tag == "tr" and self._subtable_depth == 0:
  68. self._tr_number += 1
  69. if tag == "td":
  70. self._in_td = True
  71. if self._tr_number == self.case_number_tr:
  72. #get the reference and the info link here
  73. pass
  74. elif self._tr_number == self.reg_date_tr:
  75. #get the registration date here
  76. pass
  77. elif self._tr_number == self.location_tr:
  78. #get the address and postcode here
  79. pass
  80. elif self._tr_number == self.proposal_tr:
  81. #get the description here
  82. pass
  83. if tag == "a" and self._tr_number == self.case_number_tr:
  84. # this is where we get the info link and the case number
  85. for key, value in attrs:
  86. if key == "href":
  87. self._current_application.info_url = value
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def getResultsByDayMonthYear(self, day, month, year):
  124. # first we fetch the search page to get ourselves some session info...
  125. search_form_response = urllib2.urlopen(self.base_url)
  126. search_form_contents = search_form_response.read()
  127. #outfile = open("tmpfile", "w")
  128. #outfile.write(search_form_contents)
  129. # This sometimes causes a problem in HTMLParser, so let's just get the link
  130. # out with a regex...
  131. groups = self.action_regex.search(search_form_contents).groups()
  132. action = groups[0]
  133. #print action
  134. action_url = urlparse.urljoin(self.base_url, action)
  135. #print action_url
  136. our_date = date(year, month, day)
  137. search_data = {"regdate1": our_date.strftime(date_format),
  138. "regdate2": our_date.strftime(date_format),
  139. }
  140. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  141. response = opener.open(action_url, search_data)
  142. results_html = response.read()
  143. # This is for doing site specific html cleanup
  144. results_html = self._cleanupHTML(results_html)
  145. #some javascript garbage in the header upsets HTMLParser,
  146. #so we'll just have the body
  147. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  148. #outfile = open(self.authority_short_name + ".debug", "w")
  149. #outfile.write(just_body)
  150. self.feed(just_body)
  151. return self._results
  152. def getResults(self, day, month, year):
  153. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  154. class BaberghParser(AcolnetParser):
  155. #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  156. case_number_tr = 1 # this one can be got by the td class attribute
  157. reg_date_tr = 2
  158. location_tr = 4
  159. proposal_tr = 5
  160. #authority_name = "Babergh District Council"
  161. #authority_short_name = "Babergh"
  162. # It would be nice to scrape this...
  163. comments_email_address = "planning.reception@babergh.gov.uk"
  164. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  165. class BasingstokeParser(AcolnetParser):
  166. #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  167. case_number_tr = 1 # this one can be got by the td class attribute
  168. reg_date_tr = 3
  169. location_tr = 6
  170. proposal_tr = 8
  171. #authority_name = "Basingstoke and Deane Borough Council"
  172. #authority_short_name = "Basingstoke and Deane"
  173. # It would be nice to scrape this...
  174. comments_email_address = "development.control@basingstoke.gov.uk"
  175. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  176. class BassetlawParser(AcolnetParser):
  177. #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  178. case_number_tr = 1 # this one can be got by the td class attribute
  179. reg_date_tr = 2
  180. location_tr = 5
  181. proposal_tr = 6
  182. #authority_name = "Bassetlaw District Council"
  183. #authority_short_name = "Bassetlaw"
  184. comments_email_address = "planning@bassetlaw.gov.uk"
  185. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  186. def _cleanupHTML(self, html):
  187. """There is a broken div in this page. We don't need any divs, so
  188. let's get rid of them all."""
  189. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  190. return div_regex.sub('', html)
  191. class BridgenorthParser(AcolnetParser):
  192. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  193. case_number_tr = 1 # this one can be got by the td class attribute
  194. reg_date_tr = 2
  195. location_tr = 4
  196. proposal_tr = 5
  197. #authority_name = "Bridgenorth District Council"
  198. #authority_short_name = "Bridgenorth"
  199. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  200. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  201. class BuryParser(AcolnetParser):
  202. #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  203. case_number_tr = 1 # this one can be got by the td class attribute
  204. reg_date_tr = 2
  205. location_tr = 4
  206. proposal_tr = 5
  207. #authority_name = "Bury Metropolitan Borough Council"
  208. #authority_short_name = "Bury"
  209. comments_email_address = "development.control@bury.gov.uk"
  210. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  211. ## class CanterburyParser(AcolnetParser):
  212. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  213. ## case_number_tr = 1 # this one can be got by the td class attribute
  214. ## reg_date_tr = 2
  215. ## location_tr = 4
  216. ## proposal_tr = 5
  217. ## authority_name = "Canterbury City Council"
  218. ## authority_short_name = "Canterbury"
  219. ## comments_email_address = ""
  220. ## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  221. class CarlisleParser(AcolnetParser):
  222. #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  223. case_number_tr = 1 # this one can be got by the td class attribute
  224. reg_date_tr = 2
  225. location_tr = 5
  226. proposal_tr = 6
  227. #authority_name = "Carlisle City Council"
  228. #authority_short_name = "Carlisle"
  229. comments_email_address = "dc@carlisle.gov.uk"
  230. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  231. class DerbyParser(AcolnetParser):
  232. #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  233. case_number_tr = 1 # this one can be got by the td class attribute
  234. reg_date_tr = 3
  235. location_tr = 4
  236. proposal_tr = 5
  237. #authority_name = "Derby City Council"
  238. #authority_short_name = "Derby"
  239. comments_email_address = "developmentcontrol@derby.gov.uk"
  240. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  241. class CroydonParser(AcolnetParser):
  242. case_number_tr = 1 # this one can be got by the td class attribute
  243. reg_date_tr = 3
  244. location_tr = 5
  245. proposal_tr = 6
  246. comments_email_address = "planning.control@croydon.gov.uk"
  247. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  248. class EastLindseyParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 3
  251. location_tr = 5
  252. proposal_tr = 6
  253. comments_email_address = "development.control@e-lindsey.gov.uk"
  254. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"return ValidateSearch\(\)\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  255. class FyldeParser(AcolnetParser):
  256. case_number_tr = 1 # this one can be got by the td class attribute
  257. reg_date_tr = 2
  258. location_tr = 4
  259. proposal_tr = 5
  260. comments_email_address = "planning@fylde.gov.uk"
  261. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  262. class HarlowParser(AcolnetParser):
  263. case_number_tr = 1 # this one can be got by the td class attribute
  264. reg_date_tr = 2
  265. location_tr = 4
  266. proposal_tr = 5
  267. comments_email_address = "Planning.services@harlow.gov.uk"
  268. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  269. class HavantParser(AcolnetParser):
  270. case_number_tr = 1 # this one can be got by the td class attribute
  271. reg_date_tr = 2
  272. location_tr = 4
  273. proposal_tr = 5
  274. comments_email_address = "representations@havant.gov.uk"
  275. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" theme=\"\"[theme]\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  276. class HertsmereParser(AcolnetParser):
  277. case_number_tr = 1 # this one can be got by the td class attribute
  278. reg_date_tr = 2
  279. location_tr = 4
  280. proposal_tr = 5
  281. comments_email_address = "planning@hertsmere.gov.uk"
  282. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  283. class LewishamParser(AcolnetParser):
  284. case_number_tr = 1 # this one can be got by the td class attribute
  285. reg_date_tr = 2
  286. location_tr = 4
  287. proposal_tr = 5
  288. comments_email_address = "planning@hertsmere.gov.uk"
  289. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  290. class NorthHertfordshireParser(AcolnetParser):
  291. case_number_tr = 1 # this one can be got by the td class attribute
  292. reg_date_tr = 2
  293. location_tr = 4
  294. proposal_tr = 5
  295. comments_email_address = "planning@lewisham.gov.uk"
  296. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  297. if __name__ == '__main__':
  298. day = 15
  299. month = 3
  300. year = 2007
  301. # working
  302. # parser = BasingstokeParser()
  303. #parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  304. # works with the divs stripped out
  305. #parser = BassetlawParser()
  306. # returns error 400 - bad request
  307. #parser = BridgenorthParser()
  308. # working
  309. #parser = BuryParser()
  310. # cambridgeshire is a bit different...
  311. # no advanced search page
  312. # canterbury
  313. # results as columns of one table
  314. # returns error 400 - bad request
  315. #parser = CarlisleParser()
  316. # working
  317. #parser = DerbyParser()
  318. parser = HavantParser("HavantBC", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  319. print parser.getResults(day, month, year)