Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

AcolnetParser.py 16 KiB

17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
17 vuotta sitten
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import HTMLParser
  4. #from BeautifulSoup import BeautifulSoup
  5. import urlparse
  6. import re
  7. end_head_regex = re.compile("</head", re.IGNORECASE)
  8. import MultipartPostHandler
  9. # this is not mine, or part of standard python (though it should be!)
  10. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  11. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  12. from datetime import date
  13. from time import strptime
  14. date_format = "%d/%m/%Y"
  15. our_date = date(2007,4,25)
  16. class AcolnetParser(HTMLParser.HTMLParser):
  17. case_number_tr = None # this one can be got by the td class attribute
  18. reg_date_tr = None
  19. location_tr = None
  20. proposal_tr = None
  21. # There is no online comment facility in these, so we provide an
  22. # appropriate email address instead
  23. comments_email_address = None
  24. def __init__(self,
  25. authority_name,
  26. authority_short_name,
  27. base_url,
  28. debug=False):
  29. HTMLParser.HTMLParser.__init__(self)
  30. self.authority_name = authority_name
  31. self.authority_short_name = authority_short_name
  32. self.base_url = base_url
  33. self.debug = debug
  34. self._tr_number = 0
  35. # This will be used to track the subtable depth
  36. # when we are in a results-table, in order to
  37. # avoid adding an application before we have got to
  38. # the end of the results-table
  39. self._subtable_depth = None
  40. self._in_td = False
  41. # This in where we store the results
  42. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  43. # This will store the planning application we are currently working on.
  44. self._current_application = None
  45. def _cleanupHTML(self, html):
  46. """This method should be overridden in subclasses to perform site specific
  47. HTML cleanup."""
  48. return html
  49. def handle_starttag(self, tag, attrs):
  50. #print tag, attrs
  51. if tag == "table":
  52. if self._current_application is None:
  53. # Each application is in a separate table with class "results-table"
  54. for key, value in attrs:
  55. if key == "class" and value == "results-table":
  56. #print "found results-table"
  57. self._current_application = PlanningApplication()
  58. self._tr_number = 0
  59. self._subtable_depth = 0
  60. self._current_application.comment_url = self.comments_email_address
  61. break
  62. else:
  63. # We are already in a results-table, and this is the start of a subtable,
  64. # so increment the subtable depth.
  65. self._subtable_depth += 1
  66. elif self._current_application is not None:
  67. if tag == "tr" and self._subtable_depth == 0:
  68. self._tr_number += 1
  69. if tag == "td":
  70. self._in_td = True
  71. if self._tr_number == self.case_number_tr:
  72. #get the reference and the info link here
  73. pass
  74. elif self._tr_number == self.reg_date_tr:
  75. #get the registration date here
  76. pass
  77. elif self._tr_number == self.location_tr:
  78. #get the address and postcode here
  79. pass
  80. elif self._tr_number == self.proposal_tr:
  81. #get the description here
  82. pass
  83. if tag == "a" and self._tr_number == self.case_number_tr:
  84. # this is where we get the info link and the case number
  85. for key, value in attrs:
  86. if key == "href":
  87. self._current_application.info_url = value
  88. def handle_data(self, data):
  89. # If we are in the tr which contains the case number,
  90. # then data is the council reference, so
  91. # add it to self._current_application.
  92. if self._in_td:
  93. if self._tr_number == self.case_number_tr:
  94. self._current_application.council_reference = data.strip()
  95. elif self._tr_number == self.reg_date_tr:
  96. # we need to make a date object out of data
  97. date_as_str = ''.join(data.strip().split())
  98. received_date = date(*strptime(date_as_str, date_format)[0:3])
  99. #print received_date
  100. self._current_application.date_received = received_date
  101. elif self._tr_number == self.location_tr:
  102. location = data.strip()
  103. self._current_application.address = location
  104. self._current_application.postcode = getPostcodeFromText(location)
  105. elif self._tr_number == self.proposal_tr:
  106. self._current_application.description = data.strip()
  107. def handle_endtag(self, tag):
  108. #print "ending: ", tag
  109. if tag == "table" and self._current_application is not None:
  110. if self._subtable_depth > 0:
  111. self._subtable_depth -= 1
  112. else:
  113. # We need to add the last application in the table
  114. if self._current_application is not None:
  115. #print "adding application"
  116. self._results.addApplication(self._current_application)
  117. #print self._current_application
  118. self._current_application = None
  119. self._tr_number = None
  120. self._subtable_depth = None
  121. elif tag == "td":
  122. self._in_td = False
  123. def getResultsByDayMonthYear(self, day, month, year):
  124. # first we fetch the search page to get ourselves some session info...
  125. search_form_response = urllib2.urlopen(self.base_url)
  126. search_form_contents = search_form_response.read()
  127. #outfile = open("tmpfile", "w")
  128. #outfile.write(search_form_contents)
  129. # This sometimes causes a problem in HTMLParser, so let's just get the link
  130. # out with a regex...
  131. groups = self.action_regex.search(search_form_contents).groups()
  132. action = groups[0]
  133. #print action
  134. action_url = urlparse.urljoin(self.base_url, action)
  135. #print action_url
  136. our_date = date(year, month, day)
  137. search_data = {"regdate1": our_date.strftime(date_format),
  138. "regdate2": our_date.strftime(date_format),
  139. }
  140. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  141. response = opener.open(action_url, search_data)
  142. results_html = response.read()
  143. # This is for doing site specific html cleanup
  144. results_html = self._cleanupHTML(results_html)
  145. #some javascript garbage in the header upsets HTMLParser,
  146. #so we'll just have the body
  147. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  148. #outfile = open(self.authority_short_name + ".debug", "w")
  149. #outfile.write(just_body)
  150. self.feed(just_body)
  151. return self._results
  152. def getResults(self, day, month, year):
  153. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  154. class BaberghParser(AcolnetParser):
  155. #search_url = "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  156. case_number_tr = 1 # this one can be got by the td class attribute
  157. reg_date_tr = 2
  158. location_tr = 4
  159. proposal_tr = 5
  160. #authority_name = "Babergh District Council"
  161. #authority_short_name = "Babergh"
  162. # It would be nice to scrape this...
  163. comments_email_address = "planning.reception@babergh.gov.uk"
  164. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  165. class BasingstokeParser(AcolnetParser):
  166. #search_url = "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  167. case_number_tr = 1 # this one can be got by the td class attribute
  168. reg_date_tr = 3
  169. location_tr = 6
  170. proposal_tr = 8
  171. #authority_name = "Basingstoke and Deane Borough Council"
  172. #authority_short_name = "Basingstoke and Deane"
  173. # It would be nice to scrape this...
  174. comments_email_address = "development.control@basingstoke.gov.uk"
  175. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  176. class BassetlawParser(AcolnetParser):
  177. #search_url = "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  178. case_number_tr = 1 # this one can be got by the td class attribute
  179. reg_date_tr = 2
  180. location_tr = 5
  181. proposal_tr = 6
  182. #authority_name = "Bassetlaw District Council"
  183. #authority_short_name = "Bassetlaw"
  184. comments_email_address = "planning@bassetlaw.gov.uk"
  185. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  186. def _cleanupHTML(self, html):
  187. """There is a broken div in this page. We don't need any divs, so
  188. let's get rid of them all."""
  189. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  190. return div_regex.sub('', html)
  191. class BridgenorthParser(AcolnetParser):
  192. #search_url = "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  193. case_number_tr = 1 # this one can be got by the td class attribute
  194. reg_date_tr = 2
  195. location_tr = 4
  196. proposal_tr = 5
  197. #authority_name = "Bridgenorth District Council"
  198. #authority_short_name = "Bridgenorth"
  199. comments_email_address = "contactus@bridgnorth-dc.gov.uk"
  200. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  201. class BuryParser(AcolnetParser):
  202. #search_url = "http://e-planning.bury.gov.uk/ePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  203. case_number_tr = 1 # this one can be got by the td class attribute
  204. reg_date_tr = 2
  205. location_tr = 4
  206. proposal_tr = 5
  207. #authority_name = "Bury Metropolitan Borough Council"
  208. #authority_short_name = "Bury"
  209. comments_email_address = "development.control@bury.gov.uk"
  210. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  211. ## class CanterburyParser(AcolnetParser):
  212. ## search_url = "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  213. ## case_number_tr = 1 # this one can be got by the td class attribute
  214. ## reg_date_tr = 2
  215. ## location_tr = 4
  216. ## proposal_tr = 5
  217. ## authority_name = "Canterbury City Council"
  218. ## authority_short_name = "Canterbury"
  219. ## comments_email_address = ""
  220. ## action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">")
  221. class CarlisleParser(AcolnetParser):
  222. #search_url = "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch"
  223. case_number_tr = 1 # this one can be got by the td class attribute
  224. reg_date_tr = 2
  225. location_tr = 5
  226. proposal_tr = 6
  227. #authority_name = "Carlisle City Council"
  228. #authority_short_name = "Carlisle"
  229. comments_email_address = "dc@carlisle.gov.uk"
  230. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  231. class DerbyParser(AcolnetParser):
  232. #search_url = "http://195.224.106.204/scripts/planningpages02%5CXSLPagesDC_DERBY%5CDCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch"
  233. case_number_tr = 1 # this one can be got by the td class attribute
  234. reg_date_tr = 3
  235. location_tr = 4
  236. proposal_tr = 5
  237. #authority_name = "Derby City Council"
  238. #authority_short_name = "Derby"
  239. comments_email_address = "developmentcontrol@derby.gov.uk"
  240. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  241. class CroydonParser(AcolnetParser):
  242. case_number_tr = 1 # this one can be got by the td class attribute
  243. reg_date_tr = 3
  244. location_tr = 5
  245. proposal_tr = 6
  246. comments_email_address = "planning.control@croydon.gov.uk"
  247. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  248. class EastLindseyParser(AcolnetParser):
  249. case_number_tr = 1 # this one can be got by the td class attribute
  250. reg_date_tr = 3
  251. location_tr = 5
  252. proposal_tr = 6
  253. comments_email_address = "development.control@e-lindsey.gov.uk"
  254. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"return ValidateSearch\(\)\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  255. class FyldeParser(AcolnetParser):
  256. case_number_tr = 1 # this one can be got by the td class attribute
  257. reg_date_tr = 2
  258. location_tr = 4
  259. proposal_tr = 5
  260. comments_email_address = "planning@fylde.gov.uk"
  261. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">")
  262. class HarlowParser(AcolnetParser):
  263. case_number_tr = 1 # this one can be got by the td class attribute
  264. reg_date_tr = 2
  265. location_tr = 4
  266. proposal_tr = 5
  267. comments_email_address = "Planning.services@harlow.gov.uk"
  268. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  269. class HavantParser(AcolnetParser):
  270. case_number_tr = 1 # this one can be got by the td class attribute
  271. reg_date_tr = 2
  272. location_tr = 4
  273. proposal_tr = 5
  274. comments_email_address = "representations@havant.gov.uk"
  275. action_regex = re.compile("<form id=\"frmSearch\" onSubmit=\"\"return ValidateSearch\(\)\"\" theme=\"\"[theme]\"\" name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  276. class NorthHertfordshireParser(AcolnetParser):
  277. case_number_tr = 1 # this one can be got by the td class attribute
  278. reg_date_tr = 2
  279. location_tr = 4
  280. proposal_tr = 5
  281. comments_email_address = "planningcontrol@north-herts.gov.uk"
  282. action_regex = re.compile("<FORM name=\"frmSearch\" method=\"post\" action=\"([^\"]*)\" onSubmit=\"return ValidateSearch\(\)\" enctype=\"multipart/form-data\">", re.IGNORECASE)
  283. if __name__ == '__main__':
  284. day = 15
  285. month = 3
  286. year = 2007
  287. # working
  288. # parser = BasingstokeParser()
  289. #parser = BaberghParser("Babergh District Council", "Babergh", "http://planning.babergh.gov.uk/dataOnlinePlanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  290. # works with the divs stripped out
  291. #parser = BassetlawParser()
  292. # returns error 400 - bad request
  293. #parser = BridgenorthParser()
  294. # working
  295. #parser = BuryParser()
  296. # cambridgeshire is a bit different...
  297. # no advanced search page
  298. # canterbury
  299. # results as columns of one table
  300. # returns error 400 - bad request
  301. #parser = CarlisleParser()
  302. # working
  303. #parser = DerbyParser()
  304. parser = HavantParser("HavantBC", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  305. print parser.getResults(day, month, year)