  1. #!/usr/local/bin/python
  2. import urllib2
  3. import urlparse
  4. from datetime import date
  5. # Use this when we have python 2.5
  6. #import datetime
  7. import re
  8. from BeautifulSoup import BeautifulSoup
  9. # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  10. import cookielib
  11. cookie_jar = cookielib.CookieJar()
  12. ################
  13. import MultipartPostHandler
  14. # this is not mine, or part of standard python (though it should be!)
  15. # it comes from
  16. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  17. date_format = "%d/%m/%Y"
  18. #This is to get the system key out of the info url
  19. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  20. # We allow the optional > for Bridgnorth, which doesn't have broken html
  21. end_head_regex = re.compile("</head>?", re.IGNORECASE)
  22. class AcolnetParser:
  23. received_date_label = "Registration Date:"
  24. received_date_format = "%d/%m/%Y"
  25. comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
  26. # There is no online comment facility in these, so we provide an
  27. # appropriate email address instead
  28. comments_email_address = None
  29. # The optional amp; is to cope with Oldham, which seems to have started
  30. # quoting this url.
  31. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  32. def _getResultsSections(self, soup):
  33. """In most cases, there is a table per app."""
  34. return soup.findAll("table", {"class": "results-table"})
  35. def _getCouncilReference(self, app_table):
  36. # return app_table.findAll("a")[1].string.strip()
  37. return app_table.a.string.strip()
  38. def _getDateReceived(self, app_table):
  39. date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
  40. day, month, year = date_str.split('/')
  41. return date(int(year), int(month), int(day))
  42. # This will be better from python 2.5
  43. #return datetime.datetime.strptime(date_str, self.received_date_format)
  44. def _getAddress(self, app_table):
  45. return app_table.find(text="Location:").findNext("td").string.strip()
  46. def _getDescription(self, app_table):
  47. return app_table.find(text="Proposal:").findNext("td").string.strip()
  48. def _getInfoUrl(self, app_table):
  49. """Returns the info url for this app.
  50. We also set the system key on self._current_application,
  51. as we'll need that for the comment url.
  52. """
  53. url = app_table.a['href']
  54. self._current_application.system_key =[0]
  55. # This is the right way to do this, but it doesn't work in Python 2.5 as
  56. # it doesn't quite implement RFC 3986. This will work fine when we are on
  57. # Python 2.6
  58. # info_url = urlparse.urljoin(self.base_url, url)
  59. # In the meantime, we'll have to work around it. Let's assume url
  60. # is a query string
  61. split_base_url = urlparse.urlsplit(self.base_url)
  62. split_info_url = urlparse.urlsplit(url)
  63. info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:])
  64. return info_url
  65. def _getCommentUrl(self, app_table):
  66. """This must be run after _getInfoUrl"""
  67. if self.comments_email_address:
  68. return self.comments_email_address
  69. split_info_url = urlparse.urlsplit(self._current_application.info_url)
  70. comment_qs = self.comment_qs_template %self._current_application.system_key
  71. return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])
  72. def __init__(self,
  73. authority_name,
  74. authority_short_name,
  75. base_url,
  76. debug=False):
  77. self.authority_name = authority_name
  78. self.authority_short_name = authority_short_name
  79. self.base_url = base_url
  80. self.debug = debug
  81. # This in where we store the results
  82. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  83. # This will store the planning application we are currently working on.
  84. self._current_application = None
  85. def _cleanupHTML(self, html):
  86. """This method should be overridden in subclasses to perform site specific
  87. HTML cleanup."""
  88. return html
  89. def _getSearchResponse(self):
  90. # It looks like we sometimes need to do some stuff to get around a
  91. # javascript redirect and cookies.
  92. search_form_request = urllib2.Request(self.base_url)
  93. search_form_response = urllib2.urlopen(search_form_request)
  94. return search_form_response
  95. def getResultsByDayMonthYear(self, day, month, year):
  96. # first we fetch the search page to get ourselves some session info...
  97. search_form_response = self._getSearchResponse()
  98. search_form_contents =
  99. # This sometimes causes a problem in HTMLParser, so let's just get the link
  100. # out with a regex...
  101. groups =
  102. action = groups[0]
  103. #print action
  104. # This is to handle the amp; which seems to have appeared in this
  105. # url on the Oldham site
  106. action = ''.join(action.split('amp;'))
  107. action_url = urlparse.urljoin(self.base_url, action)
  108. #print action_url
  109. our_date = date(year, month, day)
  110. search_data = {"regdate1": our_date.strftime(date_format),
  111. "regdate2": our_date.strftime(date_format),
  112. }
  113. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  114. response =, search_data)
  115. results_html =
  116. # This is for doing site specific html cleanup
  117. results_html = self._cleanupHTML(results_html)
  118. #some javascript garbage in the header upsets HTMLParser,
  119. #so we'll just have the body
  120. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  121. #self.feed(just_body)
  122. soup = BeautifulSoup(just_body)
  123. # Each app is in a table of it's own.
  124. results_tables = self._getResultsSections(soup)
  125. for app_table in results_tables:
  126. self._current_application = PlanningApplication()
  127. self._current_application.council_reference = self._getCouncilReference(app_table)
  128. self._current_application.address = self._getAddress(app_table)
  129. # Get the postcode from the address
  130. self._current_application.postcode = getPostcodeFromText(self._current_application.address)
  131. self._current_application.description = self._getDescription(app_table)
  132. self._current_application.info_url = self._getInfoUrl(app_table)
  133. self._current_application.comment_url = self._getCommentUrl(app_table)
  134. self._current_application.date_received = self._getDateReceived(app_table)
  135. self._results.addApplication(self._current_application)
  136. return self._results
  137. def getResults(self, day, month, year):
  138. results = self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  139. # import pdb;pdb.set_trace()
  140. return results
  141. class BridgnorthParser(AcolnetParser):
  142. def _getResultsSections(self, soup):
  143. return soup.findAll("table", {"class": "app"})
  144. def _getCouncilReference(self, app_table):
  145. return app_table.a.string.split()[-1]
  146. def _getCommentUrl(self, app_table):
  147. """This must be run after _getInfoUrl"""
  148. #
  149. return self._current_application.info_url.replace("NewPages", "PgeCommentForm")
  150. class BlackpoolParser(AcolnetParser):
  151. received_date_label = "Application Date:"
  152. def _getResultsSections(self, soup):
  153. return soup.findAll("table", {"class": "acolnet-results-table"})
  154. def _getCommentUrl(self, app_table):
  155. ref = self._getCouncilReference(app_table)
  156. return "" + ref.replace('/','%2F')
  157. class CanterburyParser(AcolnetParser):
  158. """Here the apps are one row each in a big table."""
  159. comments_email_address = ""
  160. def _getResultsSections(self, soup):
  161. return soup.find("table").findAll("tr")[1:]
  162. def _getDateReceived(self, app_table):
  163. date_str = app_table.findAll("td")[3].string.strip()
  164. day, month, year = date_str.split('/')
  165. return date(int(year), int(month), int(day))
  166. # This will be better once we have python 2.5
  167. #return datetime.datetime.strptime(date_str, self.received_date_format)
  168. def _getAddress(self, app_table):
  169. return app_table.findAll("td")[1].string.strip()
  170. def _getDescription(self, app_table):
  171. return app_table.findAll("td")[2].string.strip()
  172. class GreenwichParser(AcolnetParser):
  173. received_date_label = "Registration date:"
  174. comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"
  175. def _getInfoUrl(self, app_table):
  176. return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/', 1)
# Kensington and Chelsea is sufficiently different that it may as well be handled separately.
  178. class MidBedsParser(AcolnetParser):
  179. def _getCouncilReference(self, app_table):
  180. return app_table.findAll("a")[1].string.strip()
  181. class OldhamParser(AcolnetParser):
  182. def _cleanupHTML(self, html):
  183. """There is a bad table end tag in this one.
  184. Fix it before we start"""
  185. bad_table_end = '</table summary="Copyright">'
  186. good_table_end = '</table>'
  187. return html.replace(bad_table_end, good_table_end)
  188. class SouthwarkParser(AcolnetParser):
  189. def _getDateReceived(self, app_table):
  190. date_str = ''.join(app_table.find(text="Statutory start date:").findNext("td").string.strip().split())
  191. day, month, year = date_str.split('/')
  192. return date(int(year), int(month), int(day))
  193. # Use this once we have python 2.5
  194. #return datetime.datetime.strptime(date_str, self.received_date_format)
  195. class SurreyHeathParser(AcolnetParser):
  196. # This is not working yet.
  197. # _getSearchResponse is an attempt to work around
  198. # cookies and a javascript redirect.
  199. # I may have a bit more of a go at this at some point if I have time.
  200. case_number_tr = 1 # this one can be got by the td class attribute
  201. reg_date_tr = 2
  202. location_tr = 4
  203. proposal_tr = 5
  204. comments_email_address = ""
  205. def _getSearchResponse(self):
  206. # It looks like we sometimes need to do some stuff to get around a
  207. # javascript redirect and cookies.
  208. search_form_request = urllib2.Request(self.base_url)
  209. # Lying about the user-agent doesn't seem to help.
  210. #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
  211. search_form_response = urllib2.urlopen(search_form_request)
  212. cookie_jar.extract_cookies(search_form_response, search_form_request)
  213. print search_form_response.geturl()
  214. print
  215. print
  216. # validate_url = ""
  217. # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
  218. # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
  219. # javascript_redirect_request.add_header('Referer', validate_url)
  220. # cookie_jar.add_cookie_header(javascript_redirect_request)
  221. # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
  222. # return javascript_redirect_response
  223. class BoltonLikeParser(AcolnetParser):
  224. """Note that Bolton has ceased to be BoltonLike with its latest change of url."""
  225. def _getCouncilReference(self, app_table):
  226. return app_table.findAll("a")[1].string.strip()
  227. class LewishamParser(BoltonLikeParser):
  228. comments_email_address = ""
  229. class BassetlawParser(AcolnetParser):
  230. comments_email_address = ""
  231. def _cleanupHTML(self, html):
  232. """There is a broken div in this page. We don't need any divs, so
  233. let's get rid of them all."""
  234. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  235. return div_regex.sub('', html)
  236. class HarlowParser(AcolnetParser):
  237. def _getCommentUrl(self, app_table):
  238. """This must be run after _getInfoUrl"""
  239. #
  240. return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&amp;hasreference=no")
  241. if __name__ == '__main__':
  242. day = 6
  243. month = 8
  244. year = 2010
  245. #parser = AcolnetParser("Babergh", "Babergh", "")
  246. #parser = AcolnetParser("Barnet", "Barnet", "")
  247. #parser = AcolnetParser("Basingstoke", "Basingstoke", "")
  248. # parser = BassetlawParser("Bassetlaw", "Bassetlaw", "")
  249. # parser = AcolnetParser("Bolton", "Bolton", "")
  250. #parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "")
  251. #parser = AcolnetParser("Bury", "Bury", "")
  252. # parser = CanterburyParser("Canterbury", "Canterbury", "")
  253. # parser = AcolnetParser("Carlisle", "Carlisle", "")
  254. #parser = AcolnetParser("Croydon", "Croydon", "")
  255. #parser = AcolnetParser("Derby", "Derby", "")
  256. #parser = AcolnetParser("East Lindsey", "East Lindsey", "", "AcolnetParser")
  257. parser = AcolnetParser("Exeter City Council", "Exeter", "")
  258. # parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "")
  259. #parser = BoltonParser("Fylde", "Fylde", "")
  260. #parser = AcolnetParser("Guildford", "Guildford", "")
  261. #parser = AcolnetParser("Harlow", "Harlow", "")
  262. #parser = AcolnetParser("Havant", "Havant", "")
  263. #parser = BoltonLikeParser("Hertsmere", "Hertsmere", "")
  264. #parser = LewishamParser("Lewisham", "Lewisham", "")
  265. #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "")
  266. #parser = AcolnetParser("New Forest District Council", "New Forest DC", "")
  267. #parser = AcolnetParser("New Forest National Park Authority", "New Forest NPA", "")
  268. #parser = AcolnetParser("North Hertfordshire", "North Herts", "")
  269. #parser = AcolnetParser("North Wiltshire", "North Wilts", "")
  270. #parser = OldhamParser("Oldham", "Oldham", "")
  271. #parser = BoltonLikeParser("Renfrewshire", "Renfrewshire", "")
  272. #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "")
  273. #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "")
  274. #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "")
  275. # parser = AcolnetParser("Surrey Heath", "Surrey Heath", "")
  276. # parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "")
  277. # parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "")
  278. # parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "")
  279. # parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "")
  280. # parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "")
  281. # parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "")
  282. print parser.getResults(day, month, year)