Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

AcolnetParser.py 17 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
16 years ago
16 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364
  1. #!/usr/local/bin/python
  2. import urllib2
  3. import urlparse
  4. from datetime import date
  5. # Use this when we have python 2.5
  6. #import datetime
  7. import re
  8. from BeautifulSoup import BeautifulSoup
  9. # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  10. import cookielib
  11. cookie_jar = cookielib.CookieJar()
  12. ################
  13. import MultipartPostHandler
  14. # this is not mine, or part of standard python (though it should be!)
  15. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  16. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  17. date_format = "%d/%m/%Y"
  18. #This is to get the system key out of the info url
  19. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  20. # We allow the optional > for Bridgnorth, which doesn't have broken html
  21. end_head_regex = re.compile("</head>?", re.IGNORECASE)
  22. class AcolnetParser:
  23. received_date_format = "%d/%m/%Y"
  24. comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
  25. # There is no online comment facility in these, so we provide an
  26. # appropriate email address instead
  27. comments_email_address = None
  28. # The optional amp; is to cope with Oldham, which seems to have started
  29. # quoting this url.
  30. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  31. def _getResultsSections(self, soup):
  32. """In most cases, there is a table per app."""
  33. return soup.findAll("table", {"class": "results-table"})
  34. def _getCouncilReference(self, app_table):
  35. # return app_table.findAll("a")[1].string.strip()
  36. return app_table.a.string.strip()
  37. def _getDateReceived(self, app_table):
  38. date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split())
  39. day, month, year = date_str.split('/')
  40. return date(int(year), int(month), int(day))
  41. # This will be better from python 2.5
  42. #return datetime.datetime.strptime(date_str, self.received_date_format)
  43. def _getAddress(self, app_table):
  44. return app_table.find(text="Location:").findNext("td").string.strip()
  45. def _getDescription(self, app_table):
  46. return app_table.find(text="Proposal:").findNext("td").string.strip()
  47. def _getInfoUrl(self, app_table):
  48. """Returns the info url for this app.
  49. We also set the system key on self._current_application,
  50. as we'll need that for the comment url.
  51. """
  52. url = app_table.a['href']
  53. self._current_application.system_key = system_key_regex.search(url).groups()[0]
  54. return urlparse.urljoin(self.base_url, url)
  55. def _getCommentUrl(self, app_table):
  56. """This must be run after _getInfoUrl"""
  57. if self.comments_email_address:
  58. return self.comments_email_address
  59. split_info_url = urlparse.urlsplit(self._current_application.info_url)
  60. comment_qs = self.comment_qs_template %self._current_application.system_key
  61. return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])
  62. def __init__(self,
  63. authority_name,
  64. authority_short_name,
  65. base_url,
  66. debug=False):
  67. self.authority_name = authority_name
  68. self.authority_short_name = authority_short_name
  69. self.base_url = base_url
  70. self.debug = debug
  71. # This in where we store the results
  72. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  73. # This will store the planning application we are currently working on.
  74. self._current_application = None
  75. def _cleanupHTML(self, html):
  76. """This method should be overridden in subclasses to perform site specific
  77. HTML cleanup."""
  78. return html
  79. def _getSearchResponse(self):
  80. # It looks like we sometimes need to do some stuff to get around a
  81. # javascript redirect and cookies.
  82. search_form_request = urllib2.Request(self.base_url)
  83. search_form_response = urllib2.urlopen(search_form_request)
  84. return search_form_response
  85. def getResultsByDayMonthYear(self, day, month, year):
  86. # first we fetch the search page to get ourselves some session info...
  87. search_form_response = self._getSearchResponse()
  88. search_form_contents = search_form_response.read()
  89. # This sometimes causes a problem in HTMLParser, so let's just get the link
  90. # out with a regex...
  91. groups = self.action_regex.search(search_form_contents).groups()
  92. action = groups[0]
  93. #print action
  94. # This is to handle the amp; which seems to have appeared in this
  95. # url on the Oldham site
  96. action = ''.join(action.split('amp;'))
  97. action_url = urlparse.urljoin(self.base_url, action)
  98. #print action_url
  99. our_date = date(year, month, day)
  100. search_data = {"regdate1": our_date.strftime(date_format),
  101. "regdate2": our_date.strftime(date_format),
  102. }
  103. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  104. response = opener.open(action_url, search_data)
  105. results_html = response.read()
  106. # This is for doing site specific html cleanup
  107. results_html = self._cleanupHTML(results_html)
  108. #some javascript garbage in the header upsets HTMLParser,
  109. #so we'll just have the body
  110. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  111. #self.feed(just_body)
  112. soup = BeautifulSoup(just_body)
  113. # Each app is in a table of it's own.
  114. results_tables = self._getResultsSections(soup)
  115. for app_table in results_tables:
  116. self._current_application = PlanningApplication()
  117. self._current_application.council_reference = self._getCouncilReference(app_table)
  118. self._current_application.address = self._getAddress(app_table)
  119. # Get the postcode from the address
  120. self._current_application.postcode = getPostcodeFromText(self._current_application.address)
  121. self._current_application.description = self._getDescription(app_table)
  122. self._current_application.info_url = self._getInfoUrl(app_table)
  123. self._current_application.comment_url = self._getCommentUrl(app_table)
  124. self._current_application.date_received = self._getDateReceived(app_table)
  125. self._results.addApplication(self._current_application)
  126. return self._results
  127. def getResults(self, day, month, year):
  128. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  129. class BridgnorthParser(AcolnetParser):
  130. def _getResultsSections(self, soup):
  131. return soup.findAll("table", {"class": "app"})
  132. def _getCouncilReference(self, app_table):
  133. return app_table.a.string.split()[-1]
  134. def _getCommentUrl(self, app_table):
  135. """This must be run after _getInfoUrl"""
  136. #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
  137. return self._current_application.info_url.replace("NewPages", "PgeCommentForm")
  138. # Cambridgeshire, although an Acolnet site, is so different that it
  139. # may as well be handled completely separately.
  140. class CanterburyParser(AcolnetParser):
  141. """Here the apps are one row each in a big table."""
  142. def _getResultsSections(self, soup):
  143. return soup.find("table").findAll("tr")[1:]
  144. def _getDateReceived(self, app_table):
  145. date_str = app_table.findAll("td")[3].string.strip()
  146. day, month, year = date_str.split('/')
  147. return date(int(year), int(month), int(day))
  148. # This will be better once we have python 2.5
  149. #return datetime.datetime.strptime(date_str, self.received_date_format)
  150. def _getAddress(self, app_table):
  151. return app_table.findAll("td")[1].string.strip()
  152. def _getDescription(self, app_table):
  153. return app_table.findAll("td")[2].string.strip()
# Kensington and Chelsea is sufficiently different that it may as well be handled separately.
  155. # Mid Bedfordshire - there is an acolnet here, but you have to have a username
  156. # and password to access it!
  157. class OldhamParser(AcolnetParser):
  158. def _cleanupHTML(self, html):
  159. """There is a bad table end tag in this one.
  160. Fix it before we start"""
  161. bad_table_end = '</table summary="Copyright">'
  162. good_table_end = '</table>'
  163. return html.replace(bad_table_end, good_table_end)
  164. class SouthwarkParser(AcolnetParser):
  165. def _getDateReceived(self, app_table):
  166. date_str = ''.join(app_table.find(text="Statutory start date:").findNext("td").string.strip().split())
  167. day, month, year = date_str.split('/')
  168. return date(int(year), int(month), int(day))
  169. # Use this once we have python 2.5
  170. #return datetime.datetime.strptime(date_str, self.received_date_format)
  171. class SurreyHeathParser(AcolnetParser):
  172. # This is not working yet.
  173. # _getSearchResponse is an attempt to work around
  174. # cookies and a javascript redirect.
  175. # I may have a bit more of a go at this at some point if I have time.
  176. case_number_tr = 1 # this one can be got by the td class attribute
  177. reg_date_tr = 2
  178. location_tr = 4
  179. proposal_tr = 5
  180. comments_email_address = "development-control@surreyheath.gov.uk"
  181. def _getSearchResponse(self):
  182. # It looks like we sometimes need to do some stuff to get around a
  183. # javascript redirect and cookies.
  184. search_form_request = urllib2.Request(self.base_url)
  185. # Lying about the user-agent doesn't seem to help.
  186. #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
  187. search_form_response = urllib2.urlopen(search_form_request)
  188. cookie_jar.extract_cookies(search_form_response, search_form_request)
  189. print search_form_response.geturl()
  190. print search_form_response.info()
  191. print search_form_response.read()
  192. # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
  193. # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
  194. # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
  195. # javascript_redirect_request.add_header('Referer', validate_url)
  196. # cookie_jar.add_cookie_header(javascript_redirect_request)
  197. # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
  198. # return javascript_redirect_response
  199. # Wychavon is rather different, and will need some thought. There is no
  200. # advanced search page
  201. class BoltonLikeParser(AcolnetParser):
  202. """Note that Bolton has ceased to be BoltonLike with its latest change of url."""
  203. def _getCouncilReference(self, app_table):
  204. return app_table.findAll("a")[1].string.strip()
class LewishamParser(BoltonLikeParser):
    """BoltonLike parser for Lewisham, with comments going to an email address."""

    # A set comments_email_address is what AcolnetParser._getCommentUrl
    # returns in place of a comment url.
    comments_email_address = "planning@lewisham.com"
  207. class BassetlawParser(BoltonLikeParser):
  208. comments_email_address = "planning@bassetlaw.gov.uk"
  209. def _cleanupHTML(self, html):
  210. """There is a broken div in this page. We don't need any divs, so
  211. let's get rid of them all."""
  212. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  213. return div_regex.sub('', html)
  214. class HarlowParser(AcolnetParser):
  215. def _getCommentUrl(self, app_table):
  216. """This must be run after _getInfoUrl"""
  217. #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
  218. return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&amp;hasreference=no")
  219. if __name__ == '__main__':
  220. day = 4
  221. month = 4
  222. year = 2008
  223. #parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  224. #parser = AcolnetParser("Basingstoke", "Basingstoke", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  225. #parser = BassetlawParser("Bassetlaw", "Bassetlaw", "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  226. #parser = AcolnetParser("Bolton", "Bolton", "http://www.planning.bolton.gov.uk/DCOnlineV2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  227. #parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
  228. #parser = AcolnetParser("Bury", "Bury", "http://e-planning.bury.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  229. #parser = CanterburyParser("Canterbury", "Canterbury", "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  230. parser = BoltonLikeParser("Carlisle", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  231. #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  232. #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  233. #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
  234. #parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  235. #parser = BoltonParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  236. #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  237. #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  238. #parser = AcolnetParser("Havant", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  239. #parser = BoltonLikeParser("Hertsmere", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  240. #parser = LewishamParser("Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
  241. #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "http://planning.midsuffolk.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  242. #parser = BoltonLikeParser("New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  243. #parser = BoltonLikeParser("New Forest National Park Authority", "New Forest NPA", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  244. #parser = AcolnetParser("North Hertfordshire", "North Herts", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
  245. #parser = AcolnetParser("North Wiltshire", "North Wilts", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  246. #parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
  247. #parser = BoltonLikeParser("Renfrewshire", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
  248. #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
  249. #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  250. #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  251. #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  252. #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
  253. print parser.getResults(day, month, year)