Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

AcolnetParser.py 18 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
16 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. #!/usr/local/bin/python
  2. import urllib2
  3. import urlparse
  4. from datetime import date
  5. # Use this when we have python 2.5
  6. #import datetime
  7. import re
  8. from BeautifulSoup import BeautifulSoup
  9. # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  10. import cookielib
  11. cookie_jar = cookielib.CookieJar()
  12. ################
  13. import MultipartPostHandler
  14. # this is not mine, or part of standard python (though it should be!)
  15. # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  16. from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  17. date_format = "%d/%m/%Y"
  18. #This is to get the system key out of the info url
  19. system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)
  20. # We allow the optional > for Bridgnorth, which doesn't have broken html
  21. end_head_regex = re.compile("</head>?", re.IGNORECASE)
  22. class AcolnetParser:
  23. received_date_label = "Registration Date:"
  24. received_date_format = "%d/%m/%Y"
  25. comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"
  26. # There is no online comment facility in these, so we provide an
  27. # appropriate email address instead
  28. comments_email_address = None
  29. # The optional amp; is to cope with Oldham, which seems to have started
  30. # quoting this url.
  31. action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
  32. def _getResultsSections(self, soup):
  33. """In most cases, there is a table per app."""
  34. return soup.findAll("table", {"class": "results-table"})
  35. def _getCouncilReference(self, app_table):
  36. # return app_table.findAll("a")[1].string.strip()
  37. return app_table.a.string.strip()
  38. def _getDateReceived(self, app_table):
  39. date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
  40. day, month, year = date_str.split('/')
  41. return date(int(year), int(month), int(day))
  42. # This will be better from python 2.5
  43. #return datetime.datetime.strptime(date_str, self.received_date_format)
  44. def _getAddress(self, app_table):
  45. return app_table.find(text="Location:").findNext("td").string.strip()
  46. def _getDescription(self, app_table):
  47. return app_table.find(text="Proposal:").findNext("td").string.strip()
  48. def _getInfoUrl(self, app_table):
  49. """Returns the info url for this app.
  50. We also set the system key on self._current_application,
  51. as we'll need that for the comment url.
  52. """
  53. url = app_table.a['href']
  54. self._current_application.system_key = system_key_regex.search(url).groups()[0]
  55. return urlparse.urljoin(self.base_url, url)
  56. def _getCommentUrl(self, app_table):
  57. """This must be run after _getInfoUrl"""
  58. if self.comments_email_address:
  59. return self.comments_email_address
  60. split_info_url = urlparse.urlsplit(self._current_application.info_url)
  61. comment_qs = self.comment_qs_template %self._current_application.system_key
  62. return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])
  63. def __init__(self,
  64. authority_name,
  65. authority_short_name,
  66. base_url,
  67. debug=False):
  68. self.authority_name = authority_name
  69. self.authority_short_name = authority_short_name
  70. self.base_url = base_url
  71. self.debug = debug
  72. # This in where we store the results
  73. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  74. # This will store the planning application we are currently working on.
  75. self._current_application = None
  76. def _cleanupHTML(self, html):
  77. """This method should be overridden in subclasses to perform site specific
  78. HTML cleanup."""
  79. return html
  80. def _getSearchResponse(self):
  81. # It looks like we sometimes need to do some stuff to get around a
  82. # javascript redirect and cookies.
  83. search_form_request = urllib2.Request(self.base_url)
  84. search_form_response = urllib2.urlopen(search_form_request)
  85. return search_form_response
  86. def getResultsByDayMonthYear(self, day, month, year):
  87. # first we fetch the search page to get ourselves some session info...
  88. search_form_response = self._getSearchResponse()
  89. search_form_contents = search_form_response.read()
  90. # This sometimes causes a problem in HTMLParser, so let's just get the link
  91. # out with a regex...
  92. groups = self.action_regex.search(search_form_contents).groups()
  93. action = groups[0]
  94. #print action
  95. # This is to handle the amp; which seems to have appeared in this
  96. # url on the Oldham site
  97. action = ''.join(action.split('amp;'))
  98. action_url = urlparse.urljoin(self.base_url, action)
  99. #print action_url
  100. our_date = date(year, month, day)
  101. search_data = {"regdate1": our_date.strftime(date_format),
  102. "regdate2": our_date.strftime(date_format),
  103. }
  104. opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
  105. response = opener.open(action_url, search_data)
  106. results_html = response.read()
  107. # This is for doing site specific html cleanup
  108. results_html = self._cleanupHTML(results_html)
  109. #some javascript garbage in the header upsets HTMLParser,
  110. #so we'll just have the body
  111. just_body = "<html>" + end_head_regex.split(results_html)[-1]
  112. #self.feed(just_body)
  113. soup = BeautifulSoup(just_body)
  114. # Each app is in a table of it's own.
  115. results_tables = self._getResultsSections(soup)
  116. for app_table in results_tables:
  117. self._current_application = PlanningApplication()
  118. self._current_application.council_reference = self._getCouncilReference(app_table)
  119. self._current_application.address = self._getAddress(app_table)
  120. # Get the postcode from the address
  121. self._current_application.postcode = getPostcodeFromText(self._current_application.address)
  122. self._current_application.description = self._getDescription(app_table)
  123. self._current_application.info_url = self._getInfoUrl(app_table)
  124. self._current_application.comment_url = self._getCommentUrl(app_table)
  125. self._current_application.date_received = self._getDateReceived(app_table)
  126. self._results.addApplication(self._current_application)
  127. return self._results
  128. def getResults(self, day, month, year):
  129. results = self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  130. # import pdb;pdb.set_trace()
  131. return results
  132. class BridgnorthParser(AcolnetParser):
  133. def _getResultsSections(self, soup):
  134. return soup.findAll("table", {"class": "app"})
  135. def _getCouncilReference(self, app_table):
  136. return app_table.a.string.split()[-1]
  137. def _getCommentUrl(self, app_table):
  138. """This must be run after _getInfoUrl"""
  139. #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
  140. return self._current_application.info_url.replace("NewPages", "PgeCommentForm")
  141. class BlackpoolParser(AcolnetParser):
  142. received_date_label = "Application Date:"
  143. def _getResultsSections(self, soup):
  144. return soup.findAll("table", {"class": "acolnet-results-table"})
  145. def _getCommentUrl(self, app_table):
  146. ref = self._getCouncilReference(app_table)
  147. return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/','%2F')
  148. class CanterburyParser(AcolnetParser):
  149. """Here the apps are one row each in a big table."""
  150. comments_email_address = "development.control@canterbury.gov.uk"
  151. def _getResultsSections(self, soup):
  152. return soup.find("table").findAll("tr")[1:]
  153. def _getDateReceived(self, app_table):
  154. date_str = app_table.findAll("td")[3].string.strip()
  155. day, month, year = date_str.split('/')
  156. return date(int(year), int(month), int(day))
  157. # This will be better once we have python 2.5
  158. #return datetime.datetime.strptime(date_str, self.received_date_format)
  159. def _getAddress(self, app_table):
  160. return app_table.findAll("td")[1].string.strip()
  161. def _getDescription(self, app_table):
  162. return app_table.findAll("td")[2].string.strip()
  163. class GreenwichParser(AcolnetParser):
  164. received_date_label = "Registration date:"
  165. comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"
  166. def _getInfoUrl(self, app_table):
  167. return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1)
# Kensington and Chelsea is sufficiently different that it may as well be handled separately.
  169. class MidBedsParser(AcolnetParser):
  170. def _getCouncilReference(self, app_table):
  171. return app_table.findAll("a")[1].string.strip()
  172. class OldhamParser(AcolnetParser):
  173. def _cleanupHTML(self, html):
  174. """There is a bad table end tag in this one.
  175. Fix it before we start"""
  176. bad_table_end = '</table summary="Copyright">'
  177. good_table_end = '</table>'
  178. return html.replace(bad_table_end, good_table_end)
  179. class SouthwarkParser(AcolnetParser):
  180. def _getDateReceived(self, app_table):
  181. date_str = ''.join(app_table.find(text="Statutory start date:").findNext("td").string.strip().split())
  182. day, month, year = date_str.split('/')
  183. return date(int(year), int(month), int(day))
  184. # Use this once we have python 2.5
  185. #return datetime.datetime.strptime(date_str, self.received_date_format)
class SurreyHeathParser(AcolnetParser):
    """Unfinished parser for Surrey Heath.

    As written, _getSearchResponse only dumps debugging output and returns
    None, so the inherited getResultsByDayMonthYear cannot use its result.
    """
    # This is not working yet.
    # _getSearchResponse is an attempt to work around
    # cookies and a javascript redirect.
    # I may have a bit more of a go at this at some point if I have time.

    # NOTE(review): presumably the indices of the table rows holding each
    # field; nothing in this class reads them yet - confirm before relying
    # on them.
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    # Used by the inherited _getCommentUrl in place of a comment url.
    comments_email_address = "development-control@surreyheath.gov.uk"

    def _getSearchResponse(self):
        """Fetch the search form, record its cookies and print the response.

        Debugging aid only: the final print reads the response body, and
        because the return statement is still commented out the method
        returns None.
        """
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)

        # Lying about the user-agent doesn't seem to help.
        #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")

        search_form_response = urllib2.urlopen(search_form_request)

        # Store any cookies set by the response in the module level cookie_jar.
        cookie_jar.extract_cookies(search_form_response, search_form_request)

        # Dump the final url, the headers and the body for inspection.
        print search_form_response.geturl()
        print search_form_response.info()

        print search_form_response.read()

        # Disabled attempt at following the javascript redirect by hand:
        # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
        # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")

        # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
        # javascript_redirect_request.add_header('Referer', validate_url)

        # cookie_jar.add_cookie_header(javascript_redirect_request)

        # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)

        # return javascript_redirect_response
  214. class BoltonLikeParser(AcolnetParser):
  215. """Note that Bolton has ceased to be BoltonLike with its latest change of url."""
  216. def _getCouncilReference(self, app_table):
  217. return app_table.findAll("a")[1].string.strip()
class LewishamParser(BoltonLikeParser):
    """BoltonLikeParser for Lewisham, with an email address for comments."""

    # AcolnetParser._getCommentUrl returns this address instead of building
    # a comment url whenever it is set.
    comments_email_address = "planning@lewisham.gov.uk"
  220. class BassetlawParser(AcolnetParser):
  221. comments_email_address = "planning@bassetlaw.gov.uk"
  222. def _cleanupHTML(self, html):
  223. """There is a broken div in this page. We don't need any divs, so
  224. let's get rid of them all."""
  225. div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
  226. return div_regex.sub('', html)
  227. class HarlowParser(AcolnetParser):
  228. def _getCommentUrl(self, app_table):
  229. """This must be run after _getInfoUrl"""
  230. #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
  231. return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&amp;hasreference=no")
# Manual test harness: when the module is run as a script, fetch the
# applications of one authority for a fixed date and print the result of
# getResults().
if __name__ == '__main__':
    # The date that is searched for.
    day = 12
    month = 6
    year = 2009

    # Known Acolnet sites.  Exactly one "parser = ..." line should be left
    # uncommented; swap the active line to try a different authority.
    #parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Barnet", "Barnet", "http://194.75.183.100/planning-cases/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Basingstoke", "Basingstoke", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = BassetlawParser("Bassetlaw", "Bassetlaw", "http://www.bassetlaw.gov.uk/planning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = AcolnetParser("Bolton", "Bolton", "http://www.planning.bolton.gov.uk/DCOnlineV2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Bury", "Bury", "http://e-planning.bury.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = CanterburyParser("Canterbury", "Canterbury", "http://www2.canterbury.gov.uk/planning/acolnetcgi.cgi?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    # parser = AcolnetParser("Carlisle", "Carlisle", "http://planning.carlisle.gov.uk/PlanData/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # NOTE(review): the next line passes a fourth positional argument, which
    # AcolnetParser.__init__ receives as `debug`.
    #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
    #parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # NOTE(review): no BoltonParser class is visible in this file (there is
    # a BoltonLikeParser) - check the name before uncommenting.
    #parser = BoltonParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Havant", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BoltonLikeParser("Hertsmere", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = LewishamParser("Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "http://planning.midsuffolk.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest National Park Authority", "New Forest NPA", "http://web01.newforestnpa.gov.uk/Pages3/AcolNetCGI.dcgov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("North Hertfordshire", "North Herts", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("North Wiltshire", "North Wilts", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
    #parser = BoltonLikeParser("Renfrewshire", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    # parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    # Run the search with the active parser and print what getResults returns.
    print parser.getResults(day, month, year)