Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

337 regels
15 KiB

  1. #!/usr/local/bin/python
  import cookielib
  import datetime
  import HTMLParser
  import re
  import urllib2
  import urlparse
  from datetime import date

  from BeautifulSoup import BeautifulSoup

  # this is not mine, or part of standard python (though it should be!)
  # it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py
  import MultipartPostHandler

  from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

  # Adding this to try to help Surrey Heath - Duncan 14/9/2007
  cookie_jar = cookielib.CookieJar()
  ################
  # Format of the dates posted in the search form ("regdate1"/"regdate2").
  date_format = "%d/%m/%Y"

  # This is to get the system key out of the info url.
  # Raw string so that \d is passed to the regex engine untouched.
  system_key_regex = re.compile(r"TheSystemkey=(\d*)", re.IGNORECASE)

  # Matches the end of the document head.
  # We allow the optional > for Bridgnorth, which doesn't have broken html
  end_head_regex = re.compile(r"</head>?", re.IGNORECASE)
  class AcolnetParser(HTMLParser.HTMLParser):
      """Scraper for the planning application search of an Acolnet site.

      The search form at ``base_url`` is fetched to discover the session
      specific form action, a search for a single registration date is
      posted to it, and every application found in the results page is added
      to a PlanningAuthorityResults object.

      Individual councils are handled by subclasses which override the
      ``_get*`` hooks, ``_cleanupHTML`` or the class attributes below.
      """

      # strptime format of the received date shown in the results page.
      received_date_format = "%d/%m/%Y"

      # Query string of the comment form; %s is the application's system key.
      comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"

      # Subclasses for sites with no online comment facility set this to an
      # appropriate email address, which is then used instead of a comment url.
      comments_email_address = None

      # Captures the action attribute of the search form.
      # The optional amp; is to cope with Oldham, which seems to have started
      # quoting this url.
      action_regex = re.compile(r'<form[^>]*action="([^"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^"]*)"[^>]*>', re.IGNORECASE)

      def __init__(self,
                   authority_name,
                   authority_short_name,
                   base_url,
                   debug=False):
          """Store the site details and create an empty result set.

          authority_name and authority_short_name are passed on to
          PlanningAuthorityResults; base_url is the url of the search form.
          When debug is true the resolved search form action url is printed.
          """
          self.authority_name = authority_name
          self.authority_short_name = authority_short_name
          self.base_url = base_url

          self.debug = debug

          # This is where we store the results
          self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

          # This will store the planning application we are currently working on.
          self._current_application = None

      def _getResultsSections(self, soup):
          """Return the parts of the parsed page that each hold one application.

          In most cases, there is a table per app.
          """
          return soup.findAll("table", {"class": "results-table"})

      def _getCouncilReference(self, app_table):
          """Return the stripped text of the first link in the section."""
          return app_table.a.string.strip()

      def _getDateReceived(self, app_table):
          """Return the registration date of the application as a datetime."""
          cell_text = app_table.find(text="Registration Date:").findNext("td").string.strip()
          # Remove any whitespace inside the date before parsing it.
          date_str = ''.join(cell_text.split())
          return datetime.datetime.strptime(date_str, self.received_date_format)

      def _getAddress(self, app_table):
          """Return the stripped text of the cell following "Location:"."""
          return app_table.find(text="Location:").findNext("td").string.strip()

      def _getDescription(self, app_table):
          """Return the stripped text of the cell following "Proposal:"."""
          return app_table.find(text="Proposal:").findNext("td").string.strip()

      def _getInfoUrl(self, app_table):
          """Returns the info url for this app.

          We also set the system key on self._current_application,
          as we'll need that for the comment url.
          """
          url = app_table.a['href']
          self._current_application.system_key = system_key_regex.search(url).groups()[0]

          return urlparse.urljoin(self.base_url, url)

      def _getCommentUrl(self, app_table):
          """Return the comment url, or the comments email address if one is set.

          This must be run after _getInfoUrl, since it reads the info url and
          system key stored on self._current_application.
          """
          if self.comments_email_address:
              return self.comments_email_address

          split_info_url = urlparse.urlsplit(self._current_application.info_url)

          comment_qs = self.comment_qs_template % self._current_application.system_key

          # Same url as the info page, with only the query string replaced.
          return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])

      def _cleanupHTML(self, html):
          """This method should be overridden in subclasses to perform site specific
          HTML cleanup."""
          return html

      def _getSearchResponse(self):
          """Fetch the search form page and return the open response."""
          # It looks like we sometimes need to do some stuff to get around a
          # javascript redirect and cookies.
          search_form_request = urllib2.Request(self.base_url)
          search_form_response = urllib2.urlopen(search_form_request)

          return search_form_response

      def _getActionUrl(self, search_form_contents):
          """Return the absolute url which the search form should be posted to.

          Parsing the whole form sometimes causes problems, so the action
          is just pulled out with a regex.
          """
          action = self.action_regex.search(search_form_contents).groups()[0]

          # This is to handle the amp; which seems to have appeared in this
          # url on the Oldham site
          action = action.replace('amp;', '')

          return urlparse.urljoin(self.base_url, action)

      def getResultsByDayMonthYear(self, day, month, year):
          """Search for applications registered on the given date.

          day, month and year are integers. Every application found is added
          to the result set, which is returned.
          """
          # first we fetch the search page to get ourselves some session info...
          search_form_response = self._getSearchResponse()
          search_form_contents = search_form_response.read()

          action_url = self._getActionUrl(search_form_contents)

          # Only print diagnostics on request, so that they do not get mixed
          # in with the output of callers which print the results.
          if self.debug:
              print(action_url)

          # Search for a single day by using it as both ends of the range.
          formatted_date = date(year, month, day).strftime(date_format)
          search_data = {"regdate1": formatted_date,
                         "regdate2": formatted_date,
                         }

          opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
          response = opener.open(action_url, search_data)
          results_html = response.read()

          # This is for doing site specific html cleanup
          results_html = self._cleanupHTML(results_html)

          # some javascript garbage in the header upsets the parser,
          # so we'll just have the body
          just_body = "<html>" + end_head_regex.split(results_html)[-1]

          soup = BeautifulSoup(just_body)

          # Each app is in a section of its own.
          for app_table in self._getResultsSections(soup):
              application = PlanningApplication()
              # The hooks below read and write self._current_application.
              self._current_application = application

              application.council_reference = self._getCouncilReference(app_table)
              application.address = self._getAddress(app_table)

              # Get the postcode from the address
              application.postcode = getPostcodeFromText(application.address)

              application.description = self._getDescription(app_table)
              # The info url must be found before the comment url.
              application.info_url = self._getInfoUrl(app_table)
              application.comment_url = self._getCommentUrl(app_table)
              application.date_received = self._getDateReceived(app_table)

              self._results.addApplication(application)

          return self._results

      def getResults(self, day, month, year):
          """Return the results for the given date via displayXML().

          day, month and year may be anything accepted by int().
          """
          return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  class BassetlawParser(AcolnetParser):
      """Acolnet parser for Bassetlaw, where comments go to an email address."""

      comments_email_address = "planning@bassetlaw.gov.uk"

      def _cleanupHTML(self, html):
          """Return html with every opening and closing div tag removed.

          There is a broken div in this page. We don't need any divs, so
          they are all stripped out.
          """
          any_div_tag = re.compile("</?div[^>]*>", re.IGNORECASE)
          without_divs = any_div_tag.sub('', html)
          return without_divs
  class BridgnorthParser(AcolnetParser):
      """Acolnet parser for Bridgnorth, whose results are in tables of class "app"."""

      def _getResultsSections(self, soup):
          """Return every table of class "app" in the parsed page."""
          wanted_attrs = {"class": "app"}
          return soup.findAll("table", wanted_attrs)

      def _getCouncilReference(self, app_table):
          """Return the last whitespace separated word of the first link's text."""
          link_words = app_table.a.string.split()
          return link_words[-1]

      def _getCommentUrl(self, app_table):
          """Return the info url with "NewPages" swapped for "PgeCommentForm".

          This must be run after _getInfoUrl
          """
          # Example comment url:
          #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
          info_url = self._current_application.info_url
          return info_url.replace("NewPages", "PgeCommentForm")
  140. # Cambridgeshire, although an Acolnet site, is so different that it
  141. # may as well be handled completely separately.
  class CanterburyParser(AcolnetParser):
      """Here the apps are one row each in a big table."""

      def _getResultsSections(self, soup):
          """Return all rows of the results table except the first one."""
          results_table = soup.find("table", {"class": "results-table"})
          all_rows = results_table.findAll("tr")
          # The first row is not an application (presumably a header row).
          return all_rows[1:]

      def _getDateReceived(self, app_table):
          """Return the date held in the fourth cell of the row as a datetime."""
          cells = app_table.findAll("td")
          date_str = cells[3].string.strip()
          return datetime.datetime.strptime(date_str, self.received_date_format)

      def _getAddress(self, app_table):
          """Return the stripped text of the second cell of the row."""
          cells = app_table.findAll("td")
          return cells[1].string.strip()

      def _getDescription(self, app_table):
          """Return the stripped text of the third cell of the row."""
          cells = app_table.findAll("td")
          return cells[2].string.strip()
  # Kensington and Chelsea is sufficiently different that it may as well be handled separately.
  154. # Mid Bedfordshire - there is an acolnet here, but you have to have a username
  155. # and password to access it!
  class OldhamParser(AcolnetParser):
      """Acolnet parser for Oldham, which needs one malformed tag repaired."""

      def _cleanupHTML(self, html):
          """Return html with the bad table end tag replaced by a good one.

          The page closes a table with a tag carrying a summary attribute;
          fix it before we start parsing.
          """
          return html.replace('</table summary="Copyright">', '</table>')
  class SouthwarkParser(AcolnetParser):
      """Acolnet parser for Southwark, which labels the date "Statutory start date:"."""

      def _getDateReceived(self, app_table):
          """Return the statutory start date of the application as a datetime."""
          label = app_table.find(text="Statutory start date:")
          cell_text = label.findNext("td").string.strip()
          # Drop any whitespace inside the date before parsing it.
          date_str = ''.join(cell_text.split())
          return datetime.datetime.strptime(date_str, self.received_date_format)
  class SurreyHeathParser(AcolnetParser):
      """Unfinished Acolnet parser for Surrey Heath."""

      # This is not working yet.
      # _getSearchResponse is an attempt to work around
      # cookies and a javascript redirect.
      # I may have a bit more of a go at this at some point if I have time.

      case_number_tr = 1 # this one can be got by the td class attribute
      reg_date_tr = 2
      location_tr = 4
      proposal_tr = 5

      comments_email_address = "development-control@surreyheath.gov.uk"

      def _getSearchResponse(self):
          """Fetch the search page, store its cookies and print what came back.

          NOTE: nothing is returned yet (the intended return is still
          commented out below), so callers receive None.
          """
          # It looks like we sometimes need to do some stuff to get around a
          # javascript redirect and cookies.
          request = urllib2.Request(self.base_url)

          # Lying about the user-agent doesn't seem to help.
          #request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")

          response = urllib2.urlopen(request)

          # Remember any cookies the site sets in the module level jar.
          cookie_jar.extract_cookies(response, request)

          # Diagnostics: final url, headers, then the page itself.
          print(response.geturl())
          print(response.info())
          print(response.read())

          # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
          # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")

          # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
          # javascript_redirect_request.add_header('Referer', validate_url)

          # cookie_jar.add_cookie_header(javascript_redirect_request)

          # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)

          # return javascript_redirect_response
  195. # Wychavon is rather different, and will need some thought. There is no
  196. # advanced search page
  if __name__ == '__main__':
      # Manual test harness: scrape one site for one day and print the result.
      day = 30
      month = 11
      year = 2007

      # Uncomment exactly one of the lines below to choose the site to scrape.
      parser = None

      #parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Basingstoke", "Basingstoke", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = BassetlawParser("Bassetlaw", "Bassetlaw", "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Bolton", "Bolton", "http://www.planning.bolton.gov.uk/PlanningSearch/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      # parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
      #parser = AcolnetParser("Bury", "Bury", "http://e-planning.bury.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = CanterburyParser("Canterbury", "Canterbury", "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Carlisle", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
      #parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/PlanningSearch/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Havant", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Hertsmere", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
      #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "http://planning.midsuffolk.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("New Forest National Park Authority", "New Forest NPA", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("North Hertfordshire", "North Herts", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
      #parser = AcolnetParser("North Wiltshire", "North Wilts", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
      #parser = AcolnetParser("Renfrewshire", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
      #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
      #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
      #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

      # With every line above commented out there is nothing to run; say so
      # clearly rather than failing with a NameError on the print below.
      if parser is None:
          raise SystemExit("No parser selected: uncomment one of the parser lines in the __main__ block.")

      print(parser.getResults(day, month, year))