Automatically exported from code.google.com/p/planningalerts

AcolnetParser.py 15 KiB

#!/usr/local/bin/python

import urllib2
import urlparse
from datetime import date
import datetime
import re

# HTMLParser is the base class of AcolnetParser below.
import HTMLParser
from BeautifulSoup import BeautifulSoup

# Adding this to try to help Surrey Heath - Duncan 14/9/2007
import cookielib
cookie_jar = cookielib.CookieJar()
################

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

date_format = "%d/%m/%Y"

# This is to get the system key out of the info url
system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)

# The closing > is optional: some sites have broken html that leaves it out,
# while Bridgnorth's html is not broken and includes it.
end_head_regex = re.compile("</head>?", re.IGNORECASE)
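
# For example, system_key_regex pulls the numeric key out of a query string
# like "...&TheSystemkey=46958" (cf. the Bridgnorth comment url below),
# giving "46958".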

class AcolnetParser(HTMLParser.HTMLParser):
    received_date_format = "%d/%m/%Y"

    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    # The optional amp; is to cope with Oldham, which seems to have started
    # quoting this url.
    action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)
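
    # A hypothetical example of the kind of form tag action_regex captures
    # the action from:
    #   <form method="post" action="acolnetcgi.gov?ACTION=UNWRAP&RIPSESSION=XYZ123&RIPNAME=Root.pgesearch">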

    def _getResultsSections(self, soup):
        """In most cases, there is a table per app."""
        return soup.findAll("table", {"class": "results-table"})

    def _getCouncilReference(self, app_table):
        return app_table.a.string.strip()

    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text="Registration Date:").findNext("td").string.strip().split())
        return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        return app_table.find(text="Location:").findNext("td").string.strip()

    def _getDescription(self, app_table):
        return app_table.find(text="Proposal:").findNext("td").string.strip()

    def _getInfoUrl(self, app_table):
        """Returns the info url for this app.

        We also set the system key on self._current_application,
        as we'll need that for the comment url.
        """
        url = app_table.a['href']
        self._current_application.system_key = system_key_regex.search(url).groups()[0]
        return urlparse.urljoin(self.base_url, url)

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        if self.comments_email_address:
            return self.comments_email_address

        split_info_url = urlparse.urlsplit(self._current_application.info_url)
        comment_qs = self.comment_qs_template % self._current_application.system_key
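
        # e.g. (hypothetical host and page name) an info url of
        #   http://example.gov.uk/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeResultDetail&TheSystemkey=123
        # keeps its scheme, host and path but gets the comment query string:
        #   http://example.gov.uk/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=123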
        return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site
        specific HTML cleanup."""
        return html
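
    # See BassetlawParser and OldhamParser below for examples of site
    # specific cleanup.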

    def _getSearchResponse(self):
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        search_form_response = urllib2.urlopen(search_form_request)
        return search_form_response
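
    # SurreyHeathParser below overrides this to try to cope with cookies
    # and a javascript redirect.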

    def getResultsByDayMonthYear(self, day, month, year):
        # first we fetch the search page to get ourselves some session info...
        search_form_response = self._getSearchResponse()
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get
        # the link out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action

        # This is to handle the amp; which seems to have appeared in this
        # url on the Oldham site
        action = ''.join(action.split('amp;'))
        action_url = urlparse.urljoin(self.base_url, action)
        print action_url

        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }
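
        # regdate1/regdate2 above look like the start and end of the
        # registration date range, so using the same date for both should
        # restrict the search to a single day's applications.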

        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        # some javascript garbage in the header upsets HTMLParser,
        # so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]
        #self.feed(just_body)

        soup = BeautifulSoup(just_body)

        # Each app is in a table of its own.
        results_tables = self._getResultsSections(soup)

        for app_table in results_tables:
            self._current_application = PlanningApplication()
            self._current_application.council_reference = self._getCouncilReference(app_table)
            self._current_application.address = self._getAddress(app_table)
            # Get the postcode from the address
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)
            self._current_application.description = self._getDescription(app_table)
            self._current_application.info_url = self._getInfoUrl(app_table)
            self._current_application.comment_url = self._getCommentUrl(app_table)
            self._current_application.date_received = self._getDateReceived(app_table)
            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

class BassetlawParser(AcolnetParser):
    comments_email_address = "planning@bassetlaw.gov.uk"

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
        return div_regex.sub('', html)

class BridgnorthParser(AcolnetParser):
    def _getResultsSections(self, soup):
        return soup.findAll("table", {"class": "app"})

    def _getCouncilReference(self, app_table):
        return app_table.a.string.split()[-1]

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        # e.g. http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
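        # The info url contains "NewPages" where the comment form url has
        # "PgeCommentForm", so a simple substitution gets us there.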
        return self._current_application.info_url.replace("NewPages", "PgeCommentForm")

# Cambridgeshire, although an Acolnet site, is so different that it
# may as well be handled completely separately.

class CanterburyParser(AcolnetParser):
    """Here the apps are one row each in a big table."""

    def _getResultsSections(self, soup):
        return soup.find("table", {"class": "results-table"}).findAll("tr")[1:]

    def _getDateReceived(self, app_table):
        date_str = app_table.findAll("td")[3].string.strip()
        return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        return app_table.findAll("td")[1].string.strip()

    def _getDescription(self, app_table):
        return app_table.findAll("td")[2].string.strip()

# Kensington and Chelsea is sufficiently different that it may as well be
# handled separately.

# Mid Bedfordshire - there is an acolnet here, but you have to have a username
# and password to access it!

class OldhamParser(AcolnetParser):
    def _cleanupHTML(self, html):
        """There is a bad table end tag in this one.
        Fix it before we start."""
        bad_table_end = '</table summary="Copyright">'
        good_table_end = '</table>'
        return html.replace(bad_table_end, good_table_end)

class SouthwarkParser(AcolnetParser):
    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text="Statutory start date:").findNext("td").string.strip().split())
        return datetime.datetime.strptime(date_str, self.received_date_format)

class SurreyHeathParser(AcolnetParser):
    # This is not working yet.
    # _getSearchResponse is an attempt to work around
    # cookies and a javascript redirect.
    # I may have a bit more of a go at this at some point if I have time.
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    comments_email_address = "development-control@surreyheath.gov.uk"

    def _getSearchResponse(self):
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)

        # Lying about the user-agent doesn't seem to help.
        #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)

        print search_form_response.geturl()
        print search_form_response.info()
        print search_form_response.read()

#        validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
#        javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
#        javascript_redirect_request = urllib2.Request(javascript_redirect_url)
#        javascript_redirect_request.add_header('Referer', validate_url)
#        cookie_jar.add_cookie_header(javascript_redirect_request)
#        javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
#        return javascript_redirect_response

# Wychavon is rather different, and will need some thought. There is no
# advanced search page.

if __name__ == '__main__':
    day = 30
    month = 11
    year = 2007

    # Uncomment exactly one of the parsers below; parser must be defined
    # before the print at the bottom.
    parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Basingstoke", "Basingstoke", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BassetlawParser("Bassetlaw", "Bassetlaw", "http://www.bassetlaw.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Bolton", "Bolton", "http://www.planning.bolton.gov.uk/PlanningSearch/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Bury", "Bury", "http://e-planning.bury.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = CanterburyParser("Canterbury", "Canterbury", "http://planning.canterbury.gov.uk/scripts/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Carlisle", "Carlisle", "http://planning.carlisle.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
    #parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/PlanningSearch/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Havant", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Hertsmere", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "http://planning.midsuffolk.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest National Park Authority", "New Forest NPA", "http://web01.newforestnpa.gov.uk/planningpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("North Hertfordshire", "North Herts", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("North Wiltshire", "North Wilts", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
    #parser = AcolnetParser("Renfrewshire", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    print parser.getResults(day, month, year)