Automatically exported from code.google.com/p/planningalerts

AcolnetParser.py

#!/usr/local/bin/python

import urllib2
import urlparse
from datetime import date
# Use this when we have python 2.5
#import datetime
import re

from BeautifulSoup import BeautifulSoup

# Adding this to try to help Surrey Heath - Duncan 14/9/2007
import cookielib
cookie_jar = cookielib.CookieJar()
################

import MultipartPostHandler
# this is not mine, or part of standard python (though it should be!)
# it comes from http://pipe.scs.fsu.edu/PostHandler/MultipartPostHandler.py

from PlanningUtils import getPostcodeFromText, PlanningAuthorityResults, PlanningApplication

date_format = "%d/%m/%Y"

# This is to get the system key out of the info url
system_key_regex = re.compile("TheSystemkey=(\d*)", re.IGNORECASE)

# We allow the optional > for Bridgnorth, which doesn't have broken html
end_head_regex = re.compile("</head>?", re.IGNORECASE)

class AcolnetParser:
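    """Base parser for councils running the Acolnet planning search system.

    Subclasses override the _get* helpers (and _cleanupHTML) where a
    particular council's markup differs from the common case; results are
    collected into a PlanningAuthorityResults object.
    """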
    received_date_label = "Registration Date:"
    received_date_format = "%d/%m/%Y"

    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=%s"

    # There is no online comment facility in these, so we provide an
    # appropriate email address instead
    comments_email_address = None

    # The optional amp; is to cope with Oldham, which seems to have started
    # quoting this url.
    action_regex = re.compile("<form[^>]*action=\"([^\"]*ACTION=UNWRAP&(?:amp;)?RIPSESSION=[^\"]*)\"[^>]*>", re.IGNORECASE)

    def _getResultsSections(self, soup):
        """In most cases, there is a table per app."""
        return soup.findAll("table", {"class": "results-table"})

    def _getCouncilReference(self, app_table):
        # return app_table.findAll("a")[1].string.strip()
        return app_table.a.string.strip()

    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text=self.received_date_label).findNext("td").string.strip().split())
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))
        # This will be better from python 2.5
        #return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        return app_table.find(text="Location:").findNext("td").string.strip()

    def _getDescription(self, app_table):
        return app_table.find(text="Proposal:").findNext("td").string.strip()

    def _getInfoUrl(self, app_table):
        """Returns the info url for this app.

        We also set the system key on self._current_application,
        as we'll need that for the comment url.
        """
        url = app_table.a['href']
        self._current_application.system_key = system_key_regex.search(url).groups()[0]

        # This is the right way to do this, but it doesn't work in Python 2.5 as
        # it doesn't quite implement RFC 3986. This will work fine when we are on
        # Python 2.6
        # info_url = urlparse.urljoin(self.base_url, url)

        # In the meantime, we'll have to work around it. Let's assume url
        # is a query string
        split_base_url = urlparse.urlsplit(self.base_url)
        split_info_url = urlparse.urlsplit(url)
        info_url = urlparse.urlunsplit(split_base_url[:3] + (split_info_url.query,) + split_base_url[4:])
        return info_url

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        if self.comments_email_address:
            return self.comments_email_address

        split_info_url = urlparse.urlsplit(self._current_application.info_url)
        comment_qs = self.comment_qs_template % self._current_application.system_key
        return urlparse.urlunsplit(split_info_url[:3] + (comment_qs,) + split_info_url[4:])

    def __init__(self,
                 authority_name,
                 authority_short_name,
                 base_url,
                 debug=False):
        self.authority_name = authority_name
        self.authority_short_name = authority_short_name
        self.base_url = base_url
        self.debug = debug

        # This is where we store the results
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

        # This will store the planning application we are currently working on.
        self._current_application = None

    def _cleanupHTML(self, html):
        """This method should be overridden in subclasses to perform site specific
        HTML cleanup."""
        return html

    def _getSearchResponse(self):
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)
        search_form_response = urllib2.urlopen(search_form_request)
        return search_form_response

    def getResultsByDayMonthYear(self, day, month, year):
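        """Fetch and parse the applications registered on the given date.

        Fetches the search form first (for session info), posts the date as
        both ends of the registration date range, then collects each result
        into self._results and returns it.
        """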
        # first we fetch the search page to get ourselves some session info...
        search_form_response = self._getSearchResponse()
        search_form_contents = search_form_response.read()

        # This sometimes causes a problem in HTMLParser, so let's just get the link
        # out with a regex...
        groups = self.action_regex.search(search_form_contents).groups()
        action = groups[0]
        #print action

        # This is to handle the amp; which seems to have appeared in this
        # url on the Oldham site
        action = ''.join(action.split('amp;'))
        action_url = urlparse.urljoin(self.base_url, action)
        #print action_url

        our_date = date(year, month, day)
        search_data = {"regdate1": our_date.strftime(date_format),
                       "regdate2": our_date.strftime(date_format),
                       }

        opener = urllib2.build_opener(MultipartPostHandler.MultipartPostHandler)
        response = opener.open(action_url, search_data)
        results_html = response.read()

        # This is for doing site specific html cleanup
        results_html = self._cleanupHTML(results_html)

        # some javascript garbage in the header upsets HTMLParser,
        # so we'll just have the body
        just_body = "<html>" + end_head_regex.split(results_html)[-1]
        #self.feed(just_body)

        soup = BeautifulSoup(just_body)

        # Each app is in a table of its own.
        results_tables = self._getResultsSections(soup)

        for app_table in results_tables:
            self._current_application = PlanningApplication()
            self._current_application.council_reference = self._getCouncilReference(app_table)
            self._current_application.address = self._getAddress(app_table)

            # Get the postcode from the address
            self._current_application.postcode = getPostcodeFromText(self._current_application.address)

            self._current_application.description = self._getDescription(app_table)
            self._current_application.info_url = self._getInfoUrl(app_table)
            self._current_application.comment_url = self._getCommentUrl(app_table)
            self._current_application.date_received = self._getDateReceived(app_table)

            self._results.addApplication(self._current_application)

        return self._results

    def getResults(self, day, month, year):
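        """Convenience wrapper: return the day's results as XML."""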
        results = self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
        # import pdb;pdb.set_trace()
        return results

class BridgnorthParser(AcolnetParser):
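    """Bridgnorth uses a different results table class and its own
    council reference and comment url formats."""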
    def _getResultsSections(self, soup):
        return soup.findAll("table", {"class": "app"})

    def _getCouncilReference(self, app_table):
        return app_table.a.string.split()[-1]

    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
        return self._current_application.info_url.replace("NewPages", "PgeCommentForm")

class BlackpoolParser(AcolnetParser):
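    """Blackpool labels the received date differently and takes comments
    via a separate form on www.blackpool.gov.uk."""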
    received_date_label = "Application Date:"

    def _getResultsSections(self, soup):
        return soup.findAll("table", {"class": "acolnet-results-table"})

    def _getCommentUrl(self, app_table):
        ref = self._getCouncilReference(app_table)
        return "https://www.blackpool.gov.uk/Services/M-R/PlanningApplications/Forms/PlanningNeighbourResponseForm.htm?Application_No=" + ref.replace('/', '%2F')

class CanterburyParser(AcolnetParser):
    """Here the apps are one row each in a big table."""
    comments_email_address = "development.control@canterbury.gov.uk"

    def _getResultsSections(self, soup):
        return soup.find("table").findAll("tr")[1:]

    def _getDateReceived(self, app_table):
        date_str = app_table.findAll("td")[3].string.strip()
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))
        # This will be better once we have python 2.5
        #return datetime.datetime.strptime(date_str, self.received_date_format)

    def _getAddress(self, app_table):
        return app_table.findAll("td")[1].string.strip()

    def _getDescription(self, app_table):
        return app_table.findAll("td")[2].string.strip()

class GreenwichParser(AcolnetParser):
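    """Greenwich uses a neighbour comment form and needs the script name
    putting back into the info url."""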
    received_date_label = "Registration date:"
    comment_qs_template = "ACTION=UNWRAP&RIPNAME=Root.PgeCommentNeighbourForm&TheSystemkey=%s"

    def _getInfoUrl(self, app_table):
        return AcolnetParser._getInfoUrl(self, app_table).replace('/?', '/acolnetcgi.gov?', 1)

# Kensington and Chelsea is sufficiently different, it may as well be handled separately

class MidBedsParser(AcolnetParser):
    def _getCouncilReference(self, app_table):
        return app_table.findAll("a")[1].string.strip()

class OldhamParser(AcolnetParser):
    def _cleanupHTML(self, html):
        """There is a bad table end tag in this one.
        Fix it before we start."""
        bad_table_end = '</table summary="Copyright">'
        good_table_end = '</table>'
        return html.replace(bad_table_end, good_table_end)

class SouthwarkParser(AcolnetParser):
    def _getDateReceived(self, app_table):
        date_str = ''.join(app_table.find(text="Statutory start date:").findNext("td").string.strip().split())
        day, month, year = date_str.split('/')
        return date(int(year), int(month), int(day))
        # Use this once we have python 2.5
        #return datetime.datetime.strptime(date_str, self.received_date_format)

class SurreyHeathParser(AcolnetParser):
    # This is not working yet.
    # _getSearchResponse is an attempt to work around
    # cookies and a javascript redirect.
    # I may have a bit more of a go at this at some point if I have time.
    case_number_tr = 1 # this one can be got by the td class attribute
    reg_date_tr = 2
    location_tr = 4
    proposal_tr = 5

    comments_email_address = "development-control@surreyheath.gov.uk"

    def _getSearchResponse(self):
        # It looks like we sometimes need to do some stuff to get around a
        # javascript redirect and cookies.
        search_form_request = urllib2.Request(self.base_url)

        # Lying about the user-agent doesn't seem to help.
        #search_form_request.add_header("user-agent", "Mozilla/5.0 (compatible; Konqu...L/3.5.6 (like Gecko) (Kubuntu)")
        search_form_response = urllib2.urlopen(search_form_request)
        cookie_jar.extract_cookies(search_form_response, search_form_request)

        print search_form_response.geturl()
        print search_form_response.info()
        print search_form_response.read()

        # validate_url = "https://www.public.surreyheath-online.gov.uk/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/Validate.asp"
        # javascript_redirect_url = urlparse.urljoin(self.base_url, "/whalecom7cace3215643e22bb7b0b8cc97a7/whalecom0/InternalSite/RedirectToOrigURL.asp?site_name=public&secure=1")
        # javascript_redirect_request = urllib2.Request(javascript_redirect_url)
        # javascript_redirect_request.add_header('Referer', validate_url)
        # cookie_jar.add_cookie_header(javascript_redirect_request)
        # javascript_redirect_response = urllib2.urlopen(javascript_redirect_request)
        # return javascript_redirect_response

class BoltonLikeParser(AcolnetParser):
    """Note that Bolton has ceased to be BoltonLike with its latest change of url."""

    def _getCouncilReference(self, app_table):
        return app_table.findAll("a")[1].string.strip()

class LewishamParser(BoltonLikeParser):
    comments_email_address = "planning@lewisham.gov.uk"

class BassetlawParser(AcolnetParser):
    comments_email_address = "planning@bassetlaw.gov.uk"

    def _cleanupHTML(self, html):
        """There is a broken div in this page. We don't need any divs, so
        let's get rid of them all."""
        div_regex = re.compile("</?div[^>]*>", re.IGNORECASE)
        return div_regex.sub('', html)

class HarlowParser(AcolnetParser):
    def _getCommentUrl(self, app_table):
        """This must be run after _getInfoUrl"""
        #http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeCommentForm&TheSystemkey=46958
        return self._current_application.info_url.replace("PgeResultDetail", "PgeCommentNeighbourForm&amp;hasreference=no")

if __name__ == '__main__':
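    # Ad-hoc test harness: set a date and uncomment one of the parser
    # lines below to scrape that council's site.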
    day = 21
    month = 9
    year = 2010

    #parser = AcolnetParser("Babergh", "Babergh", "http://planning.babergh.gov.uk/dcdatav2//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    parser = AcolnetParser("Barnet", "Barnet", "http://194.75.183.100/planning-cases/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Basingstoke", "Basingstoke", "http://planning.basingstoke.gov.uk/DCOnline2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BassetlawParser("Bassetlaw", "Bassetlaw", "http://www.bassetlaw.gov.uk/planning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Bolton", "Bolton", "http://www.planning.bolton.gov.uk/DCOnlineV2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BridgnorthParser("Bridgnorth", "Bridgnorth", "http://www2.bridgnorth-dc.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Bury", "Bury", "http://e-planning.bury.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = CanterburyParser("Canterbury", "Canterbury", "http://www2.canterbury.gov.uk/planning/acolnetcgi.cgi?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Carlisle", "Carlisle", "http://planning.carlisle.gov.uk/PlanData/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Croydon", "Croydon", "http://planning.croydon.gov.uk/DCWebPages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Derby", "Derby", "http://eplanning.derby.gov.uk/acolnet/planningpages02/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("East Lindsey", "East Lindsey", "http://www.e-lindsey.gov.uk/planning/AcolnetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser")
    #parser = AcolnetParser("Exeter City Council", "Exeter", "http://pub.exeter.gov.uk/scripts/Acolnet/dataonlineplanning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Stoke on Trent City Council", "Stoke", "http://www.planning.stoke.gov.uk/dataonlineplanning/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BoltonLikeParser("Fylde", "Fylde", "http://www2.fylde.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Guildford", "Guildford", "http://www.guildford.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Harlow", "Harlow", "http://planning.harlow.gov.uk/DLDC_Version_2/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Havant", "Havant", "http://www3.havant.gov.uk/scripts/planningpages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BoltonLikeParser("Hertsmere", "Hertsmere", "http://www2.hertsmere.gov.uk/ACOLNET/DCOnline//acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = LewishamParser("Lewisham", "Lewisham", "http://acolnet.lewisham.gov.uk/lewis-xslpagesdc/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("Mid Suffolk", "Mid Suffolk", "http://planning.midsuffolk.gov.uk/planning/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest District Council", "New Forest DC", "http://web3.newforest.gov.uk/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("New Forest National Park Authority", "New Forest NPA", "http://web01.newforestnpa.gov.uk/Pages3/AcolNetCGI.dcgov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("North Hertfordshire", "North Herts", "http://www.north-herts.gov.uk/dcdataonline/Pages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("North Wiltshire", "North Wilts", "http://planning.northwilts.gov.uk/DCOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = OldhamParser("Oldham", "Oldham", "http://planning.oldham.gov.uk/planning/AcolNetCGI.gov?ACTION=UNWRAP&Root=PgeSearch")
    #parser = BoltonLikeParser("Renfrewshire", "Renfrewshire", "http://planning.renfrewshire.gov.uk/acolnetDCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = AcolnetParser("South Bedfordshire", "South Bedfordshire", "http://planning.southbeds.gov.uk/plantech/DCWebPages/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.PgeSearch")
    #parser = SouthwarkParser("London Borough of Southwark", "Southwark", "http://planningonline.southwarksites.com/planningonline2/AcolNetCGI.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Suffolk Coastal", "Suffolk Coastal", "http://apps3.suffolkcoastal.gov.uk/DCDataV2/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = MidBedsParser("Mid Bedfordshire District Council", "Mid Beds", "http://www.midbeds.gov.uk/acolnetDC/DCpages/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Cambridgeshire County Council", "Cambridgeshire", "http://planapps2.cambridgeshire.gov.uk/DCWebPages/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("East Hampshire District Council", "East Hampshire", "http://planningdevelopment.easthants.gov.uk/dconline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = AcolnetParser("Stockport Metropolitan Borough Council", "Stockport", "http://planning.stockport.gov.uk/PlanningData/AcolNetCGI.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = BlackpoolParser("Blackpool Borough Council", "Blackpool", "http://www2.blackpool.gov.uk/PlanningApplications/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
    #parser = GreenwichParser("London Borough of Greenwich", "Greenwich", "http://onlineplanning.greenwich.gov.uk/acolnet/planningonline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

    print parser.getResults(day, month, year)