Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
преди 16 години
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209
  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import datetime, time
  5. import cookielib
# Module-level cookie store. It is filled from the response to the initial
# search-page GET and then used to add a Cookie header to the search POST
# and to any request created while following a redirect.
cookie_jar = cookielib.CookieJar()
  7. from BeautifulSoup import BeautifulSoup
  8. from PlanningUtils import PlanningApplication, \
  9. PlanningAuthorityResults, \
  10. getPostcodeFromText
  11. search_date_format = "%d-%m-%Y" # Format used for the accepted date when searching
  12. possible_date_formats = [search_date_format, "%d/%m/%Y"]
  13. class CookieAddingHTTPRedirectHandler(urllib2.HTTPRedirectHandler):
  14. """The standard python HttpRedirectHandler doesn't add a cookie to the new request after a 302. This handler does."""
  15. def redirect_request(self, req, fp, code, msg, headers, newurl):
  16. new_request = urllib2.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
  17. # We need to add a cookie from the cookie_jar
  18. cookie_jar.add_cookie_header(new_request)
  19. return new_request
# Opener built with CookieAddingHTTPRedirectHandler, so that requests created
# while following a redirect get a Cookie header from cookie_jar.
# OcellaParser sends its search POST through this opener.
cookie_handling_opener = urllib2.build_opener(CookieAddingHTTPRedirectHandler())
  21. class OcellaParser:
  22. received_date_format = search_date_format
  23. def __init__(self,
  24. authority_name,
  25. authority_short_name,
  26. base_url,
  27. debug=False):
  28. self.authority_name = authority_name
  29. self.authority_short_name = authority_short_name
  30. self.base_url = base_url
  31. self.debug = debug
  32. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  33. # These will be used to store the column numbers of the appropriate items in the results table
  34. self.reference_col = None
  35. self.address_col = None
  36. self.description_col = None
  37. self.received_date_col = None
  38. self.accepted_date_col = None
  39. def getResultsByDayMonthYear(self, day, month, year):
  40. search_date = datetime.date(year, month, day)
  41. # First get the search page
  42. get_request = urllib2.Request(self.base_url)
  43. get_response = urllib2.urlopen(get_request)
  44. cookie_jar.extract_cookies(get_response, get_request)
  45. get_soup = BeautifulSoup(get_response.read())
  46. # We need to find where the post action goes
  47. action = get_soup.form['action']
  48. try:
  49. session_id = get_soup.find('input', {'name': 'p_session_id'})['value']
  50. except TypeError:
  51. # In the case of Middlesbrough, there is no session cookie,
  52. # but it seems we don't need it...
  53. session_id = None
  54. # # From Breckland
  55. # p_object_name=FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01
  56. # p_instance=1
  57. # p_event_type=ON_CLICK
  58. # p_user_args=
  59. # p_session_id=53573
  60. # p_page_url=http%3A%2F%2Fwplan01.intranet.breckland.gov.uk%3A7778%2Fportal%2Fpage%3F_pageid%3D33%2C30988%26_dad%3Dportal%26_schema%3DPORTAL
  61. # FRM_WEEKLY_LIST.DEFAULT.START_DATE.01=02-06-2008
  62. # FRM_WEEKLY_LIST.DEFAULT.END_DATE.01=09-06-2008
  63. # FRM_WEEKLY_LIST.DEFAULT.PARISH.01=
  64. post_data = urllib.urlencode(
  65. [('p_object_name', 'FRM_WEEKLY_LIST.DEFAULT.SUBMIT_TOP.01'),
  66. ('p_instance', '1'),
  67. ('p_event_type', 'ON_CLICK'),
  68. ('p_user_args', ''),
  69. ('p_session_id', session_id),
  70. ('p_page_url', self.base_url),
  71. ('FRM_WEEKLY_LIST.DEFAULT.START_DATE.01', search_date.strftime(search_date_format)),
  72. ('FRM_WEEKLY_LIST.DEFAULT.END_DATE.01', search_date.strftime(search_date_format)),
  73. ('FRM_WEEKLY_LIST.DEFAULT.PARISH.01', ''),
  74. ]
  75. )
  76. post_request = urllib2.Request(action, post_data)
  77. cookie_jar.add_cookie_header(post_request)
  78. post_request.add_header('Referer', self.base_url)
  79. post_response = cookie_handling_opener.open(post_request)
  80. post_soup = BeautifulSoup(post_response.read())
  81. results_table = post_soup.find("table", summary="Printing Table Headers")
  82. trs = results_table.findAll("tr")
  83. # We'll use the headings in the first tr to find out what columns the address, description, etc are in.
  84. ths = trs[0].findAll("th")
  85. th_index = 0
  86. for th in ths:
  87. th_content = th.font.string.strip()
  88. if th_content == 'Reference' or th_content == 'Application Ref':
  89. self.reference_col = th_index
  90. elif th_content == 'Location':
  91. self.address_col = th_index
  92. elif th_content == 'Proposal':
  93. self.description_col = th_index
  94. elif th_content == 'Development Description':
  95. self.description_col = th_index
  96. elif th_content == 'Received Date' or th_content == 'Date Received':
  97. self.received_date_col = th_index
  98. elif th_content == 'Accepted Date':
  99. self.accepted_date_col = th_index
  100. th_index += 1
  101. # If there is a received date, we'll use that, otherwise, we'll have to settle for the accepted date.
  102. self.received_date_col = self.received_date_col or self.accepted_date_col
  103. # We want all the trs except the first one, which is just headers,
  104. # and the last, which is empty
  105. trs = trs[1:-1]
  106. for tr in trs:
  107. self._current_application = PlanningApplication()
  108. tds = tr.findAll("td")
  109. self._current_application.council_reference = (tds[self.reference_col].font.a or tds[self.reference_col].a.font).string.strip()
  110. date_string = tds[self.received_date_col]
  111. for possible_format in possible_date_formats:
  112. try:
  113. self._current_application.date_received = datetime.datetime(*(time.strptime(tds[self.received_date_col].font.string.strip(), possible_format)[0:6]))
  114. except ValueError:
  115. pass
  116. self._current_application.address = tds[self.address_col].font.string.strip()
  117. self._current_application.postcode = getPostcodeFromText(self._current_application.address)
  118. self._current_application.description = tds[self.description_col].font.string.strip()
  119. self._current_application.info_url = tds[self.reference_col].a['href']
  120. # This is what a comment url looks like
  121. # It seems to be no problem to remove the sessionid (which is in any case blank...)
  122. # I can't see a good way to avoid having to go to the info page to find the moduleid though.
  123. #http://wplan01.intranet.breckland.gov.uk:7778/pls/portal/PORTAL.wwa_app_module.link?p_arg_names=_moduleid&p_arg_values=8941787057&p_arg_names=_sessionid&p_arg_values=&p_arg_names=APPLICATION_REFERENCE&p_arg_values=3PL%2F2008%2F0877%2FF
  124. # For the moment, we'll just use the info url, as that seems to work.
  125. self._current_application.comment_url = self._current_application.info_url
  126. self._results.addApplication(self._current_application)
  127. return self._results
  128. def getResults(self, day, month, year):
  129. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Manual test harness: build a parser for one authority and print the
    # XML for a fixed date. The commented-out constructors below are
    # alternative authorities; the comment above each of the later ones
    # records the problem seen with it.
    # parser = OcellaParser("Arun", "Arun", "http://www.arun.gov.uk/iplanning/portal/page?_pageid=33,4139&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL")
    # parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL")

    # Bad status line? Try changing browser id string?
    # parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")

    # Post never comes back
    # parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")

    # Can't find the URL similar to the others, even though it is clearly Ocella
    # We get a 406 at the moment. Try browser id string?
    # NOTE(review): this is the only parser left enabled, and the comment
    # above says it currently fails with a 406 -- confirm this is intended.
    parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search")

    print parser.getResults(21,5,2008)
  147. #TODO
  148. # 1) Sort out proper comment url?
  149. # 2) Check for pagination
  150. # 3) Check no results case