Automatically exported from code.google.com/p/planningalerts
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 
 

218 linhas
8.7 KiB

  1. #!/usr/local/bin/python
  2. import urllib, urllib2
  3. import urlparse
  4. import datetime
  5. import re
  6. import BeautifulSoup
  7. import cookielib
  8. cookie_jar = cookielib.CookieJar()
  9. from PlanningUtils import fixNewlines, getPostcodeFromText, PlanningAuthorityResults, PlanningApplication
  10. def index_or_none(a_list, item):
  11. """
  12. Returns the index of item in a_list, or None, if it isn't in the list.
  13. """
  14. return a_list.count(item) and a_list.index(item)
  15. class PublicAccessParser(object):
  16. """This is the class which parses the PublicAccess search results page.
  17. """
  18. search_form_url_end = "DcApplication/application_searchform.aspx"
  19. search_results_url_end = "DcApplication/application_searchresults.aspx"
  20. comments_url_end = "DcApplication/application_comments_entryform.aspx"
  21. # For some sites (Hambleton, for example) we need to leave in the empty
  22. # strings.
  23. data_template = (
  24. ("searchtype", "ADV"),
  25. ("caseNo", ""),
  26. ("PPReference", ""),
  27. ("AltReference", ""),
  28. ("srchtype", ""),
  29. ("srchstatus", ""),
  30. ("srchdecision", ""),
  31. ("srchapstatus", ""),
  32. ("srchappealdecision", ""),
  33. ("srchwardcode", ""),
  34. ("srchparishcode", ""),
  35. ("srchagentdetails", ""),
  36. ("srchDateReceivedStart", "%(day)02d/%(month)02d/%(year)04d"),
  37. ("srchDateReceivedEnd", "%(day)02d/%(month)02d/%(year)04d"),
  38. ("srchDateValidStart", ""),
  39. ("srchDateValidEnd", ""),
  40. ("srchDateCommitteeStart", ""),
  41. ("srchDateCommitteeEnd", ""),
  42. )
  43. def __init__(self,
  44. authority_name,
  45. authority_short_name,
  46. base_url,
  47. debug=False):
  48. self.authority_name = authority_name
  49. self.authority_short_name = authority_short_name
  50. self.base_url = base_url
  51. self.debug = debug
  52. # The object which stores our set of planning application results
  53. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  54. def fetch_setting_cookie(self, url, data=None):
  55. request = urllib2.Request(url, data)
  56. cookie_jar.add_cookie_header(request)
  57. response = urllib2.urlopen(request)
  58. cookie_jar.extract_cookies(response, request)
  59. return response
  60. def get_search_page(self):
  61. return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_form_url_end))
  62. def get_response_1(self, data):
  63. return self.fetch_setting_cookie(urlparse.urljoin(self.base_url, self.search_results_url_end), data)
  64. # def get_data_2(self, day, month, year):
  65. # search_data2 = urllib.urlencode((("szSearchDescription","Applications received between %(day)02d/%(month)02d/%(year)d and %(day)02d/%(month)02d/%(year)d"%{"day":day ,"month": month ,"year": year}), ("searchType","ADV"), ("bccaseno",""), ("currentpage","1"), ("pagesize","100"), ("module","P3")))
  66. # if self.debug:
  67. # print search_data2
  68. # return search_data2
  69. # def get_response_2(self, data):
  70. # # This time we want to do a get request, so add the search data into the url
  71. # url = urlparse.urljoin(self.base_url, self.search_results_url_end + "?" + data)
  72. # return self.fetch_setting_cookie(url)
  73. def get_data_1(self, replacement_dict):
  74. # It seems urllib.urlencode isn't happy with the generator here,
  75. # so we'd best make it a tuple...
  76. data_tuple = tuple(((key, value %replacement_dict) for (key, value) in self.data_template))
  77. data = urllib.urlencode(data_tuple)
  78. return data
  79. def get_replacement_dict(self, day, month, year, search_response):
  80. return {"day": day,
  81. "month": month,
  82. "year": year}
  83. def get_useful_response(self, day, month, year):
  84. # We're only doing this to get a cookie
  85. search_response = self.get_search_page()
  86. replacement_dict = self.get_replacement_dict(day, month, year, search_response)
  87. data = self.get_data_1(replacement_dict)
  88. return self.get_response_1(data)
  89. def get_contents(self, day, month, year):
  90. useful_response = self.get_useful_response(day, month, year)
  91. contents = fixNewlines(useful_response.read())
  92. if self.debug:
  93. print contents
  94. return contents
  95. def getResultsByDayMonthYear(self, day, month, year):
  96. search_date = datetime.date(year, month, day)
  97. contents = self.get_contents(day, month, year)
  98. soup = BeautifulSoup.BeautifulSoup(contents)
  99. results_table = soup.find("table", {"class": "cResultsForm"})
  100. # First, we work out what column each thing of interest is in from the headings
  101. headings = [x.string for x in results_table.findAll("th")]
  102. ref_col = index_or_none(headings, "Application Ref.") or \
  103. index_or_none(headings, "Case Number") or \
  104. index_or_none(headings, "Application Number")
  105. address_col = headings.index("Address")
  106. description_col = headings.index("Proposal")
  107. comments_url = urlparse.urljoin(self.base_url, self.comments_url_end)
  108. for tr in results_table.findAll("tr")[1:]:
  109. application = PlanningApplication()
  110. application.date_received = search_date
  111. tds = tr.findAll(re.compile("t[dh]"))
  112. application.council_reference = tds[ref_col].string.strip()
  113. application.address = tds[address_col].string.strip()
  114. application.description = tds[description_col].string.strip()
  115. application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
  116. # We need the query string from this url to make the comments_url
  117. query_string = urlparse.urlsplit(application.info_url)[3]
  118. # This is probably slightly naughty, but I'm just going to add the querystring
  119. # on to the end manually
  120. application.comment_url = "%s?%s" %(comments_url, query_string)
  121. self._results.addApplication(application)
  122. return self._results
  123. def getResults(self, day, month, year):
  124. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  125. class HambletonParser(PublicAccessParser):
  126. data_template = PublicAccessParser.data_template + (("s8fid", "%(s8fid)s"),)
  127. def get_replacement_dict(self, day, month, year, search_response):
  128. replacement_dict = super(HambletonParser, self).get_replacement_dict(day, month, year, search_response)
  129. # We need an input s8fid from this.
  130. # BeautifulSoup doesn't like it, so we'll have to use a regex.
  131. search_contents = search_response.read()
  132. #<input type=hidden name="s8fid" value="112455787981" />
  133. s8fid_re = re.compile('<input[^>]*name="s8fid" value="(\d*)" />')
  134. replacement_dict["s8fid"] = s8fid_re.search(search_contents).groups()[0]
  135. return replacement_dict
  136. if __name__ == '__main__':
  137. day = 11
  138. month = 6
  139. year = 2009
  140. # parser = PublicAccessParser("East Northants", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", True)
  141. # parser = PublicAccessParser("Cherwell District Council", "Cherwell", "http://cherweb.cherwell-dc.gov.uk/publicaccess/tdc/", True)
  142. # parser = HambletonParser("Hambleton District Council", "Hambleton", "http://planning.hambleton.gov.uk/publicaccess/tdc/", True)
  143. # parser = PublicAccessParser("Durham City Council", "Durham", "http://publicaccess.durhamcity.gov.uk/publicaccess/tdc/", True)
  144. # parser = PublicAccessParser("Moray Council", "Moray", "http://public.moray.gov.uk/publicaccess/tdc/", True)
  145. # parser = PublicAccessParser("Sheffield City Council", "Sheffield", "http://planning.sheffield.gov.uk/publicaccess/tdc/")
  146. # parser = PublicAccessParser("London Borough of Barking and Dagenham", "Barking and Dagenham", "http://paweb.barking-dagenham.gov.uk/PublicAccess/tdc/")
  147. # parser = PublicAccessParser("Reading Borough Council", "Reading", "http://planning.reading.gov.uk/publicaccess/tdc/")
  148. # parser = PublicAccessParser("Lancaster City Council", "Lancaster", "http://planapps.lancaster.gov.uk/PublicAccess/tdc/")
  149. # parser = PublicAccessParser("Harrogate Borough Council", "Harrogate", "http://publicaccess.harrogate.gov.uk/publicaccess/tdc/")
  150. # parser = PublicAccessParser("West Lancashire District Council", "West Lancashire", "http://publicaccess.westlancsdc.gov.uk/PublicAccess/tdc/")
  151. parser = PublicAccessParser("Torbay Council", "Torbay", "http://www.torbay.gov.uk/publicaccess/tdc/")
  152. # parser = PublicAccessParser("Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", debug=True)
  153. print parser.getResults(day, month, year)