Automatically exported from code.google.com/p/planningalerts
Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
 
 
 
 
 
 

135 строки
5.1 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import datetime, time
  5. import cgi
  6. import cookielib
  7. cookie_jar = cookielib.CookieJar()
  8. from BeautifulSoup import BeautifulSoup
  9. from PlanningUtils import PlanningApplication, \
  10. PlanningAuthorityResults, \
  11. getPostcodeFromText
  12. #date_format = "%d-%m-%Y"
  13. date_format = "%d/%m/%Y"
  14. received_date_format = "%d %B %Y"
  15. import re
  16. # We're going to use this for a re.split
  17. # A whitespace char, "of" or "at" (case independent), and then a whitespace char.
  18. address_finder_re = re.compile("\s(?:of)|(?:at)\s", re.I)
  19. class HaltonParser:
  20. def __init__(self, *args):
  21. self.authority_name = "Halton Borough Council"
  22. self.authority_short_name = "Halton"
  23. self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"
  24. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  25. #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
  26. def getResultsByDayMonthYear(self, day, month, year):
  27. search_day = datetime.date(year, month, day)
  28. # It seems dates are interpreted as midnight on
  29. post_data = urllib.urlencode(
  30. [
  31. # ("CaseNo", ""),
  32. # ("AppName", ""),
  33. ("DateApValFrom", search_day.strftime(date_format)),
  34. ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
  35. # ("AdrsNo", ""),
  36. # ("StName", ""),
  37. # ("StTown", ""),
  38. ("DropWeekDate", "0"),#search_day.strftime(date_format)),
  39. ("DropAppealStatus", "0"),
  40. # ("DateAppealValFrom", ""),
  41. # ("DateAppealValTo", ""),
  42. ("PageSize", "10"),
  43. ("Action", "Search"),
  44. ]
  45. )
  46. request = urllib2.Request(self.base_url, post_data)
  47. while request:
  48. # Now get the search page
  49. # We need to deal with cookies, since pagination depends on them.
  50. cookie_jar.add_cookie_header(request)
  51. response = urllib2.urlopen(request)
  52. cookie_jar.extract_cookies(response, request)
  53. soup = BeautifulSoup(response.read())
  54. # This should find us each Case on the current page.
  55. caseno_strings = soup.findAll(text="Case No:")
  56. for caseno_string in caseno_strings:
  57. application = PlanningApplication()
  58. application.council_reference = caseno_string.findNext("td").string
  59. application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
  60. application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
  61. # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts.
  62. # As a first go, we'll try splitting the description on the last occurence of " of " or " at ".
  63. try:
  64. application.address = re.split(address_finder_re, application.description)[-1].strip()
  65. except IndexError:
  66. # If we can't find of or at, we'll just have the description again, it's better than nothing.
  67. application.address = application.description
  68. # We may as well get the postcode from the description rather than the address, in case things have gone wrong
  69. application.postcode = getPostcodeFromText(application.description)
  70. application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])
  71. # Now what to have as info url...
  72. # There is no way to link to a specific app, so we'll just have the search page.
  73. application.info_url = self.base_url
  74. self._results.addApplication(application)
  75. # Now we need to find the post data for the next page, if there is any.
  76. # Find the form with id "formNext", if there is one
  77. next_form = soup.find("form", id="formNext")
  78. if next_form is not None:
  79. action = next_form['action']
  80. # The HTML is borked - the inputs are outside the form, they are all
  81. # in a td which follows it.
  82. inputs = next_form.findNext("td").findAll("input")
  83. post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
  84. request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
  85. else:
  86. request = None
  87. return self._results
  88. def getResults(self, day, month, year):
  89. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  90. if __name__ == '__main__':
  91. parser = HaltonParser()
  92. print parser.getResults(4,8,2008)