Automatically exported from code.google.com/p/planningalerts
 
 
 
 
 
 

135 行
5.1 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import datetime, time
  5. import cgi
  6. import cookielib
  7. cookie_jar = cookielib.CookieJar()
  8. from BeautifulSoup import BeautifulSoup
  9. from PlanningUtils import PlanningApplication, \
  10. PlanningAuthorityResults, \
  11. getPostcodeFromText
  12. #date_format = "%d-%m-%Y"
  13. date_format = "%d/%m/%Y"
  14. received_date_format = "%d %B %Y"
  15. import re
  16. # We're going to use this for a re.split
  17. # A whitespace char, "of" or "at" (case independent), and then a whitespace char.
  18. address_finder_re = re.compile("\s(?:of)|(?:at)\s", re.I)
  19. class HaltonParser:
  20. def __init__(self, *args):
  21. self.authority_name = "Halton Borough Council"
  22. self.authority_short_name = "Halton"
  23. self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"
  24. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  25. #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
  26. def getResultsByDayMonthYear(self, day, month, year):
  27. search_day = datetime.date(year, month, day)
  28. # It seems dates are interpreted as midnight on
  29. post_data = urllib.urlencode(
  30. [
  31. # ("CaseNo", ""),
  32. # ("AppName", ""),
  33. ("DateApValFrom", search_day.strftime(date_format)),
  34. ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
  35. # ("AdrsNo", ""),
  36. # ("StName", ""),
  37. # ("StTown", ""),
  38. ("DropWeekDate", "0"),#search_day.strftime(date_format)),
  39. ("DropAppealStatus", "0"),
  40. # ("DateAppealValFrom", ""),
  41. # ("DateAppealValTo", ""),
  42. ("PageSize", "10"),
  43. ("Action", "Search"),
  44. ]
  45. )
  46. request = urllib2.Request(self.base_url, post_data)
  47. while request:
  48. # Now get the search page
  49. # We need to deal with cookies, since pagination depends on them.
  50. cookie_jar.add_cookie_header(request)
  51. response = urllib2.urlopen(request)
  52. cookie_jar.extract_cookies(response, request)
  53. soup = BeautifulSoup(response.read())
  54. # This should find us each Case on the current page.
  55. caseno_strings = soup.findAll(text="Case No:")
  56. for caseno_string in caseno_strings:
  57. application = PlanningApplication()
  58. application.council_reference = caseno_string.findNext("td").string
  59. application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
  60. application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
  61. # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts.
  62. # As a first go, we'll try splitting the description on the last occurence of " of " or " at ".
  63. try:
  64. application.address = re.split(address_finder_re, application.description)[-1].strip()
  65. except IndexError:
  66. # If we can't find of or at, we'll just have the description again, it's better than nothing.
  67. application.address = application.description
  68. # We may as well get the postcode from the description rather than the address, in case things have gone wrong
  69. application.postcode = getPostcodeFromText(application.description)
  70. application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])
  71. # Now what to have as info url...
  72. # There is no way to link to a specific app, so we'll just have the search page.
  73. application.info_url = self.base_url
  74. self._results.addApplication(application)
  75. # Now we need to find the post data for the next page, if there is any.
  76. # Find the form with id "formNext", if there is one
  77. next_form = soup.find("form", id="formNext")
  78. if next_form is not None:
  79. action = next_form['action']
  80. # The HTML is borked - the inputs are outside the form, they are all
  81. # in a td which follows it.
  82. inputs = next_form.findNext("td").findAll("input")
  83. post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
  84. request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
  85. else:
  86. request = None
  87. return self._results
  88. def getResults(self, day, month, year):
  89. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  90. if __name__ == '__main__':
  91. parser = HaltonParser()
  92. print parser.getResults(4,8,2008)