Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

135 lines
5.1 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import datetime, time
  5. import cgi
  6. import cookielib
  7. cookie_jar = cookielib.CookieJar()
  8. from BeautifulSoup import BeautifulSoup
  9. from PlanningUtils import PlanningApplication, \
  10. PlanningAuthorityResults, \
  11. getPostcodeFromText
  12. #date_format = "%d-%m-%Y"
  13. date_format = "%d/%m/%Y"
  14. received_date_format = "%d %B %Y"
  15. import re
  16. # We're going to use this for a re.split
  17. # A whitespace char, "of" or "at" (case independent), and then a whitespace char.
  18. address_finder_re = re.compile("\s(?:of)|(?:at)\s", re.I)
  19. class HaltonParser:
  20. def __init__(self, *args):
  21. self.authority_name = "Halton Borough Council"
  22. self.authority_short_name = "Halton"
  23. self.base_url = "http://www.halton.gov.uk/planningapps/index.asp"
  24. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  25. #CaseNo=&AgtName=&AppName=&DateApValFrom=&DateApValTo=&AdrsNo=&StName=&StTown=&DropWeekDate=28-08-2008&DropAppealStatus=0&DateAppealValFrom=&DateAppealValTo=&Action=Search
  26. def getResultsByDayMonthYear(self, day, month, year):
  27. search_day = datetime.date(year, month, day)
  28. # It seems dates are interpreted as midnight on
  29. post_data = urllib.urlencode(
  30. [
  31. # ("CaseNo", ""),
  32. # ("AppName", ""),
  33. ("DateApValFrom", search_day.strftime(date_format)),
  34. ("DateApValTo", (search_day + datetime.timedelta(1)).strftime(date_format)),
  35. # ("AdrsNo", ""),
  36. # ("StName", ""),
  37. # ("StTown", ""),
  38. ("DropWeekDate", "0"),#search_day.strftime(date_format)),
  39. ("DropAppealStatus", "0"),
  40. # ("DateAppealValFrom", ""),
  41. # ("DateAppealValTo", ""),
  42. ("PageSize", "10"),
  43. ("Action", "Search"),
  44. ]
  45. )
  46. request = urllib2.Request(self.base_url, post_data)
  47. while request:
  48. # Now get the search page
  49. # We need to deal with cookies, since pagination depends on them.
  50. cookie_jar.add_cookie_header(request)
  51. response = urllib2.urlopen(request)
  52. cookie_jar.extract_cookies(response, request)
  53. soup = BeautifulSoup(response.read())
  54. # This should find us each Case on the current page.
  55. caseno_strings = soup.findAll(text="Case No:")
  56. for caseno_string in caseno_strings:
  57. application = PlanningApplication()
  58. application.council_reference = caseno_string.findNext("td").string
  59. application.description = caseno_string.findNext(text="Details of proposal:").findNext("td").string.strip()
  60. application.date_received = datetime.datetime.strptime(caseno_string.findNext(text="Date Received").findNext("td").string, received_date_format).date()
  61. # The address here is included in the description. We'll have to do some heuristics to try to work out where it starts.
  62. # As a first go, we'll try splitting the description on the last occurence of " of " or " at ".
  63. try:
  64. application.address = re.split(address_finder_re, application.description)[-1].strip()
  65. except IndexError:
  66. # If we can't find of or at, we'll just have the description again, it's better than nothing.
  67. application.address = application.description
  68. # We may as well get the postcode from the description rather than the address, in case things have gone wrong
  69. application.postcode = getPostcodeFromText(application.description)
  70. application.comment_url = urlparse.urljoin(self.base_url, caseno_string.findNext("form")['action'])
  71. # Now what to have as info url...
  72. # There is no way to link to a specific app, so we'll just have the search page.
  73. application.info_url = self.base_url
  74. self._results.addApplication(application)
  75. # Now we need to find the post data for the next page, if there is any.
  76. # Find the form with id "formNext", if there is one
  77. next_form = soup.find("form", id="formNext")
  78. if next_form is not None:
  79. action = next_form['action']
  80. # The HTML is borked - the inputs are outside the form, they are all
  81. # in a td which follows it.
  82. inputs = next_form.findNext("td").findAll("input")
  83. post_data = urllib.urlencode([(x['name'], x['value']) for x in inputs])
  84. request = urllib2.Request(urlparse.urljoin(self.base_url, action), post_data)
  85. else:
  86. request = None
  87. return self._results
  88. def getResults(self, day, month, year):
  89. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  90. if __name__ == '__main__':
  91. parser = HaltonParser()
  92. print parser.getResults(4,8,2008)