Automatically exported from code.google.com/p/planningalerts

Westminster.py 4.7 KiB

  1. """
  2. This is the screenscraper for Westminster City Council.
  3. I have just noticed that there is a PublicAccess underneath all this, but
  4. it only has the apps in for which they are accepting comments, so I think
  5. we may as well use this url and get the lot...
  6. This is the PublicAccess url:
  7. http://publicaccess.westminster.gov.uk/publicaccess/
  8. """
  9. import urllib2
  10. import urllib
  11. import urlparse
  12. import datetime, time
  13. import cgi
  14. import sys
  15. from BeautifulSoup import BeautifulSoup
  16. from PlanningUtils import PlanningApplication, \
  17. PlanningAuthorityResults, \
  18. getPostcodeFromText
  19. date_format = "%d%%2F%m%%2F%Y"
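# The doubled "%%" survives strftime as a literal "%", so for
# datetime.date(2008, 8, 1) this format yields "01%2F08%2F2008",
# i.e. "01/08/2008" with the slashes already URL-encoded for the
# POST body built below.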

class WestminsterParser:
    def __init__(self, *args):
        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        # post_data = [
        #     ("REFNO", ""),
        #     ("STName", ""),
        #     ("STNUMB", ""),
        #     ("ADRSNO", ""),
        #     ("WARD", "AllWards"),
        #     ("AGT", ""),
        #     ("ATCDE", "AllApps"),
        #     ("DECDE", "AllDecs"),
        #     ("DTErec", search_day.strftime(date_format)),
        #     ("DTErecTo", search_day.strftime(date_format)),
        #     ("DTEvalid", ""),
        #     ("DTEvalidTo", ""),
        #     ("APDECDE", "AllAppDecs"),
        #     ("submit", "Start+Search"),
        #     ]

        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" % {"date": search_day.strftime(date_format)}
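        # urllib.urlencode on the commented-out list above would percent-encode
        # the "%" in the pre-encoded date value (giving "%252F" instead of
        # "%2F"), which is presumably why the query string is assembled by
        # hand here instead.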
        while post_data:
            # Now get the search page
            sys.stderr.write("Fetching: %s\n" % self.base_url)
            sys.stderr.write("post data: %s\n" % post_data)

            response = urllib2.urlopen(self.base_url, post_data)
            sys.stderr.write("Got it\n")

            soup = BeautifulSoup(response.read())
            sys.stderr.write("Created soup\n")

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

            # Sort out the post_data for the next page, if there is one.
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.
            sys.stderr.write("Found form containing results\n")
            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form
            links = results_form.findAll("a")
            sys.stderr.write("Got list of links\n")

            for link in links:
                sys.stderr.write("Working on link: %s\n" % link['href'])

                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()
                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)
                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(
                sys.stderr.write("Fetching: %s\n" % application.info_url)
                info_response = urllib2.urlopen(application.info_url)
                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(info_response)

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    application.comment_url = "No Comments"
                # http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                self._results.addApplication(application)
                sys.stderr.write("Finished that link\n")

        sys.stderr.write("Finished while loop, returning stuff.\n")
        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = WestminsterParser()
    print parser.getResults(1, 8, 2008)
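
A minimal usage sketch beyond the __main__ block above (assumptions: Python 2 with BeautifulSoup 3 and PlanningUtils importable, and that displayXML() returns the results as an XML string, as the print statement suggests), fetching a different day and saving the output to a file:

from Westminster import WestminsterParser

parser = WestminsterParser()
xml = parser.getResults(15, 6, 2009)  # day, month, year
open("westminster-2009-06-15.xml", "w").write(xml)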