Automatically exported from code.google.com/p/planningalerts
  1. """
  2. This is the screenscraper for Westminster City Council.
  3. I have just noticed that there is a PublicAccess underneath all this, but
  4. it only has the apps in for which they are accepting comments, so I think
  5. we may as well use this url and get the lot...
  6. This is the PublicAccess url:
  7. http://publicaccess.westminster.gov.uk/publicaccess/
  8. """

import urllib
import urlparse

import pycurl
import StringIO

import datetime, time
import cgi

import sys

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
     PlanningAuthorityResults, \
     getPostcodeFromText

date_format = "%d%%2F%m%%2F%Y"
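# Note: the %%2F escapes survive strftime as a literal "%2F", so the date is
# already URL-encoded when it goes into the POST body (e.g. 01%2F08%2F2008).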

class WestminsterParser:
    def __init__(self, *args):

        self.authority_name = "City of Westminster"
        self.authority_short_name = "Westminster"
        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

#        post_data = [
#            ("REFNO", ""),
#            ("STName", ""),
#            ("STNUMB", ""),
#            ("ADRSNO", ""),
#            ("WARD", "AllWards"),
#            ("AGT", ""),
#            ("ATCDE", "AllApps"),
#            ("DECDE", "AllDecs"),
#            ("DTErec", search_day.strftime(date_format)),
#            ("DTErecTo", search_day.strftime(date_format)),
#            ("DTEvalid", ""),
#            ("DTEvalidTo", ""),
#            ("APDECDE", "AllAppDecs"),
#            ("submit", "Start+Search"),
#            ]

        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" % {"date": search_day.strftime(date_format)}
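
        # post_data doubles as the loop condition below: each pass through the
        # while loop replaces it with the urlencoded inputs of the "next page"
        # form, and it becomes '' (ending the loop) when there is no next page.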

        while post_data:

            # Now get the search page
            sys.stderr.write("Fetching: %s\n" % self.base_url)
            sys.stderr.write("post data: %s\n" % post_data)

            # This gives us something to use as the callback
            fakefile = StringIO.StringIO()

            curlobj = pycurl.Curl()
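
            # POST the search form; WRITEFUNCTION streams the response body into
            # fakefile, and FOLLOWLOCATION/MAXREDIRS follow up to ten redirects.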
            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.POST, True)
            curlobj.setopt(pycurl.POSTFIELDS, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.FOLLOWLOCATION, True)
            curlobj.setopt(pycurl.MAXREDIRS, 10)

            curlobj.perform()

            sys.stderr.write("Got it\n")
            soup = BeautifulSoup(fakefile.getvalue())

            # We may as well free up the memory used by fakefile
            fakefile.close()

            sys.stderr.write("Created soup\n")

            results_form = soup.find("form", {"name": "currentsearchresultsNext"})

            # Sort out the post_data for the next page, if there is one.
            # If there is no next page then there will be no inputs in the form.
            # In this case, post_data will be '', which is false.

            sys.stderr.write("Found form containing results\n")

            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])

            sys.stderr.write("Got post data\n")

            # Each result has one link, and they are the only links in the form
            links = results_form.findAll("a")

            sys.stderr.write("Got list of links\n")

            for link in links:
                sys.stderr.write("Working on link: %s\n" % link['href'])

                application = PlanningApplication()

                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, link['href'])
                application.council_reference = link.string.strip()

                application.address = link.findNext("td").string.strip()
                application.postcode = getPostcodeFromText(application.address)

                application.description = link.findNext("tr").findAll("td")[-1].string.strip()

                # To get the comment url, we're going to have to go to each info url :-(
                sys.stderr.write("Fetching: %s\n" % application.info_url)

                fakefile = StringIO.StringIO()
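
                # Reuse the same curl handle; setting HTTPGET switches the
                # request method back to GET after the earlier POST.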
                curlobj.setopt(pycurl.HTTPGET, True)
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                # We have to convert the info url to ascii for curl
                curlobj.setopt(pycurl.URL, application.info_url.encode("ascii"))

                curlobj.perform()

                sys.stderr.write("Got it\n")

                info_soup = BeautifulSoup(fakefile.getvalue())

                fakefile.close()

                comment_nav_string = info_soup.find(text="Comment on this case")
                if comment_nav_string:
                    application.comment_url = comment_nav_string.parent['href']
                else:
                    application.comment_url = "No Comments"
                    #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500

                self._results.addApplication(application)

                sys.stderr.write("Finished that link\n")

        sys.stderr.write("Finished while loop, returning stuff.\n")

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = WestminsterParser()
    print parser.getResults(1, 8, 2008)