Automatically exported from code.google.com/p/planningalerts

  1. """
  2. This is the screenscraper for Westminster City Council.
  3. I have just noticed that there is a PublicAccess underneath all this, but
  4. it only has the apps in for which they are accepting comments, so I think
  5. we may as well use this url and get the lot...
  6. This is the PublicAccess url:
  7. http://publicaccess.westminster.gov.uk/publicaccess/
  8. """
  9. import urllib2
  10. import urllib
  11. import urlparse
  12. import datetime, time
  13. import cgi
  14. from BeautifulSoup import BeautifulSoup
  15. from PlanningUtils import PlanningApplication, \
  16. PlanningAuthorityResults, \
  17. getPostcodeFromText
  18. date_format = "%d%%2F%m%%2F%Y"
  19. class WestminsterParser:
  20. def __init__(self, *args):
  21. self.authority_name = "City of Westminster"
  22. self.authority_short_name = "Westminster"
  23. self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
  24. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  25. def getResultsByDayMonthYear(self, day, month, year):
  26. search_day = datetime.date(year, month, day)
  27. # post_data = [
  28. # ("EFNO", ""),
  29. # ("STName", ""),
  30. # ("STNUMB", ""),
  31. # ("ADRSNO", ""),
  32. # ("WARD", "AllWards"),
  33. # ("AGT", ""),
  34. # ("ATCDE", "AllApps"),
  35. # ("DECDE", "AllDecs"),
  36. # ("DTErec", search_day.strftime(date_format)),
  37. # ("DTErecTo", search_day.strftime(date_format)),
  38. # ("DTEvalid", ""),
  39. # ("DTEvalidTo", ""),
  40. # ("APDECDE", "AllAppDecs"),
  41. # ("submit", "Start+Search"),
  42. # ]
  43. post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
  44. while post_data:
  45. # Now get the search page
  46. response = urllib2.urlopen(self.base_url, post_data)
  47. soup = BeautifulSoup(response.read())
  48. results_form = soup.find("form", {"name": "currentsearchresultsNext"})
  49. # Sort out the post_data for the next page, if there is one
  50. # If there is no next page then there will be no inputs in the form.
  51. # In this case, post_data will be '', which is false.
  52. post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
  53. # Each result has one link, and they are the only links in the form
  54. links = results_form.findAll("a")
  55. for link in links:
  56. application = PlanningApplication()
  57. application.date_received = search_day
  58. application.info_url = urlparse.urljoin(self.base_url, link['href'])
  59. application.council_reference = link.string.strip()
  60. application.address = link.findNext("td").string.strip()
  61. application.postcode = getPostcodeFromText(application.address)
  62. application.description = link.findNext("tr").findAll("td")[-1].string.strip()
  63. # To get the comment url, we're going to have to go to each info url :-(
  64. info_response = urllib2.urlopen(application.info_url)
  65. info_soup = BeautifulSoup(info_response)
  66. comment_nav_string = info_soup.find(text="Comment on this case")
  67. if comment_nav_string:
  68. application.comment_url = comment_nav_string.parent['href']
  69. else:
  70. application.comment_url = "No Comments"
  71. #http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
  72. self._results.addApplication(application)
  73. return self._results
  74. def getResults(self, day, month, year):
  75. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  76. if __name__ == '__main__':
  77. parser = WestminsterParser()
  78. print parser.getResults(1,8,2008)
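
The scraper above relies on a handful of names from PlanningUtils that are not shown on this page. As a rough guide, a minimal stand-in for that interface, inferred only from the calls made here, could look like the sketch below; the real PlanningUtils module in the planningalerts repository differs in detail, and the postcode regex and XML layout used here are assumptions, not the project's actual code.

    import re
    from xml.sax.saxutils import escape

    # Rough UK postcode pattern -- an assumption, not the project's real regex.
    _postcode_regex = re.compile(r"[A-Z]{1,2}[0-9R][0-9A-Z]? ?[0-9][A-Z]{2}")

    def getPostcodeFromText(text):
        # Return the first postcode-like string found in text, or None.
        match = _postcode_regex.search(text or "")
        if match:
            return match.group()
        return None

    class PlanningApplication:
        # Plain record of one planning application; the scraper fills these in.
        def __init__(self):
            self.council_reference = None
            self.address = None
            self.postcode = None
            self.description = None
            self.info_url = None
            self.comment_url = None
            self.date_received = None

    class PlanningAuthorityResults:
        # Collects applications for one authority and serialises them as XML.
        def __init__(self, authority_name, authority_short_name):
            self.authority_name = authority_name
            self.authority_short_name = authority_short_name
            self.applications = []

        def addApplication(self, application):
            self.applications.append(application)

        def displayXML(self):
            # Assumed output shape: one <planning> element wrapping the
            # applications collected so far.
            parts = ['<planning authority_name="%s" authority_short_name="%s">'
                     % (escape(self.authority_name), escape(self.authority_short_name))]
            for app in self.applications:
                parts.append("  <application>")
                parts.append("    <council_reference>%s</council_reference>" % escape(app.council_reference or ""))
                parts.append("    <address>%s</address>" % escape(app.address or ""))
                parts.append("    <postcode>%s</postcode>" % escape(app.postcode or ""))
                parts.append("    <description>%s</description>" % escape(app.description or ""))
                parts.append("    <info_url>%s</info_url>" % escape(app.info_url or ""))
                parts.append("    <comment_url>%s</comment_url>" % escape(app.comment_url or ""))
                parts.append("    <date_received>%s</date_received>" % app.date_received)
                parts.append("  </application>")
            parts.append("</planning>")
            return "\n".join(parts)

Keeping the XML serialisation inside PlanningAuthorityResults means each council scraper, Westminster included, only has to fill in PlanningApplication fields and hand them to addApplication.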