Automatically exported from
25'ten fazla konu seçemezsiniz Konular bir harf veya rakamla başlamalı, kısa çizgiler ('-') içerebilir ve en fazla 35 karakter uzunluğunda olabilir.

124 satır
4.3 KiB

"""
This is the screenscraper for Westminster City Council.

I have just noticed that there is a PublicAccess site underneath all this,
but it only has the applications in it for which they are accepting
comments, so I think we may as well use this URL and get the lot...

This is the PublicAccess URL: (the URL is missing from this copy of the file)
"""
  9. import urllib2
  10. import urllib
  11. import urlparse
  12. import datetime, time
  13. import cgi
  14. import sys
  15. from BeautifulSoup import BeautifulSoup
  16. from PlanningUtils import PlanningApplication, \
  17. PlanningAuthorityResults, \
  18. getPostcodeFromText
  19. date_format = "%d%%2F%m%%2F%Y"
  20. class WestminsterParser:
  21. def __init__(self, *args):
  22. self.authority_name = "City of Westminster"
  23. self.authority_short_name = "Westminster"
  24. self.base_url = ""
  25. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  26. def getResultsByDayMonthYear(self, day, month, year):
  27. search_day =, month, day)
  28. # post_data = [
  29. # ("EFNO", ""),
  30. # ("STName", ""),
  31. # ("STNUMB", ""),
  32. # ("ADRSNO", ""),
  33. # ("WARD", "AllWards"),
  34. # ("AGT", ""),
  35. # ("ATCDE", "AllApps"),
  36. # ("DECDE", "AllDecs"),
  37. # ("DTErec", search_day.strftime(date_format)),
  38. # ("DTErecTo", search_day.strftime(date_format)),
  39. # ("DTEvalid", ""),
  40. # ("DTEvalidTo", ""),
  41. # ("APDECDE", "AllAppDecs"),
  42. # ("submit", "Start+Search"),
  43. # ]
  44. post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" %{"date": search_day.strftime(date_format)}
  45. while post_data:
  46. # Now get the search page
  47. sys.stderr.write("Fetching: %s" %self.base_url)
  48. sys.stderr.write("post data: %s" %post_data)
  49. response = urllib2.urlopen(self.base_url, post_data)
  50. sys.stderr.write("Got it")
  51. soup = BeautifulSoup(
  52. results_form = soup.find("form", {"name": "currentsearchresultsNext"})
  53. # Sort out the post_data for the next page, if there is one
  54. # If there is no next page then there will be no inputs in the form.
  55. # In this case, post_data will be '', which is false.
  56. post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
  57. # Each result has one link, and they are the only links in the form
  58. links = results_form.findAll("a")
  59. for link in links:
  60. application = PlanningApplication()
  61. application.date_received = search_day
  62. application.info_url = urlparse.urljoin(self.base_url, link['href'])
  63. application.council_reference = link.string.strip()
  64. application.address = link.findNext("td").string.strip()
  65. application.postcode = getPostcodeFromText(application.address)
  66. application.description = link.findNext("tr").findAll("td")[-1].string.strip()
  67. # To get the comment url, we're going to have to go to each info url :-(
  68. sys.stderr.write("Fetching: %s" %application.info_url)
  69. info_response = urllib2.urlopen(application.info_url)
  70. sys.stderr.write("Got it")
  71. info_soup = BeautifulSoup(info_response)
  72. comment_nav_string = info_soup.find(text="Comment on this case")
  73. if comment_nav_string:
  74. application.comment_url = comment_nav_string.parent['href']
  75. else:
  76. application.comment_url = "No Comments"
  77. #
  78. self._results.addApplication(application)
  79. return self._results
  80. def getResults(self, day, month, year):
  81. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  82. if __name__ == '__main__':
  83. parser = WestminsterParser()
  84. print parser.getResults(1,8,2008)