Automatically exported from code.google.com/p/planningalerts

IsleOfWight.py

  1. """This is the planning application scraper for the Isle of Wight Council.
  2. In order to avoid this taking far too long, we've had take a few short cuts.
  3. In particular, we're not actually looking up the date each application is
  4. received. Instead, we're just using the date we first scraped it."""
import urllib2
import urllib
import urlparse
import datetime, time
import cgi

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"
class IsleOfWightParser:
    def __init__(self, *args):
        self.authority_name = "Isle of Wight Council"
        self.authority_short_name = "Isle of Wight"
        self.base_url = "http://www.iwight.com/council/departments/planning/appsdip/PlanAppSearch.aspx?__EVENTTARGET=lnkShowAll"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self):
        # Note that we don't take the day, month and year parameters here.

        # First get the search page.
        request = urllib2.Request(self.base_url)
        response = urllib2.urlopen(request)

        soup = BeautifulSoup(response.read())

        trs = soup.findAll("tr", {"class": "dbResults"})

        for tr in trs:
            application = PlanningApplication()

            tds = tr.findAll("td")

            application.council_reference = tds[0].a.contents[0].strip()
            application.address = tds[1].string.strip()
            application.postcode = getPostcodeFromText(application.address)
            application.description = tds[2].string.strip()

            application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
            # These bits have been commented out for performance reasons. We
            # can't afford to go to every application's details page ten times
            # a day while it is open. Instead, we'll just set the date_received
            # to be the scrape date. The comment url can be got by using the
            # id in the info url.
            application.date_received = datetime.datetime.today()

            relative_comment_url_template = "PlanAppComment.aspx?appId=%d"

            # Get the appId from the info_url
            app_id = int(cgi.parse_qs(urlparse.urlsplit(application.info_url)[3])['frmId'][0])
            application.comment_url = urlparse.urljoin(self.base_url, relative_comment_url_template % app_id)
            # # I'm afraid we're going to have to get each info url...
            # this_app_response = urllib2.urlopen(application.info_url)
            # this_app_soup = BeautifulSoup(this_app_response.read())

            # # If there is no received date, for some reason, we'll use the publicity date instead.
            # date_string = (this_app_soup.find("span", id="lblTrackRecievedDate") or this_app_soup.find("span", id="lblPubDate")).string
            # application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            # application.comment_url = urlparse.urljoin(self.base_url, this_app_soup.find("a", id="lnkMakeComment")['href'])
            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear().displayXML()

if __name__ == '__main__':
    parser = IsleOfWightParser()
    print parser.getResults(21, 5, 2008)

# http://www.iwight.com/council/departments/planning/appsdip/PlanAppSearch.aspx
# post data:
#__EVENTTARGET=lnkShowAll&__EVENTARGUMENT=&__VIEWSTATE=dDwtMTE4MjcxOTIzNjt0PDtsPGk8Mz47PjtsPHQ8O2w8aTw4Pjs%2BO2w8dDxAMDw7Ozs7Ozs7Ozs7Pjs7Pjs%2BPjs%2BPjs%2BBr0XINUf7K2BOMRwqIJMmHvc6bY%3D&txtTCP=&txtPIN=&txtAddress=
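
# Below is a minimal, untested sketch (not part of the original scraper) of how
# the same "show all" listing could be fetched by POSTing the form fields noted
# above, instead of passing __EVENTTARGET on the query string as base_url does.
# It assumes the __VIEWSTATE token is session-specific and so must be read from
# the hidden input on the search form first; the field names are taken from the
# post data comment above and may need checking against the live page.
def _fetch_show_all_results_by_post():
    search_url = "http://www.iwight.com/council/departments/planning/appsdip/PlanAppSearch.aspx"

    # Fetch the blank search form and pull out the ASP.NET __VIEWSTATE token.
    form_soup = BeautifulSoup(urllib2.urlopen(search_url).read())
    viewstate = form_soup.find("input", {"name": "__VIEWSTATE"})['value']

    # Post the form back with __EVENTTARGET set to the "show all" link.
    post_data = urllib.urlencode({"__EVENTTARGET": "lnkShowAll",
                                  "__EVENTARGUMENT": "",
                                  "__VIEWSTATE": viewstate,
                                  "txtTCP": "",
                                  "txtPIN": "",
                                  "txtAddress": "",
                                  })
    return BeautifulSoup(urllib2.urlopen(search_url, post_data).read())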