Automatically exported from code.google.com/p/planningalerts
92 lines · 3.3 KiB
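The scraper below queries the Aberdeenshire Council planning search for a single day, then follows the "next" link through each page of results. For every application it records the council reference, info URL, address, postcode, description, comment URL and OSGB coordinates, and adds them to a PlanningAuthorityResults collection. It targets Python 2 (urllib2, print statement) and BeautifulSoup 3.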

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

comment_re = re.compile("Submit Comment")
mapref_re = re.compile("Map Ref")

import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText


class AberdeenshireParser:
    def __init__(self, *args):
        self.authority_name = "Aberdeenshire Council"
        self.authority_short_name = "Aberdeenshire"
        self.base_url = "http://www.aberdeenshire.gov.uk/planning/apps/search.asp?startDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&endDateSearch=%(day)s%%2F%(month)s%%2F%(year)s&Submit=Search"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_day = datetime.date(year, month, day)

        next = self.base_url % {"day": day,
                                "month": month,
                                "year": year,
                                }

        while next:
            # Now get the search page
            response = urllib2.urlopen(next)
            soup = BeautifulSoup.BeautifulSoup(response.read())

            trs = soup.table.findAll("tr")[1:]  # First one is just headers

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day

                application.council_reference = tr.a.string
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])

                tds = tr.findAll("td")

                application.address = ' '.join([x.replace("&nbsp;", " ").strip() for x in tds[2].contents if type(x) == BeautifulSoup.NavigableString and x.strip()])
                application.postcode = getPostcodeFromText(application.address)

                application.description = tds[4].string.replace("&nbsp;", " ").strip()

                # Get the info page in order to find the comment url
                # we could do this without a download if it wasn't for the
                # sector parameter - I wonder what that is?
                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup.BeautifulSoup(info_response.read())

                comment_navstring = info_soup.find(text=comment_re)

                if comment_navstring:
                    application.comment_url = urlparse.urljoin(self.base_url, info_soup.find(text=comment_re).parent['href'])
                else:
                    application.comment_url = "No Comments"

                # While we're at it, let's get the OSGB
                application.osgb_x, application.osgb_y = [x.strip() for x in info_soup.find(text=mapref_re).findNext("a").string.strip().split(",")]

                self._results.addApplication(application)

            next_element = soup.find(text="next").parent

            if next_element.name == 'a':
                next = urlparse.urljoin(self.base_url, next_element['href'])
            else:
                next = None

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


if __name__ == '__main__':
    parser = AberdeenshireParser()
    print parser.getResults(7, 8, 2008)
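For reference, a minimal usage sketch from another script. The module name Aberdeenshire is an assumption (the file name is not shown above), and a Python 2 environment with PlanningUtils importable is assumed, matching the code's use of urllib2 and the print statement.

# Minimal usage sketch; "Aberdeenshire" is a hypothetical module name.
from Aberdeenshire import AberdeenshireParser

parser = AberdeenshireParser()

# getResults() casts its arguments to int and returns the day's applications
# serialised as XML via PlanningAuthorityResults.displayXML().
print parser.getResults("7", "8", "2008")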