Automatically exported from code.google.com/p/planningalerts
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Hampshire.py 2.5 KiB

16 jaren geleden
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. """
  2. This is the scraper for Hampshire.
  3. There appears to be no way to search by date received, so what we'll do is
  4. go to the currently open for consultation page and just use that.
  5. I don't think we need to worry about pagination, as there are hardly any.
  6. """
  7. import urllib2
  8. import urllib
  9. import urlparse
  10. import datetime, time
  11. import cgi
  12. import re
  13. from BeautifulSoup import BeautifulSoup
  14. from PlanningUtils import PlanningApplication, \
  15. PlanningAuthorityResults, \
  16. getPostcodeFromText
  # strptime pattern (day/month/year) used to parse the "Received:" date
  # scraped from each application's info page.
  17. date_format = "%d/%m/%Y"
  class HampshireParser:
      """Scraper for Hampshire County Council's open planning applications.

      The council site is read from a single "currently open for
      consultation" listing page; each listed application's own info page
      is then fetched to obtain the received date and the comment URL.
      """

      def __init__(self, *args):
          """Set the authority names, the listing URL and an empty result set.

          Any positional arguments are accepted and ignored.
          """
          self.authority_name = "Hampshire County Council"
          self.authority_short_name = "Hampshire"
          self.base_url = "http://www3.hants.gov.uk/planning/mineralsandwaste/planning-applications/applications/applications-open.htm"
          # NOTE(review): this single results object is reused by every call
          # to getResultsByDayMonthYear, so repeated calls on one instance
          # presumably add the same applications again -- verify against
          # PlanningAuthorityResults.addApplication.
          self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

      def _fetch_soup(self, url):
          """Download ``url`` and return it parsed with BeautifulSoup.

          The HTTP response is always closed, even if reading fails, so a
          run over many applications does not leave connections open.
          """
          response = urllib2.urlopen(url)
          try:
              page = response.read()
          finally:
              response.close()
          return BeautifulSoup(page)

      def getResultsByDayMonthYear(self, day, month, year):
          """Scrape every application on the listing page.

          ``day``, ``month`` and ``year`` are accepted for interface
          compatibility but are not used: all rows on the listing page are
          returned regardless of date.

          One extra HTTP request is made per application row.

          Returns the results object held in ``self._results`` with one
          PlanningApplication added per row.

          Raises whatever urllib2 raises on network errors, and
          AttributeError/IndexError/ValueError if a page does not have the
          expected structure or date format.
          """
          # Compile both patterns once instead of once per row.
          row_class = re.compile("(?:odd)|(?:even)")
          received_label = re.compile(r"\s*Received:\s*")

          # Now get the search page
          soup = self._fetch_soup(self.base_url)
          trs = soup.table.table.findAll("tr", {"class": row_class})

          for tr in trs:
              application = PlanningApplication()

              tds = tr.findAll("td")

              # Cell 0 holds the linked reference, cell 2 the address and
              # cell 3 the description; cell 1 is not used.
              application.council_reference = tds[0].a.string.strip()
              application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
              application.address = tds[2].string.strip()
              application.postcode = getPostcodeFromText(application.address)
              application.description = tds[3].string.strip()

              # Fetch the info url in order to get the date received and the comment url
              info_soup = self._fetch_soup(application.info_url)

              # The date is the text of the first <td> following the
              # "Received:" label.
              received_text = info_soup.find(text=received_label).findNext("td").string.strip()
              application.date_received = datetime.datetime.strptime(received_text, date_format).date()

              # The comment URL is the action of the form that wraps the
              # "Comment on this application" button.
              comment_button = info_soup.find("input", value="Comment on this application")
              application.comment_url = urlparse.urljoin(self.base_url, comment_button.parent['action'])

              self._results.addApplication(application)

          return self._results

      def getResults(self, day, month, year):
          """Return the scraped results rendered by ``displayXML()``.

          The three arguments are converted with ``int()`` (so numeric
          strings are accepted) and passed to getResultsByDayMonthYear.
          """
          return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  if __name__ == '__main__':
      # Manual smoke run: scrape the live site and dump the rendered output.
      # The parenthesised print emits the same text under Python 2.
      scraper = HampshireParser()
      print(scraper.getResults(21, 5, 2008))