Automatically exported from code.google.com/p/planningalerts
 
 
 
 
 
 

79 lines
2.5 KiB

"""
This is the scraper for Hampshire.

There appears to be no way to search by date received, so instead we go to
the "currently open for consultation" page and use whatever is listed there.
Pagination should not be a concern, as there are hardly any applications
on that page.
"""
  7. import urllib2
  8. import urllib
  9. import urlparse
  10. import datetime, time
  11. import cgi
  12. import re
  13. from BeautifulSoup import BeautifulSoup
  14. from PlanningUtils import PlanningApplication, \
  15. PlanningAuthorityResults, \
  16. getPostcodeFromText
  17. date_format = "%d/%m/%Y"
  18. class HampshireParser:
  19. def __init__(self, *args):
  20. self.authority_name = "Hampshire County Council"
  21. self.authority_short_name = "Hampshire"
  22. self.base_url = "http://www3.hants.gov.uk/planning/mineralsandwaste/planning-applications/applications/applications-open.htm"
  23. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  24. def getResultsByDayMonthYear(self, day, month, year):
  25. # Now get the search page
  26. response = urllib2.urlopen(self.base_url)
  27. soup = BeautifulSoup(response.read())
  28. trs = soup.table.table.findAll("tr", {"class": re.compile("(?:odd)|(?:even)")})
  29. for tr in trs:
  30. application = PlanningApplication()
  31. tds = tr.findAll("td")
  32. application.council_reference = tds[0].a.string.strip()
  33. application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
  34. application.address = tds[2].string.strip()
  35. application.postcode = getPostcodeFromText(application.address)
  36. application.description = tds[3].string.strip()
  37. # Fetch the info url in order to get the date received and the comment url
  38. info_response = urllib2.urlopen(application.info_url)
  39. info_soup = BeautifulSoup(info_response.read())
  40. application.date_received = datetime.datetime.strptime(info_soup.find(text=re.compile("\s*Received:\s*")).findNext("td").string.strip(), date_format).date()
  41. application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("input", value="Comment on this application").parent['action'])
  42. self._results.addApplication(application)
  43. return self._results
  44. def getResults(self, day, month, year):
  45. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  46. if __name__ == '__main__':
  47. parser = HampshireParser()
  48. print parser.getResults(21,5,2008)