Automatically exported from code.google.com/p/planningalerts
 
 
 
 
 
 

94 satır
3.7 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import datetime, time
  5. import cgi
  6. import re
  7. from BeautifulSoup import BeautifulSoup
  8. from PlanningUtils import PlanningApplication, \
  9. PlanningAuthorityResults, \
  10. getPostcodeFromText
  11. date_format = "%d/%m/%Y"
  12. class FlintshireParser:
  13. def __init__(self, *args):
  14. self.authority_name = "Flintshire County Council"
  15. self.authority_short_name = "Flintshire"
  16. # I've removed some extra variables from this, it seems to be happy without them, and now doesn't need to paginate...
  17. self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"
  18. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  19. def getResultsByDayMonthYear(self, day, month, year):
  20. search_date = datetime.date(year, month, day)
  21. # We'll set the start date to be one day earlier in order to catch the first result on every day at some point - see TODO list
  22. response = urllib2.urlopen(self.base_url %{"end_date": search_date.strftime(date_format),
  23. "start_date": (search_date - datetime.timedelta(1)).strftime(date_format)})
  24. soup = BeautifulSoup(response.read())
  25. # Each app is stored in it's own table
  26. result_tables = soup.findAll("table", border="1")
  27. # For the moment, we'll have to ignore the first result (see TODO list).
  28. for table in result_tables[1:]:
  29. application = PlanningApplication()
  30. # It's not clear to me why this next one isn't the string of the next sibling. This works though!
  31. application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]
  32. application.address = table.find(text="Location").parent.findNextSibling().string.strip()
  33. application.postcode = getPostcodeFromText(application.address)
  34. application.info_url = urlparse.urljoin(self.base_url, table.a['href'])
  35. # Let's go to the info_page and get the OSGB and the date_received
  36. info_request = urllib2.Request(application.info_url)
  37. # We need to add the language header in order to get UK style dates
  38. info_request.add_header("Accept-Language", "en-gb,en")
  39. info_response = urllib2.urlopen(info_request)
  40. info_soup = BeautifulSoup(info_response.read())
  41. grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
  42. x_element = grid_reference_td.font
  43. application.osgb_x = x_element.string.strip()
  44. application.osgb_y = x_element.nextSibling.nextSibling.string.strip()
  45. date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
  46. application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))
  47. application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()
  48. # There is a link to comment from the info page, though I can't click it.
  49. application.comment_url = application.info_url
  50. self._results.addApplication(application)
  51. return self._results
  52. def getResults(self, day, month, year):
  53. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  54. if __name__ == '__main__':
  55. parser = FlintshireParser()
  56. print parser.getResults(22,5,2008)
# TODO
# 1) Email the council about the broken first result.
# It is always slightly broken (there are two </td>s for one of the <td>s),
# which upsets BeautifulSoup.