Automatically exported from code.google.com/p/planningalerts

Flintshire.py

# Scraper for planning applications from Flintshire County Council
# (Python 2 / BeautifulSoup 3).

import urllib2
import urllib
import urlparse

import datetime, time
import cgi
import re

from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

date_format = "%d/%m/%Y"

class FlintshireParser:
    def __init__(self, *args):
        self.authority_name = "Flintshire County Council"
        self.authority_short_name = "Flintshire"

        # I've removed some extra variables from this; it seems to be happy
        # without them, and now doesn't need to paginate...
        self.base_url = "http://www.flintshire.gov.uk/webcont/fssplaps.nsf/vwa_Search?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20%(end_date)s)"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        # We'll set the start date to be one day earlier in order to catch
        # the first result of every day at some point - see TODO list.
        response = urllib2.urlopen(self.base_url % {
            "end_date": search_date.strftime(date_format),
            "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
        })
        soup = BeautifulSoup(response.read())

        # Each app is stored in its own table.
        result_tables = soup.findAll("table", border="1")

        # For the moment, we'll have to ignore the first result (see TODO list).
        for table in result_tables[1:]:
            application = PlanningApplication()

            # It's not clear to me why this next one isn't the string of the
            # next sibling. This works, though!
            application.council_reference = table.find(text=re.compile("Reference")).parent.findNextSibling().contents[0]

            application.address = table.find(text="Location").parent.findNextSibling().string.strip()
            application.postcode = getPostcodeFromText(application.address)

            application.info_url = urlparse.urljoin(self.base_url, table.a['href'])

            # Let's go to the info page and get the OSGB and the date_received.
            info_request = urllib2.Request(application.info_url)

            # We need to add the language header in order to get UK style dates.
            info_request.add_header("Accept-Language", "en-gb,en")
            info_response = urllib2.urlopen(info_request)
            info_soup = BeautifulSoup(info_response.read())

            grid_reference_td = info_soup.find(text="Grid Reference").findNext("td")
            x_element = grid_reference_td.font

            application.osgb_x = x_element.string.strip()
            application.osgb_y = x_element.nextSibling.nextSibling.string.strip()

            date_string = info_soup.find(text="Date Valid").findNext("td").string.strip()
            application.date_received = datetime.datetime(*(time.strptime(date_string, date_format)[0:6]))

            application.description = table.find(text=re.compile("Description of Proposal")).parent.nextSibling.string.strip()

            # There is a link to comment from the info page, though I can't
            # click it.
            application.comment_url = application.info_url

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = FlintshireParser()
    print parser.getResults(22, 5, 2008)
# TODO
# 1) Email the council about the broken first result. The first result is
#    always slightly broken (it has two </td>s for one of its <td>s), which
#    upsets BeautifulSoup.
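
For reference, the doubled percent signs in base_url are escapes for literal % characters (the URL-encoded brackets and spaces, e.g. %5B for [), so only the %(start_date)s and %(end_date)s placeholders are interpolated. A minimal sketch of the expansion, with the query template shortened here for readability:

    import datetime

    date_format = "%d/%m/%Y"

    # Shortened form of the query template above; "%%5B" becomes a literal
    # "%5B" (an URL-encoded "[") after interpolation.
    template = ("...?searchview&Query=(%%5BfrmDteAppldate%%5D%%20%%3E=%%20"
                "%(start_date)s%%20AND%%20%%5BfrmDteAppldate%%5D%%20%%3C=%%20"
                "%(end_date)s)")

    search_date = datetime.date(2008, 5, 22)
    print template % {
        # Start one day earlier to catch the first result - see the TODO list.
        "start_date": (search_date - datetime.timedelta(1)).strftime(date_format),
        "end_date": search_date.strftime(date_format),
    }
    # Prints: ...?searchview&Query=(%5BfrmDteAppldate%5D%20%3E=%20
    #         21/05/2008%20AND%20%5BfrmDteAppldate%5D%20%3C=%2022/05/2008)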
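
The date_received conversion may look odd at first: time.strptime returns a struct_time, and slicing its first six fields (year through second) gives exactly the positional arguments datetime.datetime expects. A quick standalone illustration:

    import time, datetime

    date_string = "22/05/2008"
    tm = time.strptime(date_string, "%d/%m/%Y")  # -> time.struct_time
    print datetime.datetime(*tm[0:6])            # -> 2008-05-22 00:00:00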