Automatically exported from code.google.com/p/planningalerts

Shetland.py 4.7 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. """
  2. The Shetland Isles site shows applications from the last 14 days.
  3. These are paginated into groups of ten.
  4. """
  5. import urllib2
  6. import urlparse
  7. import datetime, time
  8. import re
  9. from BeautifulSoup import BeautifulSoup
  10. from PlanningUtils import PlanningApplication, \
  11. PlanningAuthorityResults, \
  12. getPostcodeFromText
  13. date_format = "%d/%m/%Y"
  14. page_count_regex = re.compile("Records 1 to 10 of (\d*) Records Found")
  15. class ShetlandParser:
  16. def __init__(self, *args):
  17. self.authority_name = "Shetland Islands Council"
  18. self.authority_short_name = "Shetland Islands"
  19. self.base_url = "http://www.shetland.gov.uk/planningcontrol/apps/apps.asp?time=14&Orderby=DESC&parish=All&Pref=&Address=&Applicant=&ApplicantBut=View&sortby=PlanRef&offset=%d"
  20. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  21. def getResultsByDayMonthYear(self, day, month, year):
  22. search_date = datetime.datetime(year, month, day)
  23. offset = 0
  24. # First get the search page
  25. response = urllib2.urlopen(self.base_url %(offset))
  26. contents = response.read()
  27. # First let's find out how many records there are (they are displayed ten per page).
  28. match = page_count_regex.search(contents)
  29. app_count = int(match.groups()[0])
  30. while offset < app_count:
  31. if offset != 0:
  32. contents = urllib2.urlopen(self.base_url %(offset)).read()
  33. soup = BeautifulSoup(contents)
  34. # The apps are in the 5th table on the page (not a very good way to get it...)
  35. results_table = soup.findAll("table")[5]
  36. # Now we need to find the trs which contain the apps.
  37. # The first TR is just headers.
  38. # After that they alternate between containing an app and just some display graphics
  39. # until the third from last. After that, they contain more rubbish.
  40. trs = results_table.findAll("tr")[1:-2]
  41. for i in range(len(trs)):
  42. # We are only interested in the trs in even positions in the list.
  43. if i % 2 == 0:
  44. tr = trs[i]
  45. application = PlanningApplication()
  46. comment_url_element = tr.find(text="comment on this planning application").parent
  47. application.date_received = datetime.datetime(*(time.strptime(comment_url_element.findNext("td").string.strip(), date_format)[0:6]))
  48. # If the date of this application is earlier than the date
  49. # we are searching for then don't download it.
  50. # We could optimize this a bit more by not doing the later pages.
  51. if application.date_received < search_date:
  52. break
  53. application.council_reference = tr.a.string
  54. application.comment_url = urlparse.urljoin(self.base_url, comment_url_element['href'])
  55. application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
  56. info_response = urllib2.urlopen(application.info_url)
  57. info_soup = BeautifulSoup(info_response.read())
  58. info_table = info_soup.findAll("table")[2]
  59. application.description = info_table.find(text="Proposal:").findNext("td").contents[0].strip()
  60. application.postcode = info_table.find(text="Postcode:").findNext("td").contents[0].strip()
  61. # Now to get the address. This will be split across several tds.
  62. address_start_td = info_table.find("td", rowspan="4")
  63. # We need the first bit of the address from this tr
  64. address_bits = [address_start_td.findNext("td").string.strip()]
  65. # We will need the first td from the next three trs after this
  66. for address_tr in address_start_td.findAllNext("tr")[:3]:
  67. address_line = address_tr.td.string.strip()
  68. if address_line:
  69. address_bits.append(address_line)
  70. address_bits.append(application.postcode)
  71. application.address = ', '.join(address_bits)
  72. self._results.addApplication(application)
  73. offset += 10
  74. return self._results
  75. def getResults(self, day, month, year):
  76. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  77. if __name__ == '__main__':
  78. parser = ShetlandParser()
  79. # Note: to test this, you will need to pick a current date.
  80. print parser.getResults(9,6,2008)