Automatically exported from code.google.com/p/planningalerts

SouthOxfordshireParser.py 9.9 KiB

import urllib, urllib2
import HTMLParser
import urlparse
import datetime, time

# This needs a page number inserting
search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"

# This needs the council reference
comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"

authority_name = "South Oxfordshire District Council"
authority_short_name = "South Oxfordshire"

from PlanningUtils import fixNewlines, \
                          getPostcodeFromText, \
                          PlanningAuthorityResults, \
                          PlanningApplication
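
# For illustration only (the reference below is a made-up example, not a real
# application number), the URL templates above are filled in like this:
#
#     search_url % 1
#     # -> "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=1"
#     comment_url % {'reference': 'P08/W0001'}
#     # -> ".../ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=P08/W0001"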


class SouthOxfordshireParser(HTMLParser.HTMLParser):
    """In this case we'll take the date, so that we can avoid doing downloads for
    the other days in this week's file. This date should be a datetime.date object.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self._requested_date = None

        # We'll keep a count of the number of tables we have seen.
        # All the interesting stuff is in table 3.
        self._table_count = 0

        # While inside table 3, we'll keep a count of the number of
        # <td>s we have seen. What is in which numbered <td> is detailed below.
        # 1 reference
        # 3 place and description
        # 5 date received
        # 2 and 4 are just padding
        self._td_count = 0

        # This is just a flag to say that we are now ready to get the reference
        # from the next bit of data.
        self._get_reference = False

        self._data = ''

        # This will hold the application we are currently working on.
        self._current_application = None

        # The object which stores our set of planning application results.
        self._results = PlanningAuthorityResults(authority_name, authority_short_name)

    def handle_starttag(self, tag, attrs):
        # If we see a <table> tag, increment the table count.
        if tag == 'table':
            self._table_count += 1

        # We are only interested in other tags if we are in table 3.
        if self._table_count == 3:
            # If we are starting a <tr>, create a new PlanningApplication object
            # for the application currently being processed.
            if tag == 'tr':
                self._current_application = PlanningApplication()

            # If we see a <td>, increment the <td> count.
            if tag == 'td':
                self._td_count += 1

            # If we are in the first <td> and we see a link,
            # then it is to the info page for this application.
            if tag == 'a' and self._td_count == 1:
                for key, value in attrs:
                    if key == 'href':
                        url_end = value
                        self._current_application.info_url = urlparse.urljoin(search_url, url_end)

                        # We now know that the next bit of data is the reference.
                        self._get_reference = True

                        # href is the only attribute we are interested in.
                        break

    def handle_endtag(self, tag):
        # There is no need to do anything unless we are in table 3.
        if self._table_count == 3:
            # The closing </tr> indicates that the current application is finished.
            # Now we can fetch the info page to get the address, postcode,
            # and description.
            # If we don't have a reference, then we are in the header row,
            # which we don't want.
            # There is no point in doing this if the date is not the requested one.
            if tag == 'tr' and \
               self._current_application.council_reference is not None and \
               self._current_application.date_received == self._requested_date:
                info_page_parser = SouthOxfordshireInfoURLParser()
                info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())

                self._current_application.address = info_page_parser.address
                self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
                self._current_application.description = info_page_parser.description

                # Add the current application to the results set.
                self._results.addApplication(self._current_application)

            # At the end of the 5th <td>, self._data should contain
            # the received date of the application.
            if tag == 'td' and self._td_count == 5:
                app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
                self._current_application.date_received = datetime.date(app_year, app_month, app_day)
                self._data = ''
                self._td_count = 0
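
    # Worked example of the date handling above (the value is illustrative,
    # not taken from the live site): a 5th-<td> cell containing "21 May 2008"
    # gives time.strptime("21 May 2008", "%d %B %Y")[:3] == (2008, 5, 21),
    # which becomes datetime.date(2008, 5, 21).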

    def handle_data(self, data):
        # There is no need to do anything if we aren't in table 3.
        if self._table_count == 3:
            # If we are in the first <td> and the get_reference flag is set,
            # then the next data is the reference.
            if self._td_count == 1 and self._get_reference:
                self._current_application.council_reference = data

                # The comment url can now be made, as it depends only on the reference.
                # On this site, the link to the comment page is only displayed once
                # the planning authority has decided who is handling this application
                # and has opened consultations. The link below works straight away,
                # and also works for apps for which the consultation period is over.
                # I have no idea if anything is actually done with these comments if
                # it is followed too early...
                self._current_application.comment_url = comment_url % {'reference': self._current_application.council_reference}

                # Set the get_reference flag back to False.
                self._get_reference = False

            # If we are in the 5th <td>, then we need to collect all the data together
            # before we can use it. This is actually processed in handle_endtag.
            if self._td_count == 5:
                self._data += data

    def handle_entityref(self, ref):
        # We might have some entity refs to clear up.
        # There is no need to bother with this if we aren't in the results table.
        if self._table_count == 3 and self._td_count == 5:
            if ref == 'nbsp':
                self._data += ' '
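
    # For example (again illustrative), a date cell written as
    # "21&nbsp;May&nbsp;2008" accumulates in self._data as "21 May 2008",
    # ready for the strptime call in handle_endtag.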

    def getResultsByDayMonthYear(self, day, month, year):
        """This will return an ApplicationResults object containing the
        applications for the date passed in."""
        today = datetime.date.today()
        self._requested_date = datetime.date(year, month, day)
        delta = today - self._requested_date

        # To get the correct page, we need page ((days ago / 7) + 1):
        # each page covers one week of applications.
        page_number = delta.days / 7 + 1

        response = urllib2.urlopen(search_url % page_number)
        contents = response.read()

        self.feed(contents)
        return self._results
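
    # Worked example of the page arithmetic (dates are illustrative): if today
    # is 2008-05-30 and the requested date is 2008-05-21, delta.days is 9, so
    # page_number = 9 / 7 + 1 = 2 (Python 2 integer division) and the parser
    # fetches page 2, the page for the week before the current one.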

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()


class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
    """This parser is to get the description and address out of the info page
    for a South Oxfordshire application."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        self.address = None
        self.description = None

        # These two states will be set to:
        # 0 - if we haven't yet got that bit
        # 1 - if we are currently working on it
        # 2 - if we have finished
        self._address_state = 0
        self._description_state = 0

        # We will need to know whether or not we are in a <td>.
        self._in_td = False

        # This is used for collecting together data which comes in several bits.
        self._data = ''

    def handle_starttag(self, tag, attrs):
        # If we see the start of a <td> and we are still interested in some data,
        # then set the td flag to True and blank the data.
        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
            self._in_td = True
            self._data = ''

    def handle_endtag(self, tag):
        if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
            # If we are working on the description,
            # set description from _data and note that we need to work on it no more.
            if self._description_state == 1:
                self.description = self._data
                self._description_state = 2
            # If we are working on the address,
            # set address from _data and note that we need to work on it no more.
            elif self._address_state == 1:
                self.address = self._data
                self._address_state = 2
            # If we see data which says 'Description',
            # then set the description state to working.
            elif self._data.strip() == 'Description':
                self._description_state = 1
            # If we see data which says 'Location',
            # then set the address state to working.
            elif self._data.strip() == 'Location':
                self._address_state = 1

            # Note that we are leaving the <td>.
            self._in_td = False

    def handle_data(self, data):
        # If we are in a <td> and we are still interested in the data for something,
        # append the current bit to self._data.
        if self._in_td and (self._address_state < 2 or self._description_state < 2):
            self._data += data
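
    # The state machine above assumes (judging from the parser logic rather
    # than from a captured copy of the live page) that the info page lays out
    # each field as a label cell followed by a value cell, e.g.:
    #   <tr><td>Location</td><td>1 Example Street, Anytown</td></tr>
    #   <tr><td>Description</td><td>Erection of a single storey extension</td></tr>
    # The label cell flips the matching state to 1, and the next </td>
    # captures the accumulated text as the value.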

# TODO
# Find out what time of day this is run - does it matter that
# we aren't being careful with daylight saving time etc.?
# Can we check that the scraped email address really is
# an email address?
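
# Example usage (a minimal sketch, assuming PlanningUtils is importable and
# that the council site still serves these pages; the date is illustrative):
#
#     parser = SouthOxfordshireParser()
#     results = parser.getResultsByDayMonthYear(21, 5, 2008)
#     print results.displayXML()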