Automatically exported from code.google.com/p/planningalerts
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.
 
 
 
 
 
 

264 líneas
10 KiB

  1. #!/usr/bin/python
  2. import cgi
  3. import cgitb; cgitb.enable(display=0, logdir="/tmp")
  4. import urllib, urllib2
  5. import HTMLParser
  6. import urlparse
  7. import datetime, time
  8. # This needs a page number inserting
  9. search_url = "http://www.southoxon.gov.uk/ccm/planning/ApplicationList.jsp?PAGE=%d"
  10. # This needs the council reference
  11. comment_url = "https://forms.southoxon.gov.uk/ufs/ufsmain?formid=PLANNINGCOMMENT&PLNGAPPL_REFERENCE=%(reference)s"
  12. authority_name = "South Oxfordshire District Council"
  13. authority_short_name = "South Oxfordshire"
  14. from PlanningUtils import fixNewlines, \
  15. getPostcodeFromText, \
  16. PlanningAuthorityResults, \
  17. PlanningApplication
  18. class SouthOxfordshireParser(HTMLParser.HTMLParser):
  19. """In this case we'll take the date, so that we can avoid doing dowloads for
  20. the other days in this week's file. This date should be a datetime.date object.
  21. """
  22. def __init__(self):
  23. HTMLParser.HTMLParser.__init__(self)
  24. self._requested_date = None
  25. # We'll keep a count of the number of tables we have seen.
  26. # All the interesting stuff is in table 3
  27. self._table_count = 0
  28. # While inside table 3, we'll keep a count of the number of
  29. # <td>s we have seen. What is in which numbered <td> is detailed below.
  30. # 1 reference
  31. # 3 place and description
  32. # 5 date received
  33. # 2 and 4 are just padding
  34. self._td_count = 0
  35. # This is just a flag to say that we are now ready to get the reference
  36. # from the next bit of data
  37. self._get_reference = False
  38. self._data = ''
  39. # this will hold the application we are currently working on.
  40. self._current_application = None
  41. # The object which stores our set of planning application results
  42. self._results = PlanningAuthorityResults(authority_name, authority_short_name)
  43. def handle_starttag(self, tag, attrs):
  44. # if we see a table tag, increment the table count.
  45. if tag == 'table':
  46. self._table_count += 1
  47. # we are only interested in other tags if we are in table 3.
  48. if self._table_count == 3:
  49. # If we are starting a <tr>, create a new PlanningApplication object
  50. # for the application currently being processed
  51. if tag == 'tr':
  52. self._current_application = PlanningApplication()
  53. # if we see a td, increment the <td> count.
  54. if tag == 'td':
  55. self._td_count += 1
  56. # if we are in the first <td>, and we see a link,
  57. # then it is to the info page for this applicaion.
  58. if tag == 'a' and self._td_count == 1:
  59. for key, value in attrs:
  60. if key == 'href':
  61. url_end = value
  62. self._current_application.info_url = urlparse.urljoin(search_url,url_end)
  63. # We now know that the next bit of data is the reference
  64. self._get_reference = True
  65. # href is the only attribute we are interested in.
  66. break
  67. def handle_endtag(self, tag):
  68. # There is no need to do anything unless we are in table 3.
  69. if self._table_count == 3:
  70. # The end <tr> indicates that the current application is finished.
  71. # Now we can fetch the info_page to get the address, postcode,
  72. # and description.
  73. # If we don't have a reference, then we are in the header row,
  74. # which we don't want.
  75. # There is no point in doing this if the date is not the requested one.
  76. if tag == 'tr' and \
  77. self._current_application.council_reference is not None and \
  78. self._current_application.date_received == self._requested_date:
  79. info_page_parser = SouthOxfordshireInfoURLParser()
  80. info_page_parser.feed(urllib2.urlopen(self._current_application.info_url).read())
  81. self._current_application.address = info_page_parser.address
  82. self._current_application.postcode = getPostcodeFromText(info_page_parser.address)
  83. self._current_application.description = info_page_parser.description
  84. # Add the current application to the results set
  85. self._results.addApplication(self._current_application)
  86. # At the end of the 5th <td>, self._data should contain
  87. # the received date of the application.
  88. if tag == 'td' and self._td_count == 5:
  89. app_year, app_month, app_day = tuple(time.strptime(self._data, "%d %B %Y")[:3])
  90. self._current_application.date_received = datetime.date(app_year, app_month, app_day)
  91. self._data = ''
  92. self._td_count = 0
  93. def handle_data(self, data):
  94. # There is no need to do anything if we aren't in table 3.
  95. if self._table_count == 3:
  96. # If we are in the first <td>, and the get_reference flag is set,
  97. # then the next data is the reference.
  98. if self._td_count == 1 and self._get_reference:
  99. self._current_application.council_reference = data
  100. # The comment url can now be made, as it depends only on the reference.
  101. # On this site, the link to the comment page is only displayed once
  102. # the planning authority has decided who is handling this application
  103. # and has opened consultations. The link below works straight away,
  104. # and also works for apps for which the consultation period is over.
  105. # I have no idea if anything is actually done with these comments if
  106. # it is followed too early...
  107. self._current_application.comment_url = comment_url %{'reference': self._current_application.council_reference}
  108. # Set the get_reference flag back to False.
  109. self._get_reference = False
  110. # If we are in the 5th <td>, then we need to collect all the data together
  111. # before we can use it. This is actually processed in handle_endtag.
  112. if self._td_count == 5:
  113. self._data += data
  114. def handle_entityref( self, ref ):
  115. # We might have some entity_refs to clear up.
  116. # there is no need to bother with this if we aren't in the results table.
  117. if self._table_count == 3 and self._td_count == 5:
  118. if ref == 'nbsp':
  119. self._data += ' '
  120. def getResultsByDayMonthYear(self, day, month, year):
  121. """This will return an ApplicationResults object containg the
  122. applications for the date passed in."""
  123. today = datetime.date.today()
  124. self.requested_date = datetime.date(year, month, day)
  125. delta = today - self.requested_date
  126. # to get the correct page, we need
  127. # page ((days mod 7) + 1)
  128. page_number = delta.days/7 + 1
  129. response = urllib2.urlopen(search_url %page_number)
  130. self.feed(response.read())
  131. return self._results
  132. def getResults(self, day, month, year):
  133. return getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  134. class SouthOxfordshireInfoURLParser(HTMLParser.HTMLParser):
  135. """This parser is to get the description and address out of the info page
  136. for a South Oxfordshire application."""
  137. def __init__(self):
  138. HTMLParser.HTMLParser.__init__(self)
  139. self.address = None
  140. self.description = None
  141. # These two states will be set to:
  142. # 0 - if we haven't yet got that bit
  143. # 1 - if we are currently working on it
  144. # 2 - if we have finished
  145. self._address_state = 0
  146. self._description_state = 0
  147. # We well need to know whether or not we are in a <td>
  148. self._in_td = False
  149. # This is used for collecting together date which comes in several bits.
  150. self._data = ''
  151. def handle_starttag(self, tag, attrs):
  152. # If we see the start of a <td> and we are still interested in some data
  153. # then set the td flag to true, and blank the data
  154. if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
  155. self._in_td = True
  156. self._data = ''
  157. def handle_endtag(self, tag):
  158. if tag == 'td' and (self._address_state < 2 or self._description_state < 2):
  159. # If we are working on the description,
  160. # set description from _data and note that we need to work on it no more.
  161. if self._description_state == 1:
  162. self.description = self._data
  163. self._description_state = 2
  164. # If we are working on the address,
  165. # set address from _data and note that we need to work on it no more.
  166. elif self._address_state == 1:
  167. self.address = self._data
  168. self._address_state = 2
  169. # If we see data which says 'Descripton',
  170. # then set the description state to working.
  171. elif self._data.strip() == 'Description':
  172. self._description_state = 1
  173. # If we see data which says 'Location',
  174. # then set the addresss state to working.
  175. elif self._data.strip() == 'Location':
  176. self._address_state = 1
  177. # Note that we are leaving the <td>
  178. self._in_td = False
  179. def handle_data(self, data):
  180. # if we are in a td, and we are still interested in the data for something,
  181. # append the current bit to self._data
  182. if self._in_td and (self._address_state < 2 or self._description_state < 2):
  183. self._data += data
  184. # TODO
  185. # find out what time of day this is run - does it matter that
  186. # we aren't being careful with daylight saving time etc.
  187. # Can we check that scraped email address really is
  188. # an email address?
  189. if __name__ == "__main__":
  190. form = cgi.FieldStorage()
  191. day = form.getfirst('day')
  192. month = form.getfirst('month')
  193. year = form.getfirst('year')
  194. parser = SouthOxfordshireParser()
  195. print "Content-Type: text/xml" # XML is following
  196. print
  197. print xml # print the xml