Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

236 lines
11 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import cgi
  5. import re
  6. import datetime
  7. import BeautifulSoup
  8. from PlanningUtils import getPostcodeFromText, \
  9. PlanningAuthorityResults, \
  10. PlanningApplication
  11. # - Browser request: --------------------------
  12. # {POST http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0} {Host: digitalmaidstone.co.uk
  13. # Accept: text/html, text/plain, text/css, text/sgml, */*;q=0.01
  14. # Accept-Encoding: gzip
  15. # Accept-Language: en
  16. # Pragma: no-cache
  17. # Cache-Control: no-cache
  18. # User-Agent: Lynx/2.8.6rel.4 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.6.3
  19. # Content-type: application/x-www-form-urlencoded
  20. # Content-length: 638
  21. # } %25.MAINBODY.WPACIS.1.=&APNID.MAINBODY.WPACIS.1.=&JUSTLOCATION.MAINBODY.WPACIS.1.=&JUSTDEVDESC.MAINBODY.WPACIS.1.=&DEVDESC.MAINBODY.WPACIS.1.=&SURNAME.MAINBODY.WPACIS.1.=&REGFROMDATE.MAINBODY.WPACIS.1.=01%2F11%2F2007&REGTODATE.MAINBODY.WPACIS.1.=02%2F11%2F2007&DECFROMDATE.MAINBODY.WPACIS.1.=&DECTODATE.MAINBODY.WPACIS.1.=&FINALGRANTFROM.MAINBODY.WPACIS.1.=&FINALGRANTTO.MAINBODY.WPACIS.1.=&APELDGDATFROM.MAINBODY.WPACIS.1.=&APELDGDATTO.MAINBODY.WPACIS.1.=&APEDECDATFROM.MAINBODY.WPACIS.1.=&APEDECDATTO.MAINBODY.WPACIS.1.=&AREA.MAINBODY.WPACIS.1.=&WARD.MAINBODY.WPACIS.1.=&PARISH.MAINBODY.WPACIS.1.=&SEARCHBUTTON.MAINBODY.WPACIS.1.=Search
  22. # server=[digitalmaidstone.co.uk] , port=[80], script=[/swiftlg/apas/run/WPHAPPCRITERIA]
  23. # request_line=[POST /swiftlg/apas/run/WPHAPPCRITERIA HTTP/1.0]
  24. # second page
  25. #http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=243941&
  26. #StartIndex=11&
  27. #SortOrder=APNID:asc&
  28. #DispResultsAs=WPHAPPSEARCHRES&
  29. #BackURL=<a%20href=wphappcriteria.display?paSearchKey=147118>Search%20Criteria
  30. # Date format to enter into search boxes
  31. date_format = "%d/%m/%Y"
  32. class SwiftLGParser:
  33. search_path = "WPHAPPCRITERIA"
  34. info_path = "WPHAPPDETAIL.DisplayUrl?theApnID=%s"
  35. comment_path ="wphmakerep.displayURL?ApnID=%s"
  36. def _fixHTML(self, html):
  37. return html
  38. def _findResultsTable(self, soup):
  39. """Unless there is just one table in the page, the resuts table,
  40. override this in a subclass."""
  41. return soup.table
  42. def _findTRs(self, results_table):
  43. """The usual situation is for the results table to contain
  44. one row of headers, followed by a row per app.
  45. If this is not the case, override this in a subclass."""
  46. # import pdb;pdb.set_trace()
  47. return results_table.findAll("tr")[1:]
  48. def __init__(self,
  49. authority_name,
  50. authority_short_name,
  51. base_url,
  52. debug=False):
  53. self.authority_name = authority_name
  54. self.authority_short_name = authority_short_name
  55. self.base_url = base_url
  56. self.search_url = urlparse.urljoin(base_url, self.search_path)
  57. self.info_url = urlparse.urljoin(base_url, self.info_path)
  58. self.comment_url = urlparse.urljoin(base_url, self.comment_path)
  59. self.debug = debug
  60. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  61. def getResultsByDayMonthYear(self, day, month, year):
  62. search_date = datetime.date(year, month, day)
  63. post_data = urllib.urlencode((
  64. ("REGFROMDATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
  65. ("REGTODATE.MAINBODY.WPACIS.1.", search_date.strftime(date_format)),
  66. ("SEARCHBUTTON.MAINBODY.WPACIS.1.", "Search"),
  67. ))
  68. response = urllib2.urlopen(self.search_url, post_data)
  69. contents = response.read()
  70. # Let's give scrapers the change to tidy up any rubbish - I'm looking
  71. # at you Cannock Chase
  72. contents = self._fixHTML(contents)
  73. # Check for the no results warning
  74. if not contents.count("No Matching Applications Found"):
  75. soup = BeautifulSoup.BeautifulSoup(contents)
  76. # Get the links to later pages of results.
  77. later_pages = soup.findAll("a", {"href": re.compile("WPHAPPSEARCHRES\.displayResultsURL.*StartIndex=\d*.*")})
  78. for a in ["initial_search"] + later_pages:
  79. if a != "initial_search":
  80. url = a['href']
  81. # Example url
  82. #http://digitalmaidstone.co.uk/swiftlg/apas/run/WPHAPPSEARCHRES.displayResultsURL?ResultID=244037&StartIndex=11&SortOrder=APNID:asc&DispResultsAs=WPHAPPSEARCHRES&BackURL=<a href=wphappcriteria.display?paSearchKey=147170>Search Criteria</a>
  83. # urllib2 doesn't like this url, to make it happy, we'll
  84. # get rid of the BackURL parameter, which we don't need.
  85. split_url = urlparse.urlsplit(url)
  86. qs = split_url[3]
  87. # This gets us a dictionary of key to lists of values
  88. qsl = cgi.parse_qsl(qs)
  89. # Get rid of BackURL
  90. qsl.pop(-1)
  91. # I think this is safe, as there are no repeats of parameters
  92. new_qs = urllib.urlencode(qsl)
  93. url = urlparse.urlunsplit(split_url[:3] + (new_qs,) + split_url[4:])
  94. this_page_url = urlparse.urljoin(self.base_url, url)
  95. response = urllib2.urlopen(this_page_url)
  96. contents = response.read()
  97. soup = BeautifulSoup.BeautifulSoup(contents)
  98. results_table = self._findResultsTable(soup)#.body.find("table", {"class": "apas_tbl"})
  99. trs = self._findTRs(results_table)
  100. for tr in trs:
  101. self._current_application = PlanningApplication()
  102. tds = tr.findAll("td")
  103. # The first td
  104. #<td class="apas_tblContent"><a href="WPHAPPDETAIL.DisplayUrl?theApnID=07/1884&amp;backURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt; &gt; &lt;a href='wphappsearchres.displayResultsURL?ResultID=243950%26StartIndex=1%26SortOrder=APNID:asc%26DispResultsAs=WPHAPPSEARCHRES%26BackURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt;'&gt;Search Results&lt;/a&gt;"></a><a href="wphappcriteria.display?paSearchKey=147125">Search Criteria</a> > <a href="wphappsearchres.displayResultsURL?ResultID=243950%26StartIndex=1%26SortOrder=APNID:asc%26DispResultsAs=WPHAPPSEARCHRES%26BackURL=&lt;a href=wphappcriteria.display?paSearchKey=147125&gt;Search Criteria&lt;/a&gt;"></a><a href="wphappcriteria.display?paSearchKey=147125">Search Criteria</a>'>Search Results">07/1884</td>
  105. # The html here is a bit of a mess, and doesn't all get into
  106. # the soup.
  107. # We can get the reference from the first <a href> in td 0.
  108. first_link = tds[0].a['href']
  109. app_id = cgi.parse_qs(urlparse.urlsplit(first_link)[3])['theApnID'][0]
  110. self._current_application.date_received = search_date
  111. self._current_application.council_reference = app_id
  112. self._current_application.info_url = self.info_url %(app_id)
  113. self._current_application.comment_url = self.comment_url %(app_id)
  114. self._current_application.description = tds[1].string.strip()
  115. # the second td
  116. #<td class="apas_tblContent"><input type="HIDDEN" name="ORDERCOUNTER.PAHEADER.PACIS2.1-1." value="1" class="input-box" size="7" />
  117. #LAND ADJ. BRAMBLING, HAWKENBURY ROAD, HAWKENBURY, TN120EA
  118. #</td>
  119. address = ' '.join([x for x in tds[2].contents if isinstance(x, BeautifulSoup.NavigableString)]).strip()
  120. self._current_application.address = address
  121. self._current_application.postcode = getPostcodeFromText(address)
  122. self._results.addApplication(self._current_application)
  123. return self._results
  124. def getResults(self, day, month, year):
  125. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  126. class EastHertsParser(SwiftLGParser):
  127. def _findResultsTable(self, soup):
  128. return soup.findAll("table")[3]
  129. class GwyneddParser(SwiftLGParser):
  130. def _findResultsTable(self, soup):
  131. return soup.find("table", {"class": "thinBox"})
  132. class IslingtonParser(SwiftLGParser):
  133. def _findResultsTable(self, soup):
  134. return soup.table.table
  135. class MacclesfieldParser(SwiftLGParser):
  136. def _findResultsTable(self, soup):
  137. return soup.findAll("table")[6]
  138. class MoleValleyParser(SwiftLGParser):
  139. def _findResultsTable(self, soup):
  140. # import pdb;pdb.set_trace()
  141. return soup.findAll("table")[2]
  142. class SloughParser(SwiftLGParser):
  143. def _findResultsTable(self, soup):
  144. return soup.findAll("table")[1]
  145. def _findTRs(self, results_table):
  146. return results_table.findAll("tr")[2:]
  147. class CannockChaseParser(SwiftLGParser):
  148. def _fixHTML(self, html):
  149. return html.replace('</tr class="tablebody">', '</tr>')
  150. if __name__ == '__main__':
  151. # parser = SwiftLGParser("Boston Borough Council", "Boston", "http://195.224.121.199/swiftlg/apas/run/")
  152. # parser = SwiftLGParser("Dudley", "Dudley", "http://www2.dudley.gov.uk/swiftlg/apas/run/")
  153. # parser = EastHertsParser("East Hertfordshire", "East Herts", "http://e-services.eastherts.gov.uk/swiftlg/apas/run/")
  154. # parser = GwyneddParser("Gwynedd", "Gwynedd", "http://www.gwynedd.gov.uk/swiftlg/apas/run/")
  155. # parser = IslingtonParser("Islington", "Islington", "https://www.islington.gov.uk/onlineplanning/apas/run/")
  156. # parser = SwiftLGParser("Lake District", "Lake District", "http://www.lake-district.gov.uk/swiftlg/apas/run/")
  157. # parser = SwiftLGParser("Maidstone Borough Council", "Maidstone", "http://digitalmaidstone.co.uk/swiftlg/apas/run/")
  158. # parser = MoleValleyParser("Mole Valley", "Mole Valley", "http://www.molevalley.gov.uk/swiftlg/apas/run/")
  159. # parser = SwiftLGParser("Pembrokeshire County Council", "Pembrokeshire", "http://planning.pembrokeshire.gov.uk/swiftlg/apas/run/")
  160. # parser = SwiftLGParser("Rochdale Metropolitan Borough Council", "Rochdale", "http://www.rochdale.gov.uk/swiftlg/apas/run/")
  161. # parser = SloughParser("Slough", "Slough", "http://www2.slough.gov.uk/swiftlg/apas/run/")
  162. # parser = SwiftLGParser("Snowdonia National Park", "Snowdonia", "http://www.snowdonia-npa.gov.uk/swiftlg/apas/run/")
  163. # parser = SwiftLGParser("St Edmundsbury", "Bury St Edmunds", "http://www.stedmundsbury.gov.uk/swiftlg/apas/run/")
  164. # parser = MacclesfieldParser("Macclesfield", "Macclesfield", "http://www.planportal.macclesfield.gov.uk/swiftlg/apas/run/")
  165. parser = SwiftLGParser("Daventry District Council", "Daventry", "http://212.125.73.214/swiftlg/apas/run/wphappcriteria.display")
  166. # parser = SwiftLGParser("Warrington Borough Council", "Warrington", "http://212.248.237.123:8080/swiftlg/apas/run/wphappcriteria.display")
  167. # parser = CannockChaseParser("Cannock Chase District Council", "Cannock Chase", "http://planning.cannockchasedc.com/swiftlg/apas/run/wphappcriteria.display")
  168. # parser = SwiftLGParser("London Borough of Enfield", "Enfield", "http://forms.enfield.gov.uk/swiftlg/apas/run/wphappcriteria.display")
  169. print parser.getResults(12,6,2009)
# To Do:
# 1) Check out the comment url on Maidstone.
# 2) Daventry, when it is back up.
# 3) Work out what goes wrong with Gwynedd on 06/11/2007.