Automatically exported from code.google.com/p/planningalerts
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 
 
 

579 行
23 KiB

  1. import urllib2
  2. import urllib
  3. import urlparse
  4. import cgi
  5. import re
  6. import datetime
  7. from BeautifulSoup import BeautifulSoup
  8. from PlanningUtils import PlanningApplication, \
  9. PlanningAuthorityResults, \
  10. getPostcodeFromText
  # strftime format for dates typed into the councils' search boxes
  # (day/month/year, e.g. "22/01/2008").
  date_format = "%d/%m/%Y"

  # Captures the digits that follow "PARAM0=" in an info url.  That code is
  # needed to build the comments url on the sites that have one.
  # Written as a raw string so that \d reaches the regex engine untouched.
  app_code_regex = re.compile(r"PARAM0=(\d*)")
  class PlanningExplorerParser:
      """Base scraper for council sites that run PlanningExplorer.

      One search consists of a GET of the search page (to collect the hidden
      ASP form fields), a POST asking for the applications received on a
      single day, and a walk over the rows of the results table.  Each row
      becomes a PlanningApplication added to a PlanningAuthorityResults.

      Sites differ in small ways; a subclass adapts the scraper by overriding
      the class attributes and the underscore-prefixed hook methods below.
      """

      # If this authority doesn't have a comments page, set this to an email
      # address for the planning department; it is then used in lieu of a
      # comments url.
      comments_email_address = None

      # Directories (relative to base_url) where the info urls and the search
      # page usually live.  Override in a subclass if an authority differs.
      info_url_path = "MVM/Online/Generic/"
      search_url_path = "MVM/Online/PL/GeneralSearch.aspx"

      # The most common location of the comments page.
      # The %s is filled in with the application code.
      comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"

      # Most authorities don't need the Referer header on the post request.
      # If one does, set this to True in the subclass.
      use_referer = False

      # Some authorities return nothing to the python urllib2 user agent.
      # Set this to True in a subclass to pretend to be firefox.
      use_firefox_user_agent = False

      # Attributes identifying the table that holds the search results.
      results_table_attrs = {"class": "ResultsTable"}

      # Zero-based positions, within a results row, of the cells holding the
      # council reference, the address and the description.
      reference_td_no = 0
      address_td_no = 1
      description_td_no = 2

      # Finds hidden ASP inputs such as __VIEWSTATE on the search page and
      # captures (name, value) pairs.  Compiled once here rather than on
      # every search.
      _asp_args_regex = re.compile(
          r'<input[^>]*name="(__[A-Z]*)"[^>]*value="([^"]*)"[^>]*>')

      def _modify_response(self, response):
          """Return a response object that has all the apps on one page.

          For most sites the right post parameters already give a single
          page, so the response is returned unchanged.  Override this when
          that isn't possible (see, for example, Hackney).
          """
          return response

      def _find_trs(self, results_table):
          """Return the rows of results_table that describe applications.

          Normally that is every tr except the first one (which is usually
          a header).  Override if an authority needs a different list.
          """
          return results_table.findAll("tr")[1:]

      def _sanitisePostHtml(self, html):
          """Return html tidied up ready for BeautifulSoup.

          The default returns it unchanged; override in a subclass when the
          page that comes back from the post request is malformed.
          """
          return html

      def _sanitiseInfoUrl(self, url):
          """Return a cleaned-up version of an info url from the results.

          The default returns it unchanged; override when an authority's
          info urls arrive full of junk (as Broadland's do).
          """
          return url

      def _getHeaders(self):
          """Return a dict of header name to value for the post request.

          The contents are driven by use_firefox_user_agent and use_referer.
          Override if an authority requires other headers.
          """
          headers = {}

          if self.use_firefox_user_agent:
              headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"

          if self.use_referer:
              headers["Referer"] = self.search_url

          return headers

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded body of the search post.

          asp_args is a tuple of (name, value) pairs for the hidden ASP
          parameters; search_date is the datetime.date being searched for.

          This is the most common set of post data for PlanningExplorer
          sites.  It won't work for all of them, so it will sometimes need
          to be overridden in a subclass.  edrDateSelection is often not
          needed, but Charnwood needs it, so it is always sent.
          """
          # The same date is sent as both ends of the range, so build the
          # chooser value once.
          date_chooser = urllib.quote(
              '<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>'
              % search_date.timetuple()[:3])

          post_data = urllib.urlencode(asp_args + (
              ("_ctl0", "DATE_RECEIVED"),
              ("rbGroup", "_ctl5"),
              ("_ctl7_hidden", date_chooser),
              ("_ctl8_hidden", date_chooser),
              ("edrDateSelection", "1"),
              ("csbtnSearch", "Search"),
              ("cboNumRecs", "99999"),
              ))

          return post_data

      def _getPostCode(self):
          """Return the postcode of the application currently being built.

          In most cases it can be got from the address in the results table.
          Some councils leave the postcode out of that address; for those,
          override this method with one that parses the info page.
          """
          return getPostcodeFromText(self._current_application.address)

      def __init__(self,
                   authority_name,
                   authority_short_name,
                   base_url,
                   debug=False):
          """Set up a scraper for the authority whose site is at base_url.

          The search url and the base for info urls are derived from
          base_url and the class's path attributes.
          """
          self.authority_name = authority_name
          self.authority_short_name = authority_short_name
          self.base_url = base_url

          self.search_url = urlparse.urljoin(base_url, self.search_url_path)
          self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)

          self.debug = debug

          # One results object per parser instance: every search made through
          # this instance adds its applications to it.
          self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

      def _fetchResultsHtml(self, search_date):
          """Run the search for search_date and return the results page html."""
          # First do a get, to pick up the ASP state.
          get_response = urllib2.urlopen(urllib2.Request(self.search_url))
          try:
              html = get_response.read()
          finally:
              get_response.close()

          # findall gives a list of (name, value) pairs such as __VIEWSTATE.
          # It is made a tuple because _getPostData concatenates it with one.
          asp_args = tuple(self._asp_args_regex.findall(html))

          # The post data differs between councils, so each scraper has its
          # own method to build it.
          post_data = self._getPostData(asp_args, search_date)
          request = urllib2.Request(self.search_url, post_data, self._getHeaders())

          # We are actually sent here by an http 302, so the response to the
          # post is really the result of a get.
          post_response = urllib2.urlopen(request)
          try:
              # In some cases the page size can't be set high until now;
              # _modify_response is the hook for getting everything on one
              # page.
              final_response = self._modify_response(post_response)
              return self._sanitisePostHtml(final_response.read())
          finally:
              post_response.close()

      def _parseResultsRow(self, tr, search_date):
          """Build and return the PlanningApplication described by row tr."""
          application = PlanningApplication()
          # _getPostCode reads the application through this attribute.
          self._current_application = application

          # No need to look for date_received, it's what we searched for.
          application.date_received = search_date

          for td_no, td in enumerate(tr.findAll("td")):
              if td_no == self.reference_td_no:
                  # This td holds the reference number and a link to details.
                  application.council_reference = td.a.string

                  relative_info_url = self._sanitiseInfoUrl(td.a['href'])
                  application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)

                  if self.comments_email_address is not None:
                      # No web form for comments: use the email address.
                      application.comment_url = self.comments_email_address
                  else:
                      # The link contains the code needed for the comments
                      # url.  NOTE: a link without PARAM0 makes search()
                      # return None and this line raise AttributeError.
                      application_code = app_code_regex.search(relative_info_url).groups()[0]

                      relative_comments_url = self.comments_path % (application_code)
                      application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)

              elif td_no == self.address_td_no:
                  # If this td contains a div, the address is the string in
                  # there - otherwise it is the string in the td itself.
                  if td.div is not None:
                      address = td.div.string
                  else:
                      address = td.string

                  application.address = address
                  application.postcode = self._getPostCode()

              elif td_no == self.description_td_no:
                  # Mostly the description is in a div, but sometimes
                  # (eg Crewe) it is directly in the td.  Either way, use the
                  # empty string if it is missing.
                  if td.div is not None:
                      description = td.div.string or ""
                  else:
                      description = td.string or ""

                  application.description = description

          return application

      def getResultsByDayMonthYear(self, day, month, year):
          """Search for applications received on the given date.

          day, month and year are integers.  Returns this parser's
          PlanningAuthorityResults with one application added per results
          row found.
          """
          search_date = datetime.date(year, month, day)

          soup = BeautifulSoup(self._fetchResultsHtml(search_date))
          results_table = soup.find("table", attrs=self.results_table_attrs)

          # If there is no results table, there were no apps on that day.
          if results_table:
              trs = self._find_trs(results_table)

              self._current_application = None

              for tr in trs:
                  self._parseResultsRow(tr, search_date)
                  self._results.addApplication(self._current_application)

          return self._results

      def getResults(self, day, month, year):
          """Like getResultsByDayMonthYear, but returns displayXML() output.

          day, month and year may be anything int() accepts.
          """
          return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  class BroadlandLike:
      """Mixin with the settings shared by the Broadland-style sites.

      It is listed before PlanningExplorerParser in a subclass's bases so
      that its attributes and methods take precedence.
      """
      # FIXME - BroadlandLike authorities don't have postcodes on their site, but
      # they do have grid references. We should use these.

      results_table_attrs = {"class": "display_table"}

      info_url_path = "Northgate/PlanningExplorer/Generic/"
      search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"

      use_firefox_user_agent = True
      use_referer = True

      # Matches one piece of junk in an info url: a whitespace character or
      # a single-hex-digit character reference such as &#xA; or &#xD;.
      # Compiled once rather than on every call.
      _info_url_junk_re = re.compile(r"\s|&#x\w;")

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for a one-day date-range search.

          asp_args is the tuple of hidden ASP (name, value) pairs;
          search_date is the datetime.date used for both ends of the range.
          """
          post_data = urllib.urlencode(asp_args + (
                  ("cboSelectDateValue", "DATE_RECEIVED"),
                  ("rbGroup", "rbRange"),
                  ("dateStart", search_date.strftime(date_format)),
                  ("dateEnd", search_date.strftime(date_format)),
                  ("cboNumRecs", "99999"),
                  ("csbtnSearch", "Search"),
                  ))

          return post_data

      def _sanitiseInfoUrl(self, url):
          """Return url with the rubbish the broadland sites put in removed.

          That means:
          1) whitespace
          2) &#xA; and &#xD;
          """
          # Deleting each match directly avoids splitting on a pattern that
          # can match the empty string.
          return self._info_url_junk_re.sub('', url)
  class BlackburnParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Blackburn."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class BroadlandParser(BroadlandLike, PlanningExplorerParser):
      """Scraper for Broadland: BroadlandLike settings plus a doctype repair."""
      # FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
      # a better url for Broadland?

      def _sanitisePostHtml(self, html):
          """Return html with the broken doctype declaration repaired.

          The page that comes back from the post to the broadland site has
          a doctype whose system identifier is missing its closing
          doublequote; that has to be fixed before BeautifulSoup sees it.
          """
          # As served - note that there is no doublequote before the final >
          broken_doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>'
          # The same declaration with the closing doublequote in place.
          fixed_doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'

          return html.replace(broken_doctype, fixed_doctype)
  class CamdenParser(BroadlandLike, PlanningExplorerParser):
      """Scraper for Camden: BroadlandLike settings with its own comments page."""

      # The %s is filled in with the application code.
      comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
  class CharnwoodParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Charnwood."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class CreweParser(PlanningExplorerParser):
      """Scraper for Crewe: different address column and post fields."""

      # The address is in the fifth cell of each results row.
      address_td_no = 4

      use_firefox_user_agent = True

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for Crewe's search form.

          search_date is sent as both ends of the received-date range, once
          as a DateChooser value and once as typed text for each end.
          """
          chooser_value = urllib.quote(
              '<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>'
              % search_date.timetuple()[:3])
          typed_date = search_date.strftime(date_format)

          crewe_fields = (
              ("drDateReceived:_ctl0_hidden", chooser_value),
              ("drDateReceivedxxctl0_input", typed_date),
              ("drDateReceived:_ctl1_hidden", chooser_value),
              ("drDateReceivedxxctl1_input", typed_date),
              ("cboNumRecs", "99999"),
              ("csbtnSearch", "Search"),
              )

          return urllib.urlencode(asp_args + crewe_fields)
  class EastStaffsParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for East Staffs."""

      # Zero-based cell positions in a results row.
      description_td_no = 1
      address_td_no = 4

      use_firefox_user_agent = True
  class EppingForestParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Epping Forest."""

      # Zero-based cell positions in a results row.
      description_td_no = 1
      address_td_no = 3

      use_firefox_user_agent = True
  class ForestHeathParser(BroadlandLike, PlanningExplorerParser):
      """Scraper for Forest Heath: the BroadlandLike settings unchanged."""
  class HackneyParser(PlanningExplorerParser):
      """Scraper for Hackney, which needs a second fetch to avoid paging."""
      # FIXME - This will only get the first ten records on this
      # day. Need to deal with paging.
      # NOTE(review): _modify_response below refetches with PS=99999, so the
      # FIXME above may be out of date - confirm.

      use_firefox_user_agent = True

      # Zero-based cell positions in a results row.
      description_td_no = 5
      address_td_no = 6

      def _modify_response(self, response):
          """Refetch the results url with the PS parameter set to 99999.

          This is done so that paging doesn't have to be dealt with.
          Returns the response to the new request.
          """
          scheme, netloc, path, query_string, fragment = urlparse.urlsplit(response.geturl())

          # Rebuild the query, swapping the value of every PS parameter
          # for 99999 and leaving everything else as it was.
          new_query_pairs = []
          for key, value in cgi.parse_qsl(query_string):
              if key == "PS":
                  value = "99999"
              new_query_pairs.append((key, value))

          new_url = urlparse.urlunsplit(
              (scheme, netloc, path, urllib.urlencode(new_query_pairs), fragment))

          return urllib2.urlopen(urllib2.Request(new_url, None, self._getHeaders()))

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for Hackney's search form.

          search_date is sent as both ends of the received-date range.
          """
          typed_date = search_date.strftime(date_format)

          hackney_fields = (
              ("ctl00", "DATE_RECEIVED"),
              ("rbGroup", "ctl05"),
              ("ctl07_input", typed_date),
              ("ctl08_input", typed_date),
              ("edrDateSelection", "1"),
              ("csbtnSearch", "Search"),
              )

          return urllib.urlencode(asp_args + hackney_fields)
  301. def _getPostData(self, asp_args, search_date):
  302. post_data = urllib.urlencode(asp_args + (
  303. ("ctl00", "DATE_RECEIVED"),
  304. ("rbGroup", "ctl05"),
  305. ("ctl07_input", search_date.strftime(date_format)),
  306. ("ctl08_input", search_date.strftime(date_format)),
  307. ("edrDateSelection", "1"),
  308. ("csbtnSearch", "Search"),
  309. ))
  310. return post_data
  class KennetParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Kennet."""

      # The address is in the fourth cell of each results row.
      address_td_no = 3

      use_firefox_user_agent = True
  class LincolnParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Lincoln."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class LiverpoolParser(PlanningExplorerParser):
      """Scraper for Liverpool, whose site differs in paths, table and form."""

      # Used as the comment url for every application.
      comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"

      use_firefox_user_agent = True
      use_referer = True

      results_table_attrs = {"xmlns:mvm":"http://www.mvm.co.uk"}

      info_url_path = "mvm/"
      search_url_path = "mvm/planningsearch.aspx"

      # Matches one piece of junk in an info url: a whitespace character or
      # a single-hex-digit character reference such as &#xA; or &#xD;.
      # Compiled once rather than on every call.
      _info_url_junk_re = re.compile(r"\s|&#x\w;")

      def _find_trs(self, results_table):
          """Return the application rows of results_table.

          Only trs with a class attribute of row0 or row1 are considered,
          and the first three of those are skipped.
          NOTE(review): the previous docstring spoke of skipping the first
          two rows, but the slice has always dropped three - confirm against
          the live page which is intended.
          """
          return results_table.findAll("tr", {"class":["row0", "row1"]})[3:]

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for Liverpool's search form.

          asp_args is the tuple of hidden ASP (name, value) pairs;
          search_date is the datetime.date used for both ends of the range.
          """
          post_data = urllib.urlencode(asp_args + (
                  ("dummy", "dummy field\tused for custom\tvalidator"),
                  ("drReceived$txtStart", search_date.strftime(date_format)),
                  ("drReceived$txtEnd", search_date.strftime(date_format)),
                  ("cboNumRecs", "99999"),
                  ("cmdSearch", "Search"),
                  ))

          return post_data

      def _sanitiseInfoUrl(self, url):
          """Return url with the rubbish the liverpool site puts in removed.

          That means:
          1) whitespace
          2) &#xA; and &#xD;
          """
          # Deleting each match directly avoids splitting on a pattern that
          # can match the empty string.
          return self._info_url_junk_re.sub('', url)
  # FIXME - Merton needs to be done here when it is back up.
  class MertonParser(PlanningExplorerParser):
      """Provisional scraper settings for Merton (see the FIXME above)."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class ShrewsburyParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Shrewsbury."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class SouthNorfolkParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for South Norfolk."""

      # Identify as firefox instead of the default urllib2 user agent.
      use_firefox_user_agent = True
  class SouthShropshireParser(PlanningExplorerParser):
      """Scraper for South Shropshire: email comments, dash-separated dates."""

      # Used as the comment url for every application.
      comments_email_address = "planning@southshropshire.gov.uk"

      info_url_path = "MVM/Online/PL/"

      use_firefox_user_agent = True

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for South Shropshire's form.

          search_date is sent as both ends of the range, in zero-padded
          day-month-year form and again, unpadded, as a start~end pair.
          """
          # This site wants dashes rather than the usual slashes.
          dashed_date = search_date.strftime("%d-%m-%Y")

          year, month, day = search_date.timetuple()[:3]
          date_parts = {"day":day, "month":month, "year":year}
          received_range = "%(day)d-%(month)d-%(year)d~%(day)d-%(month)d-%(year)d" % date_parts

          shropshire_fields = (
              ("edrDateSelection:htxtRange", "radRangeBetween"),
              ("cboDateList", "DATE_RECEIVED"),
              ("edrDateSelection:txtStart", dashed_date),
              ("edrDateSelection:txtEnd", dashed_date),
              ("edrDateSelection:txtDateReceived", received_range),
              ("cboNumRecs", "99999"),
              ("csbtnSearch", "Search"),
              )

          return urllib.urlencode(asp_args + shropshire_fields)
  class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
      """Scraper for South Tyneside: the BroadlandLike settings unchanged."""
      # Unlike the other BroadlandLike sites, there are postcodes :-)
  class StockportParser(PlanningExplorerParser):
      """Scraper for Stockport: email comments and its own date fields."""

      # Used as the comment url for every application.
      comments_email_address = "admin.dc@stockport.gov.uk"

      info_url_path = "MVM/Online/PL/"

      def _getPostData(self, asp_args, search_date):
          """Return the urlencoded post body for Stockport's search form.

          search_date is sent as both ends of the received-date range.
          """
          typed_date = search_date.strftime(date_format)

          stockport_fields = (
              ("drDateReceived:txtStart", typed_date),
              ("drDateReceived:txtEnd", typed_date),
              ("cboNumRecs", "99999"),
              ("csbtnSearch", "Search"),
              )

          return urllib.urlencode(asp_args + stockport_fields)
  class SwanseaParser(BroadlandLike, PlanningExplorerParser):
      """Scraper for Swansea: the BroadlandLike settings unchanged."""
      # Unlike the other BroadlandLike sites, there are postcodes :-)
  class TamworthParser(PlanningExplorerParser):
      """Scraper for Tamworth: email comments and a different info path."""

      # Used as the comment url for every application.
      comments_email_address = "planningadmin@tamworth.gov.uk"

      info_url_path = "MVM/Online/PL/"

      use_firefox_user_agent = True
  class TraffordParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for Trafford."""
      # There are no postcodes on the Trafford site.

      # The address is in the fourth cell of each results row.
      address_td_no = 3

      use_firefox_user_agent = True
  class WestOxfordshireParser(PlanningExplorerParser):
      """PlanningExplorer scraper settings for West Oxfordshire."""

      use_firefox_user_agent = True

      # Zero-based cell positions in a results row.
      description_td_no = 1
      address_td_no = 3
  if __name__ == '__main__':
      # Manual smoke test: uncomment exactly one of the parser lines below,
      # then run this file to print that authority's results for the date
      # given at the bottom.

      # NOTE - 04/11/2007 is a sunday
      # I'm using it to test that the scrapers behave on days with no apps.

      # Stays None unless one of the lines below is uncommented.
      parser = None

      #parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
      #parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
      #parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
      #parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
      #parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
      #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
      #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
      #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
      #parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
      #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
      #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
      #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
      #parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
      #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
      #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
      #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
      #parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
      #parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
      #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
      #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
      #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")

      if parser is None:
          # Without this check the print below fails with a NameError that
          # says nothing about what to do.
          raise SystemExit("No parser selected: uncomment one of the parser lines in the __main__ block.")

      # Parenthesised so that the single value prints the same way whether
      # print is a statement or a function.
      print(parser.getResults(22, 1, 2008))
  422. # To Do
  423. # Sort out paging:
  424. # South Shropshire - pages on 6
  425. # Investigate catching unavailable message:
  426. # Charnwood