# Automatically exported from code.google.com/p/planningalerts
#
# The following text was captured from the repository web viewer together
# with this file (translated from Italian):
#   "You cannot select more than 25 topics. Topics must start with a letter
#   or a number, can include dashes ('-') and can be up to 35 characters long."
#
# 913 lines
# 35 KiB

import cgi
import cookielib
import datetime
import re
import urllib
import urllib2
import urlparse

import BeautifulSoup
from BeautifulSoup import BeautifulSoup

cookie_jar = cookielib.CookieJar()

__auth__ = None

date_format = "%d/%m/%Y"
  14. def fixNewlines(text):
  15. # This can be used to sort out windows newlines
  16. return text.replace("\r\n","\n")
  17. # So what can a postcode look like then?
  18. # This list of formats comes from http://www.mailsorttechnical.com/frequentlyaskedquestions.cfm
  19. #AN NAA M1 1AA
  20. #ANN NAA M60 1NW
  21. #AAN NAA CR2 6XH
  22. #AANN NAA DN55 1PT
  23. #ANA NAA W1A 1HP
  24. #AANA NAA EC1A 1BB
  25. postcode_regex = re.compile("[A-Z][A-Z]?\d(\d|[A-Z])? ?\d[A-Z][A-Z]")
  26. def getPostcodeFromText(text, default_postcode="No Postcode"):
  27. """This function takes a piece of text and returns the first
  28. bit of it that looks like a postcode."""
  29. postcode_match = postcode_regex.search(text)
  30. return postcode_match.group() if postcode_match else default_postcode
  31. class PlanningAuthorityResults:
  32. """This class represents a set of results of a planning search.
  33. This should probably be separated out so that it can be used for
  34. authorities other than Cherwell.
  35. """
  36. def __init__(self, authority_name, authority_short_name):
  37. self.authority_name = authority_name
  38. self.authority_short_name = authority_short_name
  39. # this will be a list of PlanningApplication objects
  40. self.planning_applications = []
  41. def addApplication(self, application):
  42. self.planning_applications.append(application)
  43. def __repr__(self):
  44. return self.displayXML()
  45. def displayXML(self):
  46. """This should display the contents of this object in the planningalerts format.
  47. i.e. in the same format as this one:
  48. http://www.planningalerts.com/lambeth.xml
  49. """
  50. applications_bit = "".join([x.displayXML() for x in self.planning_applications])
  51. return u"""<?xml version="1.0" encoding="UTF-8"?>\n""" + \
  52. u"<planning>\n" +\
  53. u"<authority_name>%s</authority_name>\n" %self.authority_name +\
  54. u"<authority_short_name>%s</authority_short_name>\n" %self.authority_short_name +\
  55. u"<applications>\n" + applications_bit +\
  56. u"</applications>\n" +\
  57. u"</planning>\n"
  58. class PlanningApplication:
  59. def __init__(self):
  60. self.council_reference = None
  61. self.address = None
  62. self.postcode = None
  63. self.description = None
  64. self.info_url = None
  65. self.comment_url = None
  66. # expecting this as a datetime.date object
  67. self.date_received = None
  68. # If we can get them, we may as well include OSGB.
  69. # These will be the entirely numeric version.
  70. self.osgb_x = None
  71. self.osgb_y = None
  72. def __repr__(self):
  73. return self.displayXML()
  74. def is_ready(self):
  75. # This method tells us if the application is complete
  76. # Because of the postcode default, we can't really
  77. # check the postcode - make sure it is filled in when
  78. # you do the address.
  79. return self.council_reference \
  80. and self.address \
  81. and self.description \
  82. and self.info_url \
  83. and self.comment_url \
  84. and self.date_received
  85. def displayXML(self):
  86. #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
  87. if not self.postcode:
  88. self.postcode = getPostcodeFromText(self.address)
  89. contents = [
  90. u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference),
  91. u"<address><![CDATA[%s]]></address>" %(self.address),
  92. u"<postcode><![CDATA[%s]]></postcode>" %self.postcode,
  93. u"<description><![CDATA[%s]]></description>" %(self.description),
  94. u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url),
  95. u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url),
  96. u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format),
  97. ]
  98. if self.osgb_x:
  99. contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x))
  100. if self.osgb_y:
  101. contents.append(u"<osgb_y>%s</osgb_y>" %(self.osgb_y))
  102. return u"<application>\n%s\n</application>" %('\n'.join(contents))
  103. # Date format to enter into search boxes
  104. date_format = "%d/%m/%Y"
  105. # Regex for getting the application code
  106. # (needed for the comments url, when it exists)
  107. app_code_regex = re.compile("PARAM0=(\d*)")
  108. class PlanningExplorerParser:
  109. # If this authority doesn't have a comments page,
  110. # then set this email_address to an address for the
  111. # planning department, and it will be used in lieu of
  112. # a comments url.
  113. comments_email_address = None
  114. # These are the directories where the info urls, and search urls,
  115. # usually live underneath the base_url.
  116. # If these are different for a particular
  117. # authority, then they can be overridden in a subclass.
  118. info_url_path = "MVM/Online/Generic/"
  119. search_url_path = "MVM/Online/PL/GeneralSearch.aspx"
  120. # This is the most common place for comments urls to live
  121. # The %s will be filled in with an application code
  122. comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
  123. # Most authorities don't need the referer header on the post
  124. # request. If one does, override this in the subclass
  125. use_referer = False
  126. # Some authorities won't give us anything back if we use the
  127. # python urllib2 useragent string. In that case, override this
  128. # in a subclass to pretend to be firefox.
  129. use_firefox_user_agent = False
  130. # This is the most common css class of the table containing the
  131. # the search results. If it is different for a particular authority
  132. # it can be overridden in a subclass
  133. results_table_attrs = {"class": "ResultsTable"}
  134. # These are the most common column positions for the
  135. # council reference, the address, and the description
  136. # in the results table.
  137. # They should be overridden in subclasses if they are different
  138. # for a particular authority.
  139. reference_td_no = 0
  140. address_td_no = 1
  141. description_td_no = 2
  142. # In some cases we won't be able to get the full address/description/postcode without getting the info page for each app.
  143. # If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
  144. fetch_info_page = False
  145. asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')
  146. def _modify_response(self, response):
  147. """For most sites, we have managed to get all the apps on a
  148. single page by choosing the right parameters.
  149. If that hasn't been possible, override this method to get a
  150. new response object which has all the apps in one page.
  151. (See, for example, Hackney).
  152. """
  153. return response
  154. def _find_trs(self, results_table):
  155. """Normally, we just want a list of all the trs except the first one
  156. (which is usually a header).
  157. If the authority requires a different list of trs, override this method.
  158. """
  159. return results_table.findAll("tr")[1:]
  160. def _sanitisePostHtml(self, html):
  161. """This method can be overriden in subclasses if the
  162. html that comes back from the post request is bad, and
  163. needs tidying up before giving it to BeautifulSoup."""
  164. return html
  165. def _sanitiseInfoUrl(self, url):
  166. """If an authority has info urls which are for some reason full
  167. of crap (like Broadland does), then this method should be overridden
  168. in order to tidy them up."""
  169. return ''.join(url.split())
  170. def _getHeaders(self):
  171. """If the authority requires any headers for the post request,
  172. override this method returning a dictionary of header key to
  173. header value."""
  174. headers = {}
  175. if self.use_firefox_user_agent:
  176. headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10"
  177. if self.use_referer:
  178. headers["Referer"] = self.search_url
  179. return headers
  180. def _getPostData(self, asp_args, search_date):
  181. """Accepts asp_args (a tuple of key value pairs of the pesky ASP
  182. parameters, and search_date, a datetime.date object for the day
  183. we are searching for.
  184. This seems to be the most common set of post data which is needed
  185. for PlanningExplorer sites. It won't work for all of them, so
  186. will sometimes need to be overridden in a subclass.
  187. The parameter edrDateSelection is often not needed.
  188. It is needed by Charnwood though, so I've left it in
  189. to keep things simple.
  190. """
  191. year_month_day = search_date.timetuple()[:3]
  192. post_data = urllib.urlencode(asp_args + (
  193. ("_ctl0", "DATE_REGISTERED"),
  194. ("rbGroup", "_ctl5"),
  195. ("_ctl7_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
  196. ("_ctl8_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
  197. ("edrDateSelection", "1"),
  198. ("csbtnSearch", "Search"),
  199. ("cboNumRecs", "99999"),
  200. ))
  201. return post_data
  202. def _getAddress(self, tds, info_soup):
  203. # If this td contains a div, then the address is the
  204. # string in there - otherwise, use the string in the td.
  205. address_td = tds[self.address_td_no]
  206. if address_td.div is not None:
  207. address = address_td.div.string
  208. else:
  209. address = address_td.string
  210. return address
  211. def _getPostCode(self, info_soup):
  212. """In most cases, the postcode can be got from the address in
  213. the results table. Some councils put the address there without the
  214. postcode. In this case we will have to go to the info page to get
  215. the postcode. This should be done by overriding this method with
  216. one that parses the info page."""
  217. return getPostcodeFromText(self._current_application.address)
  218. def _getDescription(self, tds, info_soup):
  219. description_td = tds[self.description_td_no]
  220. if description_td.div is not None:
  221. # Mostly this is in a div
  222. # Use the empty string if the description is missing
  223. description = description_td.div.string or ""
  224. else:
  225. # But sometimes (eg Crewe) it is directly in the td.
  226. # Use the empty string if the description is missing
  227. description = description_td.string or ""
  228. return description
  229. def __init__(self,
  230. authority_name,
  231. authority_short_name,
  232. base_url,
  233. debug=False):
  234. self.authority_name = authority_name
  235. self.authority_short_name = authority_short_name
  236. self.base_url = base_url
  237. self.search_url = urlparse.urljoin(base_url, self.search_url_path)
  238. self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path)
  239. self.debug = debug
  240. self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
  241. def getResultsByDayMonthYear(self, day, month, year):
  242. search_date = datetime.date(year, month, day)
  243. # First do a get, to get some state
  244. get_request = urllib2.Request(self.search_url)
  245. get_response = urllib2.urlopen(get_request)
  246. cookie_jar.extract_cookies(get_response, get_request)
  247. html = get_response.read()
  248. # We need to find those ASP parameters such as __VIEWSTATE
  249. # so we can use them in the next POST
  250. # re.findall gets us a list of key value pairs.
  251. # We want to concatenate it with a tuple, so we must
  252. # make it a tuple
  253. asp_args = tuple(re.findall(self.asp_args_regex, html))
  254. # The post data needs to be different for different councils
  255. # so we have a method on each council's scraper to make it.
  256. post_data = self._getPostData(asp_args, search_date)
  257. headers = self._getHeaders()
  258. request = urllib2.Request(self.search_url, post_data, headers)
  259. cookie_jar.add_cookie_header(request)
  260. post_response = urllib2.urlopen(request)
  261. # We have actually been returned here by an http302 object
  262. # moved, and the response called post_response is really a get.
  263. # In some cases, we can't get the page size set high
  264. # until now. In that case, override _modify_response
  265. # so that we get back a response with all the apps on one page.
  266. # We pass in headers so that any
  267. post_response = self._modify_response(post_response)
  268. html = self._sanitisePostHtml(post_response.read())
  269. soup = BeautifulSoup(html)
  270. results_table = soup.find("table", attrs=self.results_table_attrs)
  271. # If there is no results table, then there were no apps on that day.
  272. if results_table:
  273. trs = self._find_trs(results_table)
  274. self._current_application = None
  275. # The first tr is just titles, cycle through the trs after that
  276. for tr in trs:
  277. self._current_application = PlanningApplication()
  278. # There is no need to search for the date_received, it's what
  279. # we searched for
  280. self._current_application.date_received = search_date
  281. tds = tr.findAll("td")
  282. self._current_application.council_reference = tds[self.reference_td_no].a.string
  283. relative_info_url = self._sanitiseInfoUrl(tds[self.reference_td_no].a['href'])
  284. self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
  285. # Fetch the info page if we need it, otherwise set it to None
  286. if self.fetch_info_page:
  287. # We need to quote the spaces in the info url
  288. info_request = urllib2.Request(urllib.quote(self._current_application.info_url, ":/&?="))
  289. info_soup = BeautifulSoup(urllib2.urlopen(info_request))
  290. else:
  291. info_soup = None
  292. # What about a comment url?
  293. # There doesn't seem to be one, so we'll use the email address
  294. if self.comments_email_address is not None:
  295. # We're using the email address, as there doesn't seem
  296. # to be a web form for comments
  297. self._current_application.comment_url = self.comments_email_address
  298. else:
  299. # This link contains a code which we need for the comments url
  300. # (on those sites that use it)
  301. application_code = app_code_regex.search(relative_info_url).groups()[0]
  302. relative_comments_url = self.comments_path %(application_code)
  303. self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
  304. self._current_application.address = self._getAddress(tds, info_soup)
  305. self._current_application.postcode = self._getPostCode(info_soup)
  306. self._current_application.description = self._getDescription(tds, info_soup)
  307. self._results.addApplication(self._current_application)
  308. return self._results
  309. def getResults(self, day, month, year):
  310. return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
  311. class BroadlandLike:
  312. # FIXME - BroadlandLike authorities don't have postcodes on their site, but
  313. # they do have grid references. We should use these.
  314. results_table_attrs = {"class": "display_table"}
  315. info_url_path = "Northgate/PlanningExplorer/Generic/"
  316. search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"
  317. use_firefox_user_agent = True
  318. use_referer = True
  319. def _getPostData(self, asp_args, search_date):
  320. post_data = urllib.urlencode(asp_args + (
  321. ("cboSelectDateValue", "DATE_RECEIVED"),
  322. ("rbGroup", "rbRange"),
  323. ("dateStart", search_date.strftime(date_format)),
  324. ("dateEnd", search_date.strftime(date_format)),
  325. ("cboNumRecs", "99999"),
  326. ("csbtnSearch", "Search"),
  327. ))
  328. return post_data
  329. def _sanitiseInfoUrl(self, url):
  330. """The broadland info urls arrive full of rubbish. This method tidies
  331. them up."""
  332. # We need to
  333. # 1) Remove whitespace
  334. # 2) Remove &#xA; and &#xD;
  335. ws_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
  336. return ''.join(ws_re.split(url))
  337. class BlackburnParser(PlanningExplorerParser):
  338. use_firefox_user_agent = True
  339. class BroadlandParser(BroadlandLike, PlanningExplorerParser):
  340. # FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
  341. # a better url for Broadland?
  342. def _sanitisePostHtml(self, html):
  343. """The page that comes back from the post for the broadland site
  344. has a broken doctype declaration. We need to tidy that up before
  345. giving it to BeautifulSoup."""
  346. # This is what it looks like - note the missing close doublequote
  347. #<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>
  348. # Split on the broken doctype and join with the doctype with
  349. # closing quote.
  350. html = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.join(html.split('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd>'))
  351. return html
  352. class CamdenParser(BroadlandLike, PlanningExplorerParser):
  353. comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
  354. class CharnwoodParser(PlanningExplorerParser):
  355. use_firefox_user_agent = True
  356. class CreweParser(PlanningExplorerParser):
  357. use_firefox_user_agent = True
  358. use_referer = True
  359. info_url_path = "Northgate/PlanningExplorer/Generic/"
  360. search_url_path = "northgate/planningexplorer/generalsearch.aspx"
  361. results_table_attrs = {"class": "display_table"}
  362. def _getPostData(self, asp_args, search_date):
  363. year_month_day = search_date.timetuple()[:3]
  364. post_data = urllib.urlencode(asp_args + (
  365. ("txtApplicantName", ""),
  366. ("txtAgentName", ""),
  367. ("cboStreetReferenceNumber", ""),
  368. ("txtProposal", ""),
  369. ("cboWardCode", ""),
  370. ("cboParishCode", ""),
  371. ("cboApplicationTypeCode", ""),
  372. ("cboDevelopmentTypeCode", ""),
  373. ("cboStatusCode", ""),
  374. ("cboSelectDateValue", "DATE_RECEIVED"),
  375. ("cboMonths", "1"),
  376. ("cboDays", "1"),
  377. ("rbGroup", "rbRange"),
  378. ("dateStart", search_date.strftime(date_format)),
  379. ("dateEnd", search_date.strftime(date_format)),
  380. ("edrDateSelection", ""),
  381. ("csbtnSearch", "Search"),
  382. )
  383. )
  384. return post_data
  385. class EastStaffsParser(PlanningExplorerParser):
  386. use_firefox_user_agent = True
  387. address_td_no = 4
  388. description_td_no = 1
  389. class EppingForestParser(PlanningExplorerParser):
  390. use_firefox_user_agent = True
  391. address_td_no = 3
  392. description_td_no = 1
  393. class ForestHeathParser(BroadlandLike, PlanningExplorerParser):
  394. pass
  395. class HackneyParser(PlanningExplorerParser):
  396. # FIXME - This will only get the first ten records on this
  397. # day. Need to deal with paging.
  398. use_firefox_user_agent = True
  399. address_td_no = 6
  400. description_td_no = 5
  401. def _modify_response(self, response):
  402. # In order to make sure we don't have to worry about any paging,
  403. # We'll fetch this url again with PS=99999.
  404. real_url_tuple = urlparse.urlsplit(response.geturl())
  405. query_string = real_url_tuple[3]
  406. # Get the query as a list of key, value pairs
  407. parsed_query_list = list(cgi.parse_qsl(query_string))
  408. # Go through the query string replacing any PS parameters
  409. # with PS=99999
  410. for i in range(len(parsed_query_list)):
  411. key, value = parsed_query_list[i]
  412. if key == "PS":
  413. value = "99999"
  414. parsed_query_list[i] = (key, value)
  415. new_query_string = urllib.urlencode(parsed_query_list)
  416. new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]
  417. new_url = urlparse.urlunsplit(new_url_tuple)
  418. new_request = urllib2.Request(new_url, None, self._getHeaders())
  419. new_response = urllib2.urlopen(new_request)
  420. return new_response
  421. #txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search&
  422. def _getPostData(self, asp_args, search_date):
  423. """Note - using date registered here, not date received. There is too much time taken
  424. between the council 'receiving' an app and 'registering' it for the latter to be useful."""
  425. post_data = urllib.urlencode(asp_args + (
  426. ("txtApplicationNumber", ""),
  427. ("ctl00", "DATE_REGISTERED"),
  428. ("ctl01", "1"),
  429. ("ctl02", "1"),
  430. ("rbGroup", "ctl05"),
  431. ("ctl07_hidden", ""),
  432. ("ctl07_input", search_date.strftime(date_format)),
  433. ("ctl08_hidden", ""),
  434. ("ctl08_input", search_date.strftime(date_format)),
  435. ("edrDateSelection", "1"),
  436. ("cboApplicationTypeCode", ""),
  437. ("txtLocality", ""),
  438. ("txtPostCode", ""),
  439. ("txtPropertyName", ""),
  440. ("txtPropertyNumber", ""),
  441. ("txtSiteAddress", ""),
  442. ("txtStreetName", ""),
  443. ("csbtnSearch", "Search"),
  444. )
  445. )
  446. return post_data
  447. class KennetParser(BroadlandLike, PlanningExplorerParser):
  448. comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
  449. class LincolnParser(PlanningExplorerParser):
  450. use_firefox_user_agent = True
  451. use_referer = True
  452. results_table_attrs = {"class": "display_table"}
  453. search_url_path = "northgate/planningexplorer/generalsearch.aspx"
  454. info_url_path = "Northgate/PlanningExplorer/Generic/"
  455. def _getPostData(self, asp_args, search_date):
  456. post_data = urllib.urlencode(asp_args + (
  457. ("txtApplicationNumber", ""),
  458. ("txtApplicantName", ""),
  459. ("txtAgentName", ""),
  460. ("cboApplicationTypeCode", ""),
  461. ("cboStatusCode", ""),
  462. ("txtPropertyName", ""),
  463. ("txtPropertyNumber", ""),
  464. ("cboStreetReferenceNumber", ""),
  465. ("txtPostCode", ""),
  466. ("cboLocality", ""),
  467. ("txtProposal", ""),
  468. ("cboSelectDateValue", "DATE_REGISTERED"),
  469. ("cboMonths", "1"),
  470. ("rbGroup", "rbDay"),
  471. ("cboDays", "10"),
  472. ("dateStart", search_date.strftime(date_format)),
  473. ("dateEnd", search_date.strftime(date_format)),
  474. ("edrDateSelection", ""),
  475. ("csbtnSearch", "Search"),
  476. )
  477. )
  478. return post_data
  479. class LiverpoolParser(PlanningExplorerParser):
  480. comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
  481. use_firefox_user_agent = True
  482. use_referer = True
  483. results_table_attrs = {"xmlns:mvm":"http://www.mvm.co.uk"}
  484. info_url_path = "mvm/"
  485. search_url_path = "mvm/planningsearch.aspx"
  486. def _find_trs(self, results_table):
  487. """In this case we are after all trs except the first two which have a
  488. class attribute row0 or row1."""
  489. return results_table.findAll("tr", {"class":["row0", "row1"]})[3:]
  490. def _getPostData(self, asp_args, search_date):
  491. post_data = urllib.urlencode(asp_args + (
  492. ("dummy", "dummy field\tused for custom\tvalidator"),
  493. ("drReceived$txtStart", search_date.strftime(date_format)),
  494. ("drReceived$txtEnd", search_date.strftime(date_format)),
  495. ("cboNumRecs", "99999"),
  496. ("cmdSearch", "Search"),
  497. ))
  498. return post_data
  499. def _sanitiseInfoUrl(self, url):
  500. """The liverpool info urls arrive full of rubbish. This method tidies
  501. them up."""
  502. # We need to
  503. # 1) Remove whitespace
  504. # 2) Remove &#xA; and &#xD;
  505. ws_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
  506. return ''.join(ws_re.split(url))
  507. class MertonParser(PlanningExplorerParser):
  508. use_firefox_user_agent = True
  509. fetch_info_page = True
  510. def _getAddress(self, tds, info_soup):
  511. return info_soup.find(text="Site Address").findNext("td").string.strip()
  512. def _getDescription(self, tds, info_soup):
  513. return info_soup.find(text="Development Proposal").findNext("td").string.strip()
  514. class ShrewsburyParser(PlanningExplorerParser):
  515. use_firefox_user_agent = True
  516. class BirminghamParser(PlanningExplorerParser):
  517. search_url_path = "PlanningExplorer/GeneralSearch.aspx"
  518. info_url_path = "PlanningExplorer/Generic/"
  519. comments_path = "PlanningExplorer/PLComments.aspx?pk=%s"
  520. use_firefox_user_agent = True
  521. use_referer = True
  522. results_table_attrs = {"class": "display_table"}
  523. def _getPostData(self, asp_args, search_date):
  524. post_data = urllib.urlencode(asp_args + (
  525. ("txtApplicationNumber", ""),
  526. ("cboApplicationTypeCode", ""),
  527. ("txtSiteAddress", ""),
  528. ("txtProposal", ""),
  529. ("cboWardCode", ""),
  530. ("cboConstituencyCode", ""),
  531. ("txtApplicantName", ""),
  532. ("txtAgentName", ""),
  533. ("cboDevelopmentTypeCode", ""),
  534. ("cboSelectDateValue", "DATE_REGISTERED"),
  535. ("cboMonths", "1"),
  536. ("cboDays", "10"),
  537. ("rbGroup", "rbRange"),
  538. ("dateStart", search_date.strftime(date_format)),
  539. ("dateEnd", search_date.strftime(date_format)),
  540. ("edrDateSelection", ""),
  541. ("csbtnSearch", "Search"),
  542. )
  543. )
  544. return post_data
  545. class SouthNorfolkParser(PlanningExplorerParser):
  546. use_firefox_user_agent = True
  547. class SouthShropshireParser(PlanningExplorerParser):
  548. comments_email_address = "planning@southshropshire.gov.uk"
  549. use_firefox_user_agent = True
  550. info_url_path = "MVM/Online/PL/"
  551. def _getPostData(self, asp_args, search_date):
  552. local_date_format = "%d-%m-%Y"
  553. year, month, day = search_date.timetuple()[:3]
  554. post_data = urllib.urlencode(asp_args + (
  555. ("edrDateSelection:htxtRange", "radRangeBetween"),
  556. ("cboDateList", "DATE_REGISTERED"),
  557. ("edrDateSelection:txtStart", search_date.strftime(local_date_format)),
  558. ("edrDateSelection:txtEnd", search_date.strftime(local_date_format)),
  559. ("edrDateSelection:txtDateReceived", "%(day)d-%(month)d-%(year)d~%(day)d-%(month)d-%(year)d" %({"day":day, "month":month, "year":year})),
  560. ("cboNumRecs", "99999"),
  561. ("csbtnSearch", "Search"),
  562. ))
  563. return post_data
  564. class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
  565. # Unlike the other BroadlandLike sites, there are postcodes :-)
  566. pass
  567. class StockportParser(PlanningExplorerParser):
  568. comments_email_address = "admin.dc@stockport.gov.uk"
  569. info_url_path = "MVM/Online/PL/"
  570. def _getPostData(self, asp_args, search_date):
  571. post_data = urllib.urlencode(asp_args + (
  572. ("drDateReceived:txtStart", search_date.strftime(date_format)),
  573. ("drDateReceived:txtEnd", search_date.strftime(date_format)),
  574. ("cboNumRecs", "99999"),
  575. ("csbtnSearch", "Search"),),
  576. )
  577. return post_data
  578. class SwanseaParser(BroadlandLike, PlanningExplorerParser):
  579. # Unlike the other BroadlandLike sites, there are postcodes :-)
  580. pass
  581. class TamworthParser(PlanningExplorerParser):
  582. comments_email_address = "planningadmin@tamworth.gov.uk"
  583. use_firefox_user_agent = True
  584. info_url_path = "MVM/Online/PL/"
  585. class TraffordParser(PlanningExplorerParser):
  586. # There are no postcodes on the Trafford site.
  587. use_firefox_user_agent = True
  588. address_td_no = 3
  589. class WestOxfordshireParser(PlanningExplorerParser):
  590. address_td_no = 3
  591. description_td_no = 1
  592. use_firefox_user_agent = True
  593. class WalthamForestParser(PlanningExplorerParser):
  594. search_url_path = "PlanningExplorer/GeneralSearch.aspx"
  595. info_url_path = "PlanningExplorer/Generic/"
  596. use_firefox_user_agent = True
  597. use_referer = True
  598. # I know - I should change this so that the attribute is not comments_email_address, but
  599. # something more general
  600. comments_email_address = "https://www1.walthamforest.gov.uk/webforms/plan_comments/"
  601. results_table_attrs = {"class": "display_table"}
  602. def _getPostData(self, asp_args, search_date):
  603. post_data = urllib.urlencode(asp_args + (
  604. ("txtApplicantName", ""),
  605. ("txtAgentName", ""),
  606. ("cboStreetReferenceNumber", ""),
  607. ("txtProposal", ""),
  608. ("cboWardCode", ""),
  609. ("cboParishCode", ""),
  610. ("cboApplicationTypeCode", ""),
  611. ("cboDevelopmentTypeCode", ""),
  612. ("cboStatusCode", ""),
  613. ("cboSelectDateValue", "DATE_REGISTERED"),
  614. ("cboMonths", "1"),
  615. ("cboDays", "10"),
  616. ("rbGroup", "rbRange"),
  617. ("dateStart", search_date.strftime(date_format)),
  618. ("dateEnd", search_date.strftime(date_format)),
  619. ("edrDateSelection", ""),
  620. ("csbtnSearch", "Search"),
  621. )
  622. )
  623. return post_data
  624. class ConwyParser(BroadlandLike, PlanningExplorerParser):
  625. search_url_path = "Northgate/planningexplorerenglish/generalsearch.aspx"
  626. info_url_path = "Northgate/PlanningExplorerEnglish/Generic/"
  627. comments_path = "Northgate/PlanningExplorerEnglish/PLComments.aspx?pk=%s"
  628. use_firefox_user_agent = True
  629. class MendipParser(BroadlandLike, PlanningExplorerParser):
  630. comments_email_address = "customerservices@mendip.gov.uk"
  631. # search_url_path = "northgate/planningexplorer/generalsearch.aspx"
  632. #&first=1&quick=1&search=&txtApplicationNumber=&txtApplicantName=&txtAgentName=&txtProposal=&txtSiteAddress=&txtStreetName=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=12%2F06%2F2009&dateEnd=12%2F06%2F2009&edrDateSelection=&csbtnSearch=Search
  633. #&txtApplicationNumber=&txtProposal=&txtSiteAddress=&cboWardCode=&cboParishCode=&cboApplicationTypeCode=&cboDevelopmentTypeCode=&cboStatusCode=&cboSelectDateValue=DATE_RECEIVED&cboMonths=1&cboDays=1&rbGroup=rbRange&dateStart=10%2F07%2F2008&dateEnd=20%2F07%2F2008&edrDateSelection=&csbtnSearch=Search
  634. #txtApplicantName=
  635. #txtAgentName=
  636. #cboStreetReferenceNumber=
  637. #txtProposal=
  638. #cboWardCode=
  639. #cboParishCode=
  640. #cboApplicationTypeCode=
  641. #cboDevelopmentTypeCode=
  642. #cboStatusCode=
  643. #cboSelectDateValue=DATE_RECEIVED
  644. #cboMonths=1
  645. #cboDays=1
  646. #rbGroup=rbRange
  647. #dateStart=01%2F03%2F2008
  648. #dateEnd=01%2F04%2F2008
  649. #edrDateSelection=
  650. #csbtnSearch=Search
  651. if __name__ == '__main__':
  652. # NOTE - 04/11/2007 is a sunday
  653. # I'm using it to test that the scrapers behave on days with no apps.
  654. # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
  655. # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
  656. # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
  657. # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
  658. # parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
  659. # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
  660. # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
  661. # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
  662. # parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
  663. # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
  664. # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
  665. # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
  666. # parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
  667. # parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
  668. # parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
  669. # parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
  670. # parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
  671. # parser = SwanseaParser("Swansea City and County Council", "Swansea", "http://www2.swansea.gov.uk/")
  672. # parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.91/")
  673. # parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
  674. # parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
  675. # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
  676. # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
  677. # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
  678. # parser = MendipParser("Mendip District Council", "Mendip", "http://planning.mendip.gov.uk/")
  679. parser = BirminghamParser("Birmingham City Council", "Birmingham", "http://eplanning.birmingham.gov.uk/Northgate/")
  680. print parser.getResults(27, 4, 2010)
# To Do

# Sort out paging:
# South Shropshire - pages on 6

# Investigate catching unavailable message:
# Charnwood

# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...

# Notes:

# Since they changed, Liverpool and Crewe look rather similar. They are also a little BroadlandLike. Maybe we can do some consolidation.