diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py
new file mode 100644
index 0000000..b96bd6e
--- /dev/null
+++ b/trunk/python_scrapers/PlanningExplorer.py
@@ -0,0 +1,568 @@
+import urllib2
+import urllib
+import urlparse
+import cgi
+import re
+import datetime
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+# Date format to enter into search boxes
+date_format = "%d/%m/%Y"
+
+# Regex for getting the application code
+# (needed for the comments url, when it exists)
+app_code_regex = re.compile("PARAM0=(\d*)")
+
+
+class PlanningExplorerParser:
+    # If this authority doesn't have a comments page,
+    # then set comments_email_address to an address for the
+    # planning department, and it will be used in lieu of
+    # a comments url.
+    comments_email_address = None
+
+    # These are the directories where the info urls and search urls
+    # usually live underneath the base_url.
+    # If these are different for a particular authority,
+    # they can be overridden in a subclass.
+    info_url_path = "MVM/Online/Generic/"
+    search_url_path = "MVM/Online/PL/GeneralSearch.aspx"
+
+    # This is the most common place for comments urls to live.
+    # The %s will be filled in with an application code.
+    comments_path = "MVM/Online/PL/PLComments.aspx?pk=%s"
+
+    # Most authorities don't need the referer header on the post
+    # request. If one does, override this in the subclass.
+    use_referer = False
+
+    # Some authorities won't give us anything back if we use the
+    # python urllib2 user agent string. In that case, override this
+    # in a subclass to pretend to be firefox.
+    use_firefox_user_agent = False
+
+    # This is the most common css class of the table containing the
+    # search results. If it is different for a particular authority,
+    # it can be overridden in a subclass.
+    results_table_attrs = {"class": "ResultsTable"}
+
+    # These are the most common column positions for the
+    # council reference, the address, and the description
+    # in the results table.
+    # They should be overridden in subclasses if they are different
+    # for a particular authority.
+    reference_td_no = 0
+    address_td_no = 1
+    description_td_no = 2
+
+    def _modify_response(self, response):
+        """For most sites, we have managed to get all the apps on a
+        single page by choosing the right parameters.
+        If that hasn't been possible, override this method to get a
+        new response object which has all the apps in one page.
+        (See, for example, Hackney.)
+        """
+        return response
+
+    def _find_trs(self, results_table):
+        """Normally, we just want a list of all the trs except the first one
+        (which is usually a header).
+        If the authority requires a different list of trs, override this method.
+ """ + return results_table.findAll("tr")[1:] + + def _sanitisePostHtml(self, html): + """This method can be overriden in subclasses if the + html that comes back from the post request is bad, and + needs tidying up before giving it to BeautifulSoup.""" + return html + + def _sanitiseInfoUrl(self, url): + """If an authority has info urls which are for some reason full + of crap (like Broadland does), then this method should be overridden + in order to tidy them up.""" + return url + + def _getHeaders(self): + """If the authority requires any headers for the post request, + override this method returning a dictionary of header key to + header value.""" + headers = {} + + if self.use_firefox_user_agent: + headers["User-Agent"] = "Mozilla/5.0 (X11; U; Linux i686; en-GB; rv:1.8.1.10) Gecko/20071126 Ubuntu/7.10 (gutsy) Firefox/2.0.0.10" + + if self.use_referer: + headers["Referer"] = self.search_url + + return headers + + def _getPostData(self, asp_args, search_date): + """Accepts asp_args (a tuple of key value pairs of the pesky ASP + parameters, and search_date, a datetime.date object for the day + we are searching for. + + This seems to be the most common set of post data which is needed + for PlanningExplorer sites. It won't work for all of them, so + will sometimes need to be overridden in a subclass. + + The parameter edrDateSelection is often not needed. + It is needed by Charnwood though, so I've left it in + to keep things simple. + """ + year_month_day = search_date.timetuple()[:3] + + post_data = urllib.urlencode(asp_args + ( + ("_ctl0", "DATE_RECEIVED"), + ("rbGroup", "_ctl5"), + ("_ctl7_hidden", urllib.quote('' %year_month_day)), + ("_ctl8_hidden", urllib.quote('' %year_month_day)), + ("edrDateSelection", "1"), + ("csbtnSearch", "Search"), + ("cboNumRecs", "99999"), + )) + + return post_data + + def _getPostCode(self): + """In most cases, the postcode can be got from the address in + the results table. Some councils put the address there without the + postcode. In this case we will have to go to the info page to get + the postcode. This should be done by overriding this method with + one that parses the info page.""" + + return getPostcodeFromText(self._current_application.address) + + def __init__(self, + authority_name, + authority_short_name, + base_url, + debug=False): + + self.authority_name = authority_name + self.authority_short_name = authority_short_name + self.base_url = base_url + + self.search_url = urlparse.urljoin(base_url, self.search_url_path) + self.info_url_base = urlparse.urljoin(self.base_url, self.info_url_path) + + self.debug = debug + + self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) + + def getResultsByDayMonthYear(self, day, month, year): + search_date = datetime.date(year, month, day) + + # First do a get, to get some state + get_request = urllib2.Request(self.search_url) + get_response = urllib2.urlopen(get_request) + + html = get_response.read() + + # We need to find those ASP parameters such as __VIEWSTATE + # so we can use them in the next POST + asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>') + + # re.findall gets us a list of key value pairs. + # We want to concatenate it with a tuple, so we must + # make it a tuple + asp_args = tuple(re.findall(asp_args_regex, html)) + + # The post data needs to be different for different councils + # so we have a method on each council's scraper to make it. 
+        post_data = self._getPostData(asp_args, search_date)
+
+        headers = self._getHeaders()
+
+        request = urllib2.Request(self.search_url, post_data, headers)
+        post_response = urllib2.urlopen(request)
+
+        # The server answers the POST with an HTTP 302 (Object Moved)
+        # redirect, so post_response is really the response to the
+        # follow-up GET.
+
+        # In some cases, we can't get the page size set high
+        # until now. In that case, override _modify_response
+        # so that we get back a response with all the apps on one page.
+        # (The override can reuse self._getHeaders() if it needs to make
+        # another request - see Hackney.)
+        post_response = self._modify_response(post_response)
+
+        html = self._sanitisePostHtml(post_response.read())
+
+        soup = BeautifulSoup(html)
+
+        results_table = soup.find("table", attrs=self.results_table_attrs)
+
+        # If there is no results table, then there were no apps on that day.
+        if results_table:
+            trs = self._find_trs(results_table)
+
+            self._current_application = None
+
+            # _find_trs has already dropped the header row,
+            # so everything left is an application.
+            for tr in trs:
+                self._current_application = PlanningApplication()
+
+                # There is no need to search for the date_received, it's what
+                # we searched for
+                self._current_application.date_received = search_date
+
+                tds = tr.findAll("td")
+
+                for td_no in range(len(tds)):
+                    if td_no == self.reference_td_no:
+                        # This td contains the reference number and a link to details
+                        self._current_application.council_reference = tds[td_no].a.string
+
+                        relative_info_url = self._sanitiseInfoUrl(tds[td_no].a['href'])
+
+                        self._current_application.info_url = urlparse.urljoin(self.info_url_base, relative_info_url)
+
+                        # Now the comment url. If this authority has no web
+                        # form for comments, fall back to the planning
+                        # department's email address.
+                        if self.comments_email_address is not None:
+                            self._current_application.comment_url = self.comments_email_address
+                        else:
+                            # The info link contains a code which we need for
+                            # the comments url (on those sites that use it)
+                            application_code = app_code_regex.search(relative_info_url).groups()[0]
+
+                            relative_comments_url = self.comments_path %(application_code)
+                            self._current_application.comment_url = urlparse.urljoin(self.base_url, relative_comments_url)
+
+                    elif td_no == self.address_td_no:
+                        # If this td contains a div, then the address is the
+                        # string in there - otherwise, use the string in the td.
+                        if tds[td_no].div is not None:
+                            address = tds[td_no].div.string
+                        else:
+                            address = tds[td_no].string
+
+                        self._current_application.address = address
+                        self._current_application.postcode = self._getPostCode()
+
+                    elif td_no == self.description_td_no:
+                        if tds[td_no].div is not None:
+                            # Mostly the description is in a div.
+                            # Use the empty string if it is missing.
+                            description = tds[td_no].div.string or ""
+                        else:
+                            # But sometimes (eg Crewe) it is directly in the td.
+                            # Use the empty string if it is missing.
+                            description = tds[td_no].string or ""
+
+                        self._current_application.description = description
+
+                self._results.addApplication(self._current_application)
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+class BroadlandLike:
+    # FIXME - BroadlandLike authorities don't have postcodes on their site,
+    # but they do have grid references. We should use these.
+
+    results_table_attrs = {"class": "display_table"}
+
+    info_url_path = "Northgate/PlanningExplorer/Generic/"
+    search_url_path = "Northgate/PlanningExplorer/GeneralSearch.aspx"
+
+    use_firefox_user_agent = True
+    use_referer = True
+
+    def _getPostData(self, asp_args, search_date):
+        post_data = urllib.urlencode(asp_args + (
+            ("cboSelectDateValue", "DATE_RECEIVED"),
+            ("rbGroup", "rbRange"),
+            ("dateStart", search_date.strftime(date_format)),
+            ("dateEnd", search_date.strftime(date_format)),
+            ("cboNumRecs", "99999"),
+            ("csbtnSearch", "Search"),
+        ))
+
+        return post_data
+
+    def _sanitiseInfoUrl(self, url):
+        """The broadland info urls arrive full of rubbish. This method tidies
+        them up."""
+
+        # We need to
+        # 1) Remove whitespace
+        # 2) Remove the &#xA; and &#xD; character entities
+        ws_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
+
+        return ''.join(ws_re.split(url))
+
+
+class BlackburnParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+class BroadlandParser(BroadlandLike, PlanningExplorerParser):
+    # FIXME - is http://secure.broadland.gov.uk/mvm/Online/PL/GeneralSearch.aspx
+    # a better url for Broadland?
+
+    def _sanitisePostHtml(self, html):
+        """The page that comes back from the post for the broadland site
+        has a broken doctype declaration. We need to tidy that up before
+        giving it to BeautifulSoup."""
+
+        # This is what it looks like - note the missing close doublequote:
+        #
+
+        # Split on the broken doctype and join with the doctype with
+        # the closing quote.
+        html = ''.join(html.split(''))
+
+        return html
+
+class CamdenParser(BroadlandLike, PlanningExplorerParser):
+    comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
+
+class CharnwoodParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+class CreweParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+    address_td_no = 4
+
+    def _getPostData(self, asp_args, search_date):
+        year_month_day = search_date.timetuple()[:3]
+
+        post_data = urllib.urlencode(asp_args + (
+            ("drDateReceived:_ctl0_hidden", urllib.quote('' %year_month_day)),
+            ("drDateReceived:_ctl0_input", search_date.strftime(date_format)),
+            ("drDateReceived:_ctl1_hidden", urllib.quote('' %year_month_day)),
+            ("drDateReceived:_ctl1_input", search_date.strftime(date_format)),
+            ("cboNumRecs", "99999"),
+            ("csbtnSearch", "Search"),
+        ))
+
+        return post_data
+
+
+class EastStaffsParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+    address_td_no = 4
+    description_td_no = 1
+
+
+class EppingForestParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+    address_td_no = 3
+    description_td_no = 1
+
+class ForestHeathParser(BroadlandLike, PlanningExplorerParser):
+    pass
+
+class HackneyParser(PlanningExplorerParser):
+    # FIXME - This will only get the first ten records on this
+    # day. Need to deal with paging.
+
+    use_firefox_user_agent = True
+
+    address_td_no = 6
+    description_td_no = 5
+
+    def _modify_response(self, response):
+        # In order to make sure we don't have to worry about any paging,
+        # we'll fetch this url again with PS=99999.
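+        # For illustration only (url invented), the rewrite below turns
+        #     .../results.aspx?FT=...&PS=10&PG=1
+        # into
+        #     .../results.aspx?FT=...&PS=99999&PG=1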
+        real_url_tuple = urlparse.urlsplit(response.geturl())
+
+        query_string = real_url_tuple[3]
+
+        # Get the query as a list of key, value pairs
+        parsed_query_list = list(cgi.parse_qsl(query_string))
+
+        # Go through the query string replacing any PS parameter
+        # with PS=99999
+        for i in range(len(parsed_query_list)):
+            key, value = parsed_query_list[i]
+
+            if key == "PS":
+                value = "99999"
+                parsed_query_list[i] = (key, value)
+
+        new_query_string = urllib.urlencode(parsed_query_list)
+
+        new_url_tuple = real_url_tuple[:3] + (new_query_string,) + real_url_tuple[4:]
+
+        new_url = urlparse.urlunsplit(new_url_tuple)
+        new_request = urllib2.Request(new_url, None, self._getHeaders())
+        new_response = urllib2.urlopen(new_request)
+
+        return new_response
+
+    def _getPostData(self, asp_args, search_date):
+        post_data = urllib.urlencode(asp_args + (
+            ("ctl00", "DATE_RECEIVED"),
+            ("rbGroup", "ctl05"),
+            ("ctl07_input", search_date.strftime(date_format)),
+            ("ctl08_input", search_date.strftime(date_format)),
+            ("edrDateSelection", "1"),
+            ("csbtnSearch", "Search"),
+        ))
+
+        return post_data
+
+class KennetParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+    address_td_no = 3
+
+class LincolnParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+class LiverpoolParser(PlanningExplorerParser):
+    comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
+    use_firefox_user_agent = True
+    use_referer = True
+
+    results_table_attrs = {"xmlns:mvm": "http://www.mvm.co.uk"}
+
+    info_url_path = "mvm/"
+    search_url_path = "mvm/planningsearch.aspx"
+
+    def _find_trs(self, results_table):
+        """In this case we want the trs which have a class attribute of
+        row0 or row1, skipping the first three (which are headers)."""
+        return results_table.findAll("tr", {"class": ["row0", "row1"]})[3:]
+
+    def _getPostData(self, asp_args, search_date):
+        post_data = urllib.urlencode(asp_args + (
+            ("dummy", "dummy field\tused for custom\tvalidator"),
+            ("drReceived$txtStart", search_date.strftime(date_format)),
+            ("drReceived$txtEnd", search_date.strftime(date_format)),
+            ("cboNumRecs", "99999"),
+            ("cmdSearch", "Search"),
+        ))
+
+        return post_data
+
+    def _sanitiseInfoUrl(self, url):
+        """The liverpool info urls arrive full of rubbish. This method tidies
+        them up."""
+
+        # We need to
+        # 1) Remove whitespace
+        # 2) Remove the &#xA; and &#xD; character entities
+        ws_re = re.compile("(?:(?:\s)|(?:&#x\w;))*")
+
+        return ''.join(ws_re.split(url))
+
+# FIXME - Merton, Shrewsbury, and South Norfolk need to be done here.
+# All are down today...
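+
+# A minimal sketch of what a new authority usually needs (the class below
+# is hypothetical, for illustration only - it is not a supported council):
+# pretend to be firefox if the site rejects urllib2's user agent, and
+# override the column numbers if the results table is laid out differently.
+#
+#class ExampleParser(PlanningExplorerParser):
+#    use_firefox_user_agent = True
+#    address_td_no = 3
+#    description_td_no = 1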
+
+class SouthNorfolkParser(PlanningExplorerParser):
+    use_firefox_user_agent = True
+
+class SouthShropshireParser(PlanningExplorerParser):
+    comments_email_address = "planning@southshropshire.gov.uk"
+    use_firefox_user_agent = True
+    info_url_path = "MVM/Online/PL/"
+
+    def _getPostData(self, asp_args, search_date):
+        local_date_format = "%d-%m-%Y"
+        year, month, day = search_date.timetuple()[:3]
+
+        post_data = urllib.urlencode(asp_args + (
+            ("edrDateSelection:htxtRange", "radRangeBetween"),
+            ("cboDateList", "DATE_RECEIVED"),
+            ("edrDateSelection:txtStart", search_date.strftime(local_date_format)),
+            ("edrDateSelection:txtEnd", search_date.strftime(local_date_format)),
+            ("edrDateSelection:txtDateReceived", "%(day)d-%(month)d-%(year)d~%(day)d-%(month)d-%(year)d" %({"day": day, "month": month, "year": year})),
+            ("cboNumRecs", "99999"),
+            ("csbtnSearch", "Search"),
+        ))
+
+        return post_data
+
+class SouthTynesideParser(BroadlandLike, PlanningExplorerParser):
+    # Unlike the other BroadlandLike sites, there are postcodes :-)
+    pass
+
+
+class StockportParser(PlanningExplorerParser):
+    comments_email_address = "admin.dc@stockport.gov.uk"
+    info_url_path = "MVM/Online/PL/"
+
+    def _getPostData(self, asp_args, search_date):
+        post_data = urllib.urlencode(asp_args + (
+            ("drDateReceived:txtStart", search_date.strftime(date_format)),
+            ("drDateReceived:txtEnd", search_date.strftime(date_format)),
+            ("cboNumRecs", "99999"),
+            ("csbtnSearch", "Search"),
+        ))
+
+        return post_data
+
+# FIXME - should add Swansea, but it is currently down
+
+class TamworthParser(PlanningExplorerParser):
+    comments_email_address = "planningadmin@tamworth.gov.uk"
+    use_firefox_user_agent = True
+    info_url_path = "MVM/Online/PL/"
+
+class TraffordParser(PlanningExplorerParser):
+    # There are no postcodes on the Trafford site.
+    use_firefox_user_agent = True
+    address_td_no = 3
+
+class WestOxfordshireParser(PlanningExplorerParser):
+    address_td_no = 3
+    description_td_no = 1
+
+    use_firefox_user_agent = True
+
+if __name__ == '__main__':
+    # NOTE - 04/11/2007 is a Sunday
+    # I'm using it to test that the scrapers behave on days with no apps.
+
+    #parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/")
+    #parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/")
+    #parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/")
+    #parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
+    #parser = CreweParser("Crewe and Nantwich Borough Council", "Crewe and Nantwich", "http://portal.crewe-nantwich.gov.uk/")
+    #parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
+    #parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
+    #parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
+    parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
+    #parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
+    #parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
+    #parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
+    #parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
+    #parser = SouthShropshireParser("South Shropshire District Council", "South Shropshire", "http://194.201.44.102/")
+    #parser = SouthTynesideParser("South Tyneside Council", "South Tyneside", "http://poppy.southtyneside.gov.uk/")
+    #parser = StockportParser("Stockport Metropolitan District Council", "Stockport", "http://s1.stockport.gov.uk/council/eed/dc/planning/")
+    #parser = TamworthParser("Tamworth Borough Council", "Tamworth", "http://80.1.64.77/")
+    #parser = TraffordParser("Trafford Council", "Trafford", "http://planning.trafford.gov.uk/")
+    #parser = WestOxfordshireParser("West Oxfordshire District Council", "West Oxfordshire", "http://planning.westoxon.gov.uk/")
+
+    print parser.getResults(1, 11, 2007)
+
+# To Do
+
+# Sort out paging:
+# South Shropshire - pages on 6
+
+# Investigate catching unavailable message:
+# Charnwood
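+
+# There are two entry points: getResultsByDayMonthYear returns the
+# PlanningAuthorityResults object itself, while getResults stringifies it
+# via displayXML. A minimal sketch of the former (using Charnwood's
+# details from the list above):
+#
+#    parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/")
+#    results = parser.getResultsByDayMonthYear(1, 11, 2007)
+#    print results.displayXML()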