diff --git a/trunk/python_scrapers/PlanningExplorer.py b/trunk/python_scrapers/PlanningExplorer.py index 5c3e2ec..ebdef7e 100644 --- a/trunk/python_scrapers/PlanningExplorer.py +++ b/trunk/python_scrapers/PlanningExplorer.py @@ -5,6 +5,12 @@ import cgi import re import datetime + +import cookielib + +cookie_jar = cookielib.CookieJar() + + from BeautifulSoup import BeautifulSoup from PlanningUtils import PlanningApplication, \ @@ -64,6 +70,8 @@ class PlanningExplorerParser: # If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!) fetch_info_page = False + asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>') + def _modify_response(self, response): """For most sites, we have managed to get all the apps on a single page by choosing the right parameters. @@ -90,7 +98,7 @@ class PlanningExplorerParser: """If an authority has info urls which are for some reason full of crap (like Broadland does), then this method should be overridden in order to tidy them up.""" - return url + return ''.join(url.split()) def _getHeaders(self): """If the authority requires any headers for the post request, @@ -194,16 +202,17 @@ class PlanningExplorerParser: get_request = urllib2.Request(self.search_url) get_response = urllib2.urlopen(get_request) + cookie_jar.extract_cookies(get_response, get_request) + html = get_response.read() # We need to find those ASP parameters such as __VIEWSTATE # so we can use them in the next POST - asp_args_regex = re.compile(']*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>') # re.findall gets us a list of key value pairs. # We want to concatenate it with a tuple, so we must # make it a tuple - asp_args = tuple(re.findall(asp_args_regex, html)) + asp_args = tuple(re.findall(self.asp_args_regex, html)) # The post data needs to be different for different councils # so we have a method on each council's scraper to make it. 
@@ -212,7 +221,7 @@ class PlanningExplorerParser: headers = self._getHeaders() request = urllib2.Request(self.search_url, post_data, headers) - + cookie_jar.add_cookie_header(request) post_response = urllib2.urlopen(request) # We have actually been returned here by an http302 object @@ -358,19 +367,36 @@ class CharnwoodParser(PlanningExplorerParser): class CreweParser(PlanningExplorerParser): use_firefox_user_agent = True - address_td_no = 4 + use_referer = True + + info_url_path = "Northgate/PlanningExplorer/Generic/" + search_url_path = "northgate/planningexplorer/generalsearch.aspx" + + results_table_attrs = {"class": "display_table"} def _getPostData(self, asp_args, search_date): year_month_day = search_date.timetuple()[:3] post_data = urllib.urlencode(asp_args + ( - ("drDateReceived:_ctl0_hidden", urllib.quote('' %year_month_day)), - ("drDateReceivedxxctl0_input", search_date.strftime(date_format)), - ("drDateReceived:_ctl1_hidden", urllib.quote('' %year_month_day)), - ("drDateReceivedxxctl1_input", search_date.strftime(date_format)), - ("cboNumRecs", "99999"), + ("txtApplicantName", ""), + ("txtAgentName", ""), + ("cboStreetReferenceNumber", ""), + ("txtProposal", ""), + ("cboWardCode", ""), + ("cboParishCode", ""), + ("cboApplicationTypeCode", ""), + ("cboDevelopmentTypeCode", ""), + ("cboStatusCode", ""), + ("cboSelectDateValue", "DATE_RECEIVED"), + ("cboMonths", "1"), + ("cboDays", "1"), + ("rbGroup", "rbRange"), + ("dateStart", search_date.strftime(date_format)), + ("dateEnd", search_date.strftime(date_format)), + ("edrDateSelection", ""), ("csbtnSearch", "Search"), - )) + ) + ) return post_data @@ -430,26 +456,72 @@ class HackneyParser(PlanningExplorerParser): return new_response - 
+#txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search& def _getPostData(self, asp_args, search_date): + """Note - using date registered here, not date received. There is too much time taken + between the council 'receiving' an app and 'registering' it for the latter to be useful.""" post_data = urllib.urlencode(asp_args + ( - ("ctl00", "DATE_RECEIVED"), + ("txtApplicationNumber", ""), + ("ctl00", "DATE_REGISTERED"), + ("ctl01", "1"), + ("ctl02", "1"), ("rbGroup", "ctl05"), + ("ctl07_hidden", ""), ("ctl07_input", search_date.strftime(date_format)), + ("ctl08_hidden", ""), ("ctl08_input", search_date.strftime(date_format)), ("edrDateSelection", "1"), + ("cboApplicationTypeCode", ""), + ("txtLocality", ""), + ("txtPostCode", ""), + ("txtPropertyName", ""), + ("txtPropertyNumber", ""), + ("txtSiteAddress", ""), + ("txtStreetName", ""), ("csbtnSearch", "Search"), - )) - + ) + ) return post_data - + class KennetParser(BroadlandLike, PlanningExplorerParser): comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s" class LincolnParser(PlanningExplorerParser): use_firefox_user_agent = True - results_table_attrs = {"class": "resultstable"} + use_referer = True + + results_table_attrs = {"class": "display_table"} + search_url_path = "northgate/planningexplorer/generalsearch.aspx" + info_url_path = "Northgate/PlanningExplorer/Generic/" + + + def _getPostData(self, asp_args, search_date): + post_data = urllib.urlencode(asp_args + ( + ("txtApplicationNumber", ""), + ("txtApplicantName", ""), + ("txtAgentName", ""), + ("cboApplicationTypeCode", ""), + ("cboStatusCode", ""), + ("txtPropertyName", ""), + ("txtPropertyNumber", ""), + ("cboStreetReferenceNumber", ""), + ("txtPostCode", ""), + ("cboLocality", ""), + 
("txtProposal", ""), + ("cboSelectDateValue", "DATE_REGISTERED"), + ("cboMonths", "1"), + ("rbGroup", "rbDay"), + ("cboDays", "10"), + ("dateStart", search_date.strftime(date_format)), + ("dateEnd", search_date.strftime(date_format)), + ("edrDateSelection", ""), + ("csbtnSearch", "Search"), + ) + ) + return post_data + + class LiverpoolParser(PlanningExplorerParser): comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk" use_firefox_user_agent = True @@ -639,9 +711,9 @@ if __name__ == '__main__': # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") - parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") +# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") -# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") + parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") # parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/") # parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/") @@ -655,7 +727,7 @@ if __name__ == '__main__': # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") # parser = MertonParser("London Borough of Merton", "Merton", 
"http://planning.merton.gov.uk") - print parser.getResults(4, 9, 2008) + print parser.getResults(9, 9, 2008) # To Do @@ -666,3 +738,7 @@ if __name__ == '__main__': # Charnwood # South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site... + +# Notes: + +# Since the change, Liverpool and Crewe look rather similar. They are also a little Broadlandlike. Maybe we can do some consolidation