Browse Source

Fixes for Lincoln and Crewe.

Make Hackney use date registered rather than date received.
master
duncan.parkes 16 years ago
parent
commit
d53a2f877f
1 changed files with 96 additions and 20 deletions
  1. +96
    -20
      python_scrapers/PlanningExplorer.py

+ 96
- 20
python_scrapers/PlanningExplorer.py View File

@@ -5,6 +5,12 @@ import cgi
import re import re
import datetime import datetime



import cookielib

cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup


from PlanningUtils import PlanningApplication, \ from PlanningUtils import PlanningApplication, \
@@ -64,6 +70,8 @@ class PlanningExplorerParser:
# If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!) # If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
fetch_info_page = False fetch_info_page = False


asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

def _modify_response(self, response): def _modify_response(self, response):
"""For most sites, we have managed to get all the apps on a """For most sites, we have managed to get all the apps on a
single page by choosing the right parameters. single page by choosing the right parameters.
@@ -90,7 +98,7 @@ class PlanningExplorerParser:
"""If an authority has info urls which are for some reason full """If an authority has info urls which are for some reason full
of crap (like Broadland does), then this method should be overridden of crap (like Broadland does), then this method should be overridden
in order to tidy them up.""" in order to tidy them up."""
return url
return ''.join(url.split())


def _getHeaders(self): def _getHeaders(self):
"""If the authority requires any headers for the post request, """If the authority requires any headers for the post request,
@@ -194,16 +202,17 @@ class PlanningExplorerParser:
get_request = urllib2.Request(self.search_url) get_request = urllib2.Request(self.search_url)
get_response = urllib2.urlopen(get_request) get_response = urllib2.urlopen(get_request)


cookie_jar.extract_cookies(get_response, get_request)

html = get_response.read() html = get_response.read()


# We need to find those ASP parameters such as __VIEWSTATE # We need to find those ASP parameters such as __VIEWSTATE
# so we can use them in the next POST # so we can use them in the next POST
asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')


# re.findall gets us a list of key value pairs. # re.findall gets us a list of key value pairs.
# We want to concatenate it with a tuple, so we must # We want to concatenate it with a tuple, so we must
# make it a tuple # make it a tuple
asp_args = tuple(re.findall(asp_args_regex, html))
asp_args = tuple(re.findall(self.asp_args_regex, html))


# The post data needs to be different for different councils # The post data needs to be different for different councils
# so we have a method on each council's scraper to make it. # so we have a method on each council's scraper to make it.
@@ -212,7 +221,7 @@ class PlanningExplorerParser:
headers = self._getHeaders() headers = self._getHeaders()


request = urllib2.Request(self.search_url, post_data, headers) request = urllib2.Request(self.search_url, post_data, headers)
cookie_jar.add_cookie_header(request)
post_response = urllib2.urlopen(request) post_response = urllib2.urlopen(request)


# We have actually been returned here by an http302 object # We have actually been returned here by an http302 object
@@ -358,19 +367,36 @@ class CharnwoodParser(PlanningExplorerParser):


class CreweParser(PlanningExplorerParser): class CreweParser(PlanningExplorerParser):
use_firefox_user_agent = True use_firefox_user_agent = True
address_td_no = 4
use_referer = True

info_url_path = "Northgate/PlanningExplorer/Generic/"
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
results_table_attrs = {"class": "display_table"}


def _getPostData(self, asp_args, search_date): def _getPostData(self, asp_args, search_date):
year_month_day = search_date.timetuple()[:3] year_month_day = search_date.timetuple()[:3]


post_data = urllib.urlencode(asp_args + ( post_data = urllib.urlencode(asp_args + (
("drDateReceived:_ctl0_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
("drDateReceivedxxctl0_input", search_date.strftime(date_format)),
("drDateReceived:_ctl1_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
("drDateReceivedxxctl1_input", search_date.strftime(date_format)),
("cboNumRecs", "99999"),
("txtApplicantName", ""),
("txtAgentName", ""),
("cboStreetReferenceNumber", ""),
("txtProposal", ""),
("cboWardCode", ""),
("cboParishCode", ""),
("cboApplicationTypeCode", ""),
("cboDevelopmentTypeCode", ""),
("cboStatusCode", ""),
("cboSelectDateValue", "DATE_RECEIVED"),
("cboMonths", "1"),
("cboDays", "1"),
("rbGroup", "rbRange"),
("dateStart", search_date.strftime(date_format)),
("dateEnd", search_date.strftime(date_format)),
("edrDateSelection", ""),
("csbtnSearch", "Search"), ("csbtnSearch", "Search"),
))
)
)


return post_data return post_data


@@ -430,26 +456,72 @@ class HackneyParser(PlanningExplorerParser):


return new_response return new_response


#txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search&
def _getPostData(self, asp_args, search_date): def _getPostData(self, asp_args, search_date):
"""Note - using date registered here, not date received. There is too much time taken
between the council 'receiving' an app and 'registering' it for the latter to be useful."""
post_data = urllib.urlencode(asp_args + ( post_data = urllib.urlencode(asp_args + (
("ctl00", "DATE_RECEIVED"),
("txtApplicationNumber", ""),
("ctl00", "DATE_REGISTERED"),
("ctl01", "1"),
("ctl02", "1"),
("rbGroup", "ctl05"), ("rbGroup", "ctl05"),
("ctl07_hidden", ""),
("ctl07_input", search_date.strftime(date_format)), ("ctl07_input", search_date.strftime(date_format)),
("ctl08_hidden", ""),
("ctl08_input", search_date.strftime(date_format)), ("ctl08_input", search_date.strftime(date_format)),
("edrDateSelection", "1"), ("edrDateSelection", "1"),
("cboApplicationTypeCode", ""),
("txtLocality", ""),
("txtPostCode", ""),
("txtPropertyName", ""),
("txtPropertyNumber", ""),
("txtSiteAddress", ""),
("txtStreetName", ""),
("csbtnSearch", "Search"), ("csbtnSearch", "Search"),
))

)
)
return post_data return post_data
class KennetParser(BroadlandLike, PlanningExplorerParser): class KennetParser(BroadlandLike, PlanningExplorerParser):
comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s" comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
class LincolnParser(PlanningExplorerParser): class LincolnParser(PlanningExplorerParser):
use_firefox_user_agent = True use_firefox_user_agent = True
results_table_attrs = {"class": "resultstable"}
use_referer = True

results_table_attrs = {"class": "display_table"}
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
info_url_path = "Northgate/PlanningExplorer/Generic/"


def _getPostData(self, asp_args, search_date):
post_data = urllib.urlencode(asp_args + (
("txtApplicationNumber", ""),
("txtApplicantName", ""),
("txtAgentName", ""),
("cboApplicationTypeCode", ""),
("cboStatusCode", ""),
("txtPropertyName", ""),
("txtPropertyNumber", ""),
("cboStreetReferenceNumber", ""),
("txtPostCode", ""),
("cboLocality", ""),
("txtProposal", ""),
("cboSelectDateValue", "DATE_REGISTERED"),
("cboMonths", "1"),
("rbGroup", "rbDay"),
("cboDays", "10"),
("dateStart", search_date.strftime(date_format)),
("dateEnd", search_date.strftime(date_format)),
("edrDateSelection", ""),
("csbtnSearch", "Search"),
)
)
return post_data


class LiverpoolParser(PlanningExplorerParser): class LiverpoolParser(PlanningExplorerParser):
comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk" comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
use_firefox_user_agent = True use_firefox_user_agent = True
@@ -639,9 +711,9 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
# parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/") # parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
# parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/") # parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
@@ -655,7 +727,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(4, 9, 2008)
print parser.getResults(9, 9, 2008)


# To Do # To Do


@@ -666,3 +738,7 @@ if __name__ == '__main__':
# Charnwood # Charnwood


# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site... # South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...

# Notes:

# Since the changed, Liverpool and Crewe look rather similar. They are also a little Broadlandlike. Maybe we can do some consolidation

Loading…
Cancel
Save