瀏覽代碼

Fixes for Lincoln and Crewe.

Make Hackney use date registered rather than date received.
import/raw
duncan.parkes 16 年之前
父節點
當前提交
8e40e8a961
共有 1 個文件被更改,包括 96 次插入20 次删除
  1. +96
    -20
      trunk/python_scrapers/PlanningExplorer.py

+ 96
- 20
trunk/python_scrapers/PlanningExplorer.py 查看文件

@@ -5,6 +5,12 @@ import cgi
import re
import datetime


import cookielib

cookie_jar = cookielib.CookieJar()


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
@@ -64,6 +70,8 @@ class PlanningExplorerParser:
# If fetch_info_page is set to true, then we need to get a copy of the info page and store it as an attribute on current_application (naughty!)
fetch_info_page = False

asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

def _modify_response(self, response):
"""For most sites, we have managed to get all the apps on a
single page by choosing the right parameters.
@@ -90,7 +98,7 @@ class PlanningExplorerParser:
"""If an authority has info urls which are for some reason full
of crap (like Broadland does), then this method should be overridden
in order to tidy them up."""
return url
return ''.join(url.split())

def _getHeaders(self):
"""If the authority requires any headers for the post request,
@@ -194,16 +202,17 @@ class PlanningExplorerParser:
get_request = urllib2.Request(self.search_url)
get_response = urllib2.urlopen(get_request)

cookie_jar.extract_cookies(get_response, get_request)

html = get_response.read()

# We need to find those ASP parameters such as __VIEWSTATE
# so we can use them in the next POST
asp_args_regex = re.compile('<input[^>]*name=\"(__[A-Z]*)\"[^>]*value=\"([^\"]*)\"[^>]*>')

# re.findall gets us a list of key value pairs.
# We want to concatenate it with a tuple, so we must
# make it a tuple
asp_args = tuple(re.findall(asp_args_regex, html))
asp_args = tuple(re.findall(self.asp_args_regex, html))

# The post data needs to be different for different councils
# so we have a method on each council's scraper to make it.
@@ -212,7 +221,7 @@ class PlanningExplorerParser:
headers = self._getHeaders()

request = urllib2.Request(self.search_url, post_data, headers)
cookie_jar.add_cookie_header(request)
post_response = urllib2.urlopen(request)

# We have actually been returned here by an http302 object
@@ -358,19 +367,36 @@ class CharnwoodParser(PlanningExplorerParser):

class CreweParser(PlanningExplorerParser):
use_firefox_user_agent = True
address_td_no = 4
use_referer = True

info_url_path = "Northgate/PlanningExplorer/Generic/"
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
results_table_attrs = {"class": "display_table"}

def _getPostData(self, asp_args, search_date):
year_month_day = search_date.timetuple()[:3]

post_data = urllib.urlencode(asp_args + (
("drDateReceived:_ctl0_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
("drDateReceivedxxctl0_input", search_date.strftime(date_format)),
("drDateReceived:_ctl1_hidden", urllib.quote('<DateChooser Value="%d%%2C%d%%2C%d"><ExpandEffects></ExpandEffects></DateChooser>' %year_month_day)),
("drDateReceivedxxctl1_input", search_date.strftime(date_format)),
("cboNumRecs", "99999"),
("txtApplicantName", ""),
("txtAgentName", ""),
("cboStreetReferenceNumber", ""),
("txtProposal", ""),
("cboWardCode", ""),
("cboParishCode", ""),
("cboApplicationTypeCode", ""),
("cboDevelopmentTypeCode", ""),
("cboStatusCode", ""),
("cboSelectDateValue", "DATE_RECEIVED"),
("cboMonths", "1"),
("cboDays", "1"),
("rbGroup", "rbRange"),
("dateStart", search_date.strftime(date_format)),
("dateEnd", search_date.strftime(date_format)),
("edrDateSelection", ""),
("csbtnSearch", "Search"),
))
)
)

return post_data

@@ -430,26 +456,72 @@ class HackneyParser(PlanningExplorerParser):

return new_response

#txtApplicationNumber=&ctl00=DATE_REGISTERED&ctl01=1&ctl02=1&rbGroup=ctl05&ctl07_hidden=&ctl07_input=28%2F08%2F2008&ctl08_hidden=&ctl08_input=28%2F08%2F2008&edrDateSelection=1&cboApplicationTypeCode=&txtLocality=&txtPostCode=&txtPropertyName=&txtPropertyNumber=&txtSiteAddress=&txtStreetName=&csbtnSearch=Search&
def _getPostData(self, asp_args, search_date):
"""Note - using date registered here, not date received. There is too much time taken
between the council 'receiving' an app and 'registering' it for the latter to be useful."""
post_data = urllib.urlencode(asp_args + (
("ctl00", "DATE_RECEIVED"),
("txtApplicationNumber", ""),
("ctl00", "DATE_REGISTERED"),
("ctl01", "1"),
("ctl02", "1"),
("rbGroup", "ctl05"),
("ctl07_hidden", ""),
("ctl07_input", search_date.strftime(date_format)),
("ctl08_hidden", ""),
("ctl08_input", search_date.strftime(date_format)),
("edrDateSelection", "1"),
("cboApplicationTypeCode", ""),
("txtLocality", ""),
("txtPostCode", ""),
("txtPropertyName", ""),
("txtPropertyNumber", ""),
("txtSiteAddress", ""),
("txtStreetName", ""),
("csbtnSearch", "Search"),
))

)
)
return post_data
class KennetParser(BroadlandLike, PlanningExplorerParser):
comments_path = "Northgate/PlanningExplorer/PLComments.aspx?pk=%s"
class LincolnParser(PlanningExplorerParser):
use_firefox_user_agent = True
results_table_attrs = {"class": "resultstable"}
use_referer = True

results_table_attrs = {"class": "display_table"}
search_url_path = "northgate/planningexplorer/generalsearch.aspx"
info_url_path = "Northgate/PlanningExplorer/Generic/"


def _getPostData(self, asp_args, search_date):
post_data = urllib.urlencode(asp_args + (
("txtApplicationNumber", ""),
("txtApplicantName", ""),
("txtAgentName", ""),
("cboApplicationTypeCode", ""),
("cboStatusCode", ""),
("txtPropertyName", ""),
("txtPropertyNumber", ""),
("cboStreetReferenceNumber", ""),
("txtPostCode", ""),
("cboLocality", ""),
("txtProposal", ""),
("cboSelectDateValue", "DATE_REGISTERED"),
("cboMonths", "1"),
("rbGroup", "rbDay"),
("cboDays", "10"),
("dateStart", search_date.strftime(date_format)),
("dateEnd", search_date.strftime(date_format)),
("edrDateSelection", ""),
("csbtnSearch", "Search"),
)
)
return post_data


class LiverpoolParser(PlanningExplorerParser):
comments_email_address = "planningandbuildingcontrol@liverpool.gov.uk"
use_firefox_user_agent = True
@@ -639,9 +711,9 @@ if __name__ == '__main__':
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/")
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/")
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/")
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/")
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/")
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/")
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/")
# parser = ShrewsburyParser("Shrewsbury and Atcham Borough Council", "Shrewsbury", "http://www2.shrewsbury.gov.uk/")
# parser = SouthNorfolkParser("South Norfolk Council", "South Norfolk", "http://planning.south-norfolk.gov.uk/")
@@ -655,7 +727,7 @@ if __name__ == '__main__':
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/")
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/")
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk")
print parser.getResults(4, 9, 2008)
print parser.getResults(9, 9, 2008)

# To Do

@@ -666,3 +738,7 @@ if __name__ == '__main__':
# Charnwood

# South Norfolk has no postcodes. I wonder if the postcodes are in the WAM site...

# Notes:

# Since the changed, Liverpool and Crewe look rather similar. They are also a little Broadlandlike. Maybe we can do some consolidation

Loading…
取消
儲存