From fa73ab577a2f6d06cf6c29e5f3514dac4ebc6fae Mon Sep 17 00:00:01 2001
From: "duncan.parkes"
Date: Tue, 12 Aug 2008 17:17:45 +0000
Subject: [PATCH] Add scraper for Westminster.

---
 trunk/python_scrapers/OtherFilesToCopy.csv |   1 +
 trunk/python_scrapers/SitesToGenerate.csv  |   1 +
 trunk/python_scrapers/WAM.py               |   6 +-
 trunk/python_scrapers/Westminster.py       | 112 +++++++++++++++++++++
 4 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100644 trunk/python_scrapers/Westminster.py

diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index fc33531..4914f1c 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -51,3 +51,4 @@
 "KingstonUponThames.py", "420"
 "Hounslow.py", "420"
 "Harrow.py", "420"
+"Westminster.py", "420"
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index 023b500..1c90dbf 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -255,3 +255,4 @@
 "Royal Borough of Kingston upon Thames", "Kingston upon Thames", "", "KingstonUponThames", "KingstonParser"
 "London Borough of Hounslow", "Hounslow", "", "Hounslow", "HounslowParser"
 "London Borough of Harrow", "Harrow", "", "Harrow", "HarrowParser"
+"Westminster City Council", "Westminster", "", "Westminster", "WestminsterParser"
diff --git a/trunk/python_scrapers/WAM.py b/trunk/python_scrapers/WAM.py
index 082574e..589e36d 100644
--- a/trunk/python_scrapers/WAM.py
+++ b/trunk/python_scrapers/WAM.py
@@ -167,7 +167,7 @@ class BraintreeParser(WAMParser):
 if __name__ == '__main__':
     #parser = BraintreeParser("Braintree", "Braintree", "http://planningapp.braintree.gov.uk/WAM1/weeklyApplications.do", debug=True)
     # Camden
-    parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)
+#    parser = WAMParser("Castle Point", "Castle Point", "http://wam.castlepoint.gov.uk/WAM/pas/searchApplications.do")#, debug=True)
     #Chichester - Done as PublicAccess
     #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
     #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
@@ -178,9 +178,9 @@ if __name__ == '__main__':
     #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
     #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
-    #parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
+    parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
 
-    print parser.getResults(20,5,2008)
+    print parser.getResults(1,8,2008)
 
 # Left to fix
 
diff --git a/trunk/python_scrapers/Westminster.py b/trunk/python_scrapers/Westminster.py
new file mode 100644
index 0000000..743e721
--- /dev/null
+++ b/trunk/python_scrapers/Westminster.py
@@ -0,0 +1,112 @@
+"""
+This is the screenscraper for Westminster City Council.
+
+I have just noticed that there is a PublicAccess site underneath all this,
+but it only has the applications for which they are accepting comments,
+so I think we may as well use this url and get the lot...
+
+This is the PublicAccess url:
+http://publicaccess.westminster.gov.uk/publicaccess/
+"""
+
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d%%2F%m%%2F%Y"  # the %%2F puts an already-encoded / in the date
+
+class WestminsterParser:
+    def __init__(self, *args):
+
+        self.authority_name = "City of Westminster"
+        self.authority_short_name = "Westminster"
+        self.base_url = "http://www3.westminster.gov.uk/planningapplications/currentsearch-results.cfm"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+#        post_data = [
+#            ("REFNO", ""),
+#            ("STName", ""),
+#            ("STNUMB", ""),
+#            ("ADRSNO", ""),
+#            ("WARD", "AllWards"),
+#            ("AGT", ""),
+#            ("ATCDE", "AllApps"),
+#            ("DECDE", "AllDecs"),
+#            ("DTErec", search_day.strftime(date_format)),
+#            ("DTErecTo", search_day.strftime(date_format)),
+#            ("DTEvalid", ""),
+#            ("DTEvalidTo", ""),
+#            ("APDECDE", "AllAppDecs"),
+#            ("submit", "Start+Search"),
+#        ]
+        post_data = "REFNO=&STName=&STNUMB=&ADRSNO=&WARD=AllWards&AGT=&ATCDE=AllApps&DECDE=AllDecs&DTErec=%(date)s&DTErecTo=%(date)s&DTEvalid=&DTEvalidTo=&APDECDE=AllAppDecs&submit=Start+Search" % {"date": search_day.strftime(date_format)}
+
+        while post_data:
+
+            # Now get the search page
+            response = urllib2.urlopen(self.base_url, post_data)
+            soup = BeautifulSoup(response.read())
+
+            results_form = soup.find("form", {"name": "currentsearchresultsNext"})
+
+            # Sort out the post_data for the next page, if there is one.
+            # If there is no next page then there will be no inputs in the form.
+            # In this case, post_data will be '', which is false.
+
+            post_data = urllib.urlencode([(x['name'], x['value']) for x in results_form.findAll("input")])
+
+            # Each result has one link, and they are the only links in the form
+
+            links = results_form.findAll("a")
+
+            for link in links:
+                application = PlanningApplication()
+
+                application.date_received = search_day
+                application.info_url = urlparse.urljoin(self.base_url, link['href'])
+                application.council_reference = link.string.strip()
+
+                application.address = link.findNext("td").string.strip()
+                application.postcode = getPostcodeFromText(application.address)
+
+                application.description = link.findNext("tr").findAll("td")[-1].string.strip()
+
+                # To get the comment url, we're going to have to go to each info url :-(
+
+                info_response = urllib2.urlopen(application.info_url)
+                info_soup = BeautifulSoup(info_response.read())
+
+                comment_nav_string = info_soup.find(text="Comment on this case")
+                if comment_nav_string:
+                    application.comment_url = comment_nav_string.parent['href']
+                else:
+                    application.comment_url = "No Comments"
+
+                # e.g. http://publicaccess.westminster.gov.uk/publicaccess/tdc/dcapplication/application_comments_entryform.aspx?caseno=K586GHRP03500
+
+                self._results.addApplication(application)
+
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = WestminsterParser()
+    print parser.getResults(1,8,2008)
+
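
A note on the date encoding in Westminster.py: the received-date values are
interpolated straight into an already-urlencoded POST body, so date_format
pre-encodes the slashes ("%%" is strftime's escape for a literal "%"). A
quick illustrative check, assuming Python 2 as used throughout these
scrapers:

    import datetime

    date_format = "%d%%2F%m%%2F%Y"
    print datetime.date(2008, 8, 1).strftime(date_format)
    # prints 01%2F08%2F2008, i.e. "01/08/2008" with the slashes
    # already encoded as %2F for the DTErec/DTErecTo fields

Running "python Westminster.py" therefore searches for applications
received on 1 August 2008 and prints them as XML via displayXML().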