From 6cf496dfb931f5e7386df079b3645304586d7c83 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Mon, 8 Sep 2008 15:13:12 +0000
Subject: [PATCH] Add scraper for Eastbourne.

The info and comment links won't work since they require you to have a cookie. If you go back to them once you have the cookie, you're fine...
---
NOTE(review): the line breaks and indentation of this patch were reconstructed
from a whitespace-collapsed copy; no other text was changed. The Eastbourne.py
hunk reconstructs to exactly the 120 added lines its header declares. The
WAM.py hunk header declares 15 lines but only 12 non-blank lines survived, so
the positions of its three blank context lines (and the column of
"# Left to fix") are assumed - confirm against the repository before applying.
The split between the Subject line and the message body is likewise assumed.
NOTE(review): Eastbourne.py keeps one module-level cookie_jar that every
request in getResultsByDayMonthYear reads from and writes to, and its trailing
TODO records that the results page "paginates at 20", which the scraper does
not yet follow. The WAM.py hunk only changes which debug parser and date are
exercised under __main__.

 trunk/python_scrapers/Eastbourne.py        | 120 +++++++++++++++++++++
 trunk/python_scrapers/OtherFilesToCopy.csv |   1 +
 trunk/python_scrapers/SitesToGenerate.csv  |   1 +
 trunk/python_scrapers/WAM.py               |   6 +-
 4 files changed, 125 insertions(+), 3 deletions(-)
 create mode 100644 trunk/python_scrapers/Eastbourne.py

diff --git a/trunk/python_scrapers/Eastbourne.py b/trunk/python_scrapers/Eastbourne.py
new file mode 100644
index 0000000..ee7ef71
--- /dev/null
+++ b/trunk/python_scrapers/Eastbourne.py
@@ -0,0 +1,120 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime, time
+import cgi
+
+import BeautifulSoup
+
+import cookielib
+cookie_jar = cookielib.CookieJar()
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+date_format = "%d/%m/%Y"
+
+class EastbourneParser:
+    def __init__(self, *args):
+
+        self.authority_name = "Eastbourne Borough Council"
+        self.authority_short_name = "Eastbourne"
+#        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
+        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
+        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_day = datetime.date(year, month, day)
+
+        # There's going to be some faffing around here. We need a cookie to say we have agreed to some T&Cs.
+
+        # First get the search page - we'll be redirected somewhere else for not having the cookie
+
+        first_request = urllib2.Request(self.first_url)
+        first_response = urllib2.urlopen(first_request)
+        cookie_jar.extract_cookies(first_response, first_request)
+
+        first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())
+
+        first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
+
+        the_input = first_page_soup.form.input
+
+        second_page_post_data = urllib.urlencode(
+            (
+                (the_input['name'], the_input['value']),
+                )
+            )
+
+        second_request = urllib2.Request(first_page_action, second_page_post_data)
+        cookie_jar.add_cookie_header(second_request)
+        second_response = urllib2.urlopen(second_request)
+        cookie_jar.extract_cookies(second_response, second_request)
+
+        # Now (finally) get the search page
+
+#ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search
+
+        post_data = urllib.urlencode(
+            (
+                ("ApplicationNumber", ""),
+                ("AddressPrefix", ""),
+                ("Postcode", ""),
+                ("CaseOfficer", ""),
+                ("WardMember", ""),
+                ("DateReceivedStart", search_day.strftime(date_format)),
+                ("DateReceivedEnd", search_day.strftime(date_format)),
+                ("DateDecidedStart", ""),
+                ("DateDecidedEnd", ""),
+                ("Locality", ""),
+                ("AgentName", ""),
+                ("ApplicantName", ""),
+                ("ShowDecided", ""),
+                ("DecisionLevel", ""),
+                ("Sort1", "FullAddressPrefix"),
+                ("Sort2", "DateReceived DESC"),
+                ("Submit", "Search"),
+                )
+            )
+
+        search_request = urllib2.Request(self.base_url)
+        cookie_jar.add_cookie_header(search_request)
+        search_response = urllib2.urlopen(search_request, post_data)
+
+        soup = BeautifulSoup.BeautifulSoup(search_response.read())
+
+        app_no_strings = soup.findAll(text="App. No.:")
+
+        for app_no_string in app_no_strings:
+            application = PlanningApplication()
+            application.date_received = search_day
+
+            application.council_reference = app_no_string.findNext("a").string.strip()
+            application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])
+
+            application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString])
+            application.postcode = getPostcodeFromText(application.address)
+
+            application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])
+
+            application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()
+
+            self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = EastbourneParser()
+    print parser.getResults(1,9,2008)
+
+
+
+# TODO - currently paginates at 20
diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv
index 8620687..f5819a5 100644
--- a/trunk/python_scrapers/OtherFilesToCopy.csv
+++ b/trunk/python_scrapers/OtherFilesToCopy.csv
@@ -57,3 +57,4 @@
 "Hastings.py", "420"
 "Herefordshire.py", "420"
 "Exmoor.py", "420"
+"Eastbourne.py", "420"
diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv
index 57c7bcd..0a22dba 100644
--- a/trunk/python_scrapers/SitesToGenerate.csv
+++ b/trunk/python_scrapers/SitesToGenerate.csv
@@ -261,3 +261,4 @@
 "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
 "Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
 "Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
+"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"
diff --git a/trunk/python_scrapers/WAM.py b/trunk/python_scrapers/WAM.py
index 589e36d..b3a85cf 100644
--- a/trunk/python_scrapers/WAM.py
+++ b/trunk/python_scrapers/WAM.py
@@ -172,15 +172,15 @@ if __name__ == '__main__':
     #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
     #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
-    #parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
+    parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
     #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
     #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
     #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
-    parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
+    #parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
 
-    print parser.getResults(1,8,2008)
+    print parser.getResults(31,8,2008)
 
 # Left to fix
 