import urllib2 import urllib import urlparse import datetime, time import cgi import BeautifulSoup import cookielib cookie_jar = cookielib.CookieJar() from PlanningUtils import PlanningApplication, \ PlanningAuthorityResults, \ getPostcodeFromText date_format = "%d/%m/%Y" class EastbourneParser: def __init__(self, *args): self.authority_name = "Eastbourne Borough Council" self.authority_short_name = "Eastbourne" # self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp" self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp" self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp" self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name) def getResultsByDayMonthYear(self, day, month, year): search_day = datetime.date(year, month, day) # There's going to be some faffing around here. We need a cookie to say we have agreed to some T&Cs. # First get the search page - we'll be redirected somewhere else for not having the cookie first_request = urllib2.Request(self.first_url) first_response = urllib2.urlopen(first_request) cookie_jar.extract_cookies(first_response, first_request) first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read()) first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action']) the_input = first_page_soup.form.input second_page_post_data = urllib.urlencode( ( (the_input['name'], the_input['value']), ) ) second_request = urllib2.Request(first_page_action, second_page_post_data) cookie_jar.add_cookie_header(second_request) second_response = urllib2.urlopen(second_request) cookie_jar.extract_cookies(second_response, second_request) # Now (finally) get the search page #ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search post_data = urllib.urlencode( ( ("ApplicationNumber", ""), ("AddressPrefix", ""), ("Postcode", ""), ("CaseOfficer", ""), ("WardMember", ""), ("DateReceivedStart", search_day.strftime(date_format)), ("DateReceivedEnd", search_day.strftime(date_format)), ("DateDecidedStart", ""), ("DateDecidedEnd", ""), ("Locality", ""), ("AgentName", ""), ("ApplicantName", ""), ("ShowDecided", ""), ("DecisionLevel", ""), ("Sort1", "FullAddressPrefix"), ("Sort2", "DateReceived DESC"), ("Submit", "Search"), ) ) search_request = urllib2.Request(self.base_url) cookie_jar.add_cookie_header(search_request) search_response = urllib2.urlopen(search_request, post_data) soup = BeautifulSoup.BeautifulSoup(search_response.read()) app_no_strings = soup.findAll(text="App. No.:") for app_no_string in app_no_strings: application = PlanningApplication() application.date_received = search_day application.council_reference = app_no_string.findNext("a").string.strip() application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href']) application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString]) application.postcode = getPostcodeFromText(application.address) application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href']) application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip() self._results.addApplication(application) return self._results def getResults(self, day, month, year): return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML() if __name__ == '__main__': parser = EastbourneParser() print parser.getResults(1,9,2008) # TODO - currently paginates at 20