Browse Source

Add scraper for Eastbourne. The info and comment links won't work since they require you to have a cookie. If you go

back to them once you have the cookie, you're fine...
import/raw
duncan.parkes 16 years ago
parent
commit
6cf496dfb9
4 changed files with 125 additions and 3 deletions
  1. +120
    -0
      trunk/python_scrapers/Eastbourne.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv
  4. +3
    -3
      trunk/python_scrapers/WAM.py

+ 120
- 0
trunk/python_scrapers/Eastbourne.py View File

@@ -0,0 +1,120 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

import cookielib
# Module-level cookie jar shared by every request the parser makes: it collects
# the cookies set while agreeing to the site's terms and conditions and
# supplies them to the later requests.
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format (day/month/year) used for the DateReceivedStart and
# DateReceivedEnd fields of the search form.
date_format = "%d/%m/%Y"

class EastbourneParser:
    """Scraper for planning applications received by Eastbourne Borough Council.

    The site only answers searches from clients holding a cookie which records
    that its terms and conditions were agreed to.  Every search therefore
    requests the index page first, submits the form found on the page it is
    sent to, and only then posts the real search.  Cookies are kept in the
    module-level ``cookie_jar``.

    Only the one results page returned by the search is parsed; no further
    pages are requested.

    NOTE(review): the info and comment URLs stored on each application are
    reported to need that same cookie when visited - confirm before relying
    on them.
    """

    def __init__(self, *args):
        """Set the authority names, the site URLs and an empty result set.

        Positional arguments are accepted and ignored.
        """
        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"
        # Previously used search URL, kept for reference:
        # self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _agree_to_terms(self):
        """Collect the terms-and-conditions cookie into ``cookie_jar``.

        Requests ``self.first_url`` (which redirects clients that lack the
        cookie), then posts the first input of the first form on the page
        that comes back to that form's action URL.
        """
        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        try:
            cookie_jar.extract_cookies(first_response, first_request)
            # The request is redirected, so relative links on the page must
            # be resolved against the URL that was actually served.
            served_url = first_response.geturl()
            first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())
        finally:
            first_response.close()

        agreement_form = first_page_soup.form
        first_page_action = urlparse.urljoin(served_url, agreement_form['action'])
        the_input = agreement_form.input

        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
            )
        )
        second_request = urllib2.Request(first_page_action, second_page_post_data)
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        try:
            # Only the cookies matter here; the body is never read.
            cookie_jar.extract_cookies(second_response, second_request)
        finally:
            second_response.close()

    def _fetch_results_soup(self, search_day):
        """Post the search for applications received on ``search_day``.

        Returns the parsed results page.  The cookies gathered by
        ``_agree_to_terms`` are attached to the request.
        """
        formatted_day = search_day.strftime(date_format)

        # Sample of the form data sent by a browser:
        #ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search
        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                ("DateReceivedStart", formatted_day),
                ("DateReceivedEnd", formatted_day),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
            )
        )

        search_request = urllib2.Request(self.base_url)
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)
        try:
            return BeautifulSoup.BeautifulSoup(search_response.read())
        finally:
            search_response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications received on the given date.

        ``day``, ``month`` and ``year`` are integers.  Each application found
        is added to the result set created in ``__init__`` (one set shared by
        every call on this instance), and that result set is returned.
        """
        search_day = datetime.date(year, month, day)

        # There's some faffing around here: we need a cookie to say we have
        # agreed to some T&Cs before the search will work.
        self._agree_to_terms()

        # Now (finally) do the search.
        soup = self._fetch_results_soup(search_day)

        # Each application on the page is introduced by an "App. No.:" label;
        # its other fields are located relative to that label.
        for app_no_string in soup.findAll(text="App. No.:"):
            application = PlanningApplication()
            application.date_received = search_day

            # The first link after the label carries both the reference
            # (its text) and the info page (its href).
            app_link = app_no_string.findNext("a")
            application.council_reference = app_link.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, app_link['href'])

            # Join the plain text nodes of the address cell.  The exact type
            # match skips every other node type (tags, or subclasses of
            # NavigableString).
            address_cell = app_no_string.findNext(text="Site Address:").findNext("td")
            application.address = ' '.join(
                [x.strip() for x in address_cell.contents
                 if type(x) == BeautifulSoup.NavigableString]
            )
            application.postcode = getPostcodeFromText(application.address)

            comment_link = app_no_string.findNext(text="Comment on application").parent
            application.comment_url = urlparse.urljoin(self.base_url, comment_link['href'])

            description_cell = app_no_string.findNext(text="Description:").findNext("td")
            application.description = description_cell.string.strip()

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return ``displayXML()`` of the results for the given date.

        ``day``, ``month`` and ``year`` may be anything ``int()`` accepts.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a fixed date (1 September 2008) and print
    # whatever getResults returns for it.
    eastbourne_parser = EastbourneParser()
    print(eastbourne_parser.getResults(1, 9, 2008))



# TODO - the site's results currently paginate at 20; pages after the first are not fetched.

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -57,3 +57,4 @@
"Hastings.py", "420" "Hastings.py", "420"
"Herefordshire.py", "420" "Herefordshire.py", "420"
"Exmoor.py", "420" "Exmoor.py", "420"
"Eastbourne.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -261,3 +261,4 @@
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" "Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser" "Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"

+ 3
- 3
trunk/python_scrapers/WAM.py View File

@@ -172,15 +172,15 @@ if __name__ == '__main__':
#parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
#parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
#parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True) #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
#parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True) #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
#parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
#parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)


print parser.getResults(1,8,2008)
print parser.getResults(31,8,2008)


# Left to fix # Left to fix




Loading…
Cancel
Save