Procházet zdrojové kódy

Add scraper for Eastbourne. The info and comment links won't work since they require you to have a cookie. If you go back to them once you have the cookie, you're fine...
duncan.parkes před 16 roky
4 změnil soubory, kde provedl 125 přidání a 3 odebrání
  1. +120
  2. +1
  3. +1
  4. +3

+ 120
- 0
trunk/python_scrapers/ Zobrazit soubor

@@ -0,0 +1,120 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

# strftime pattern (day/month/year) used to fill in the search form's
# DateReceivedStart / DateReceivedEnd fields.
date_format = "%d/%m/%Y"

class EastbourneParser:
    """Scraper for planning applications received by Eastbourne Borough Council.

    The search site redirects any client that does not hold a cookie saying
    it has agreed to the terms and conditions.  The parser therefore submits
    the agreement form first and replays the collected cookies on the
    requests that follow.

    Known limitation: the info and comment URLs recorded for each
    application need the same cookie, so they only work in a browser that
    has already accepted the terms.
    """

    def __init__(self, *args):
        """Set up the authority names, the site URLs and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"

        # NOTE(review): both URLs are empty in this copy of the file.
        # first_url is the page fetched to trigger the terms-and-conditions
        # form; base_url is the search page the query is posted to.  Both
        # must be filled in before the scraper can run.
        self.first_url = ""
        self.base_url = ""

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect the applications received on the given date.

        Args:
            day: day of the month, as an integer.
            month: month number, as an integer.
            year: four-digit year, as an integer.

        Returns:
            The parser's PlanningAuthorityResults object, with one
            application added per "App. No.:" entry found on the first page
            of search results.

        Raises:
            ValueError: if day/month/year do not form a valid date.
        """
        search_day = datetime.date(year, month, day)
        search_day_string = search_day.strftime(date_format)

        # There's some faffing around here: we need a cookie to say we have
        # agreed to some T&Cs.

        # First ask for the search page - we'll be redirected somewhere else
        # for not having the cookie.
        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        cookie_jar.extract_cookies(first_response, first_request)

        first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())

        # Submit the first form on that page, echoing back its first input.
        agreement_form = first_page_soup.form
        first_page_action = urlparse.urljoin(self.first_url, agreement_form['action'])
        the_input = agreement_form.input

        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
            )
        )
        second_request = urllib2.Request(first_page_action, second_page_post_data)
        # Plain urlopen() does not consult the cookie jar, so the cookies
        # gathered so far have to be attached to each request by hand.
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        cookie_jar.extract_cookies(second_response, second_request)

        # Now (finally) get the search page.
        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                ("DateReceivedStart", search_day_string),
                ("DateReceivedEnd", search_day_string),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
            )
        )

        search_request = urllib2.Request(self.base_url)
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)

        soup = BeautifulSoup.BeautifulSoup(search_response.read())

        # Each application on the results page is introduced by an
        # "App. No.:" label; everything else is located relative to it.
        for app_no_string in soup.findAll(text="App. No.:"):
            application = PlanningApplication()
            application.date_received = search_day

            reference_link = app_no_string.findNext("a")
            application.council_reference = reference_link.string.strip()
            application.info_url = urlparse.urljoin(self.base_url, reference_link['href'])

            # Join only the plain text nodes of the address cell.  The exact
            # type test (rather than isinstance) deliberately leaves out tags
            # and NavigableString subclasses.
            address_cell = app_no_string.findNext(text="Site Address:").findNext("td")
            application.address = ' '.join(
                [x.strip() for x in address_cell.contents
                 if type(x) is BeautifulSoup.NavigableString]
            )
            application.postcode = getPostcodeFromText(application.address)

            application.comment_url = urlparse.urljoin(
                self.base_url,
                app_no_string.findNext(text="Comment on application").parent['href'],
            )

            application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

            # Without this the applications built above would be discarded.
            # NOTE(review): addApplication is assumed to be the
            # PlanningAuthorityResults method for this - confirm against
            # PlanningUtils.
            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the displayXML() rendering of the results for a date.

        day, month and year may be integers or numeric strings; each is
        converted with int() before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape and print the applications received on
    # 1 September 2008.
    parser = EastbourneParser()
    # Parenthesised so the line is valid both as a print statement and as a
    # print() call; the output is the same either way for a single argument.
    print(parser.getResults(1, 9, 2008))

# TODO: the site paginates search results at 20 per page; only the first page is scraped at the moment.

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv Zobrazit soubor

@@ -57,3 +57,4 @@
"", "420"
"", "420"
"", "420"
"", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv Zobrazit soubor

@@ -261,3 +261,4 @@
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"

+ 3
- 3
trunk/python_scrapers/ Zobrazit soubor

@@ -172,15 +172,15 @@ if __name__ == '__main__':
#parser = BraintreeParser("Colchester", "Colchester", "", debug=True)
#parser = WAMParser("East Lothian", "East Lothian", "", debug=True)
#parser = BraintreeParser("North Somerset", "North Somerset", "", debug=True)
#parser = WAMParser("Nottingham", "Nottingham", "", debug=True)
parser = WAMParser("Nottingham", "Nottingham", "", debug=True)
#parser = PooleParser("Poole long", "Poole", "", debug=True)
#parser = WAMParser("Rother long", "Rother", "", debug=True)
#parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "", debug=True)
#parser = WAMParser("South Norfolk", "South Norfolk", "", debug=True)
#parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "", debug=True)
parser = WAMParser("Westminster", "Westminster", "", debug=True)
#parser = WAMParser("Westminster", "Westminster", "", debug=True)

print parser.getResults(1,8,2008)
print parser.getResults(31,8,2008)

# Left to fix

Načítá se…