瀏覽代碼

Add scraper for Eastbourne. The info and comment links won't work since they require you to have a cookie. If you go

back to them once you have the cookie, you're fine...
import/raw
duncan.parkes 16 年之前
父節點
當前提交
6cf496dfb9
共有 4 個文件被更改,包括 125 次插入3 次删除
  1. +120
    -0
      trunk/python_scrapers/Eastbourne.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv
  4. +3
    -3
      trunk/python_scrapers/WAM.py

+ 120
- 0
trunk/python_scrapers/Eastbourne.py 查看文件

@@ -0,0 +1,120 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class EastbourneParser:
def __init__(self, *args):

self.authority_name = "Eastbourne Borough Council"
self.authority_short_name = "Eastbourne"
# self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_day = datetime.date(year, month, day)

# There's going to be some faffing around here. We need a cookie to say we have agreed to some T&Cs.

# First get the search page - we'll be redirected somewhere else for not having the cookie

first_request = urllib2.Request(self.first_url)
first_response = urllib2.urlopen(first_request)
cookie_jar.extract_cookies(first_response, first_request)

first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())

first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
the_input = first_page_soup.form.input

second_page_post_data = urllib.urlencode(
(
(the_input['name'], the_input['value']),
)
)
second_request = urllib2.Request(first_page_action, second_page_post_data)
cookie_jar.add_cookie_header(second_request)
second_response = urllib2.urlopen(second_request)
cookie_jar.extract_cookies(second_response, second_request)

# Now (finally) get the search page

#ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search

post_data = urllib.urlencode(
(
("ApplicationNumber", ""),
("AddressPrefix", ""),
("Postcode", ""),
("CaseOfficer", ""),
("WardMember", ""),
("DateReceivedStart", search_day.strftime(date_format)),
("DateReceivedEnd", search_day.strftime(date_format)),
("DateDecidedStart", ""),
("DateDecidedEnd", ""),
("Locality", ""),
("AgentName", ""),
("ApplicantName", ""),
("ShowDecided", ""),
("DecisionLevel", ""),
("Sort1", "FullAddressPrefix"),
("Sort2", "DateReceived DESC"),
("Submit", "Search"),
)
)

search_request = urllib2.Request(self.base_url)
cookie_jar.add_cookie_header(search_request)
search_response = urllib2.urlopen(search_request, post_data)

soup = BeautifulSoup.BeautifulSoup(search_response.read())

app_no_strings = soup.findAll(text="App. No.:")

for app_no_string in app_no_strings:
application = PlanningApplication()
application.date_received = search_day

application.council_reference = app_no_string.findNext("a").string.strip()
application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])

application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString])
application.postcode = getPostcodeFromText(application.address)

application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])

application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

self._results.addApplication(application)
return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = EastbourneParser()
print parser.getResults(1,9,2008)



# TODO - currently paginates at 20

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv 查看文件

@@ -57,3 +57,4 @@
"Hastings.py", "420"
"Herefordshire.py", "420"
"Exmoor.py", "420"
"Eastbourne.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv 查看文件

@@ -261,3 +261,4 @@
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"

+ 3
- 3
trunk/python_scrapers/WAM.py 查看文件

@@ -172,15 +172,15 @@ if __name__ == '__main__':
#parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
#parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
#parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
#parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
#parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
#parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)

print parser.getResults(1,8,2008)
print parser.getResults(31,8,2008)

# Left to fix



Loading…
取消
儲存