소스 검색

Add scraper for Eastbourne. The info and comment links won't work since they require you to have a cookie. If you go back to them once you have the cookie, you're fine...
import/raw
duncan.parkes 17 년 전
부모
커밋
6cf496dfb9
4개의 변경된 파일125개의 추가작업 그리고 3개의 파일을 삭제
  1. +120
    -0
      trunk/python_scrapers/Eastbourne.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv
  4. +3
    -3
      trunk/python_scrapers/WAM.py

+ 120
- 0
trunk/python_scrapers/Eastbourne.py 파일 보기

@@ -0,0 +1,120 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

import cookielib
# Module-level cookie jar used for every request the parser makes. It holds
# the cookie the site hands out once its terms and conditions have been
# accepted. Being module-level, it is shared by all parser instances.
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

# strftime format (day/month/year) used for the DateReceivedStart and
# DateReceivedEnd fields of the search form.
date_format = "%d/%m/%Y"

class EastbourneParser:
    """Scraper for planning applications received by Eastbourne Borough Council.

    The site only serves search results to clients holding a cookie that
    records acceptance of its terms and conditions, so every search first
    walks through the acceptance form before posting the real query.

    NOTE: only the first page of results is read; the site is reported to
    paginate at 20 results.
    NOTE: the info and comment URLs stored on each application only work
    for a visitor who already holds the terms-and-conditions cookie.
    """

    def __init__(self, *args):
        """Set up authority names, site URLs and an empty result set.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"

        # Previous value of base_url, kept for reference:
        # self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _direct_text(self, node):
        """Join the stripped direct text children of *node* with single spaces.

        The exact type comparison is deliberate: isinstance() would also
        admit NavigableString subclasses, changing which nodes contribute.
        """
        return ' '.join([x.strip() for x in node.contents if type(x) == BeautifulSoup.NavigableString])

    def _text_of(self, node):
        """Return the stripped text of *node* without assuming a single child.

        BeautifulSoup's .string is None when a tag has more than one child
        (for example text broken up by <br> tags); in that case fall back to
        joining the direct text nodes instead of raising AttributeError.
        """
        if node.string is not None:
            return node.string.strip()
        return self._direct_text(node)

    def _accept_terms(self):
        """Walk through the terms-and-conditions form to obtain the cookie.

        Fetches the index page (which redirects to the acceptance form when
        the cookie is missing), then posts the form's first input back to
        the form's action URL. Cookies from both responses are stored in
        the module-level cookie jar.
        """
        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        try:
            cookie_jar.extract_cookies(first_response, first_request)
            first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())
        finally:
            first_response.close()

        first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
        the_input = first_page_soup.form.input

        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
            )
        )
        second_request = urllib2.Request(first_page_action, second_page_post_data)
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        try:
            cookie_jar.extract_cookies(second_response, second_request)
        finally:
            second_response.close()

    def _fetch_results_soup(self, search_day):
        """Post the search form for *search_day* and return the parsed page.

        Only the received-date range (start and end both set to
        *search_day*), the two sort fields and the submit button carry
        values; every other field is sent empty.
        """
        # Example of the form data a browser sends:
        # ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search
        formatted_day = search_day.strftime(date_format)
        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                ("DateReceivedStart", formatted_day),
                ("DateReceivedEnd", formatted_day),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
            )
        )

        search_request = urllib2.Request(self.base_url)
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)
        try:
            return BeautifulSoup.BeautifulSoup(search_response.read())
        finally:
            search_response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications received on the given date.

        day, month and year are integers. Each application found is added
        to this parser's result set, which is then returned. The result set
        belongs to the instance, so repeated calls accumulate applications.

        Raises ValueError (from datetime.date) for an invalid date; network
        errors from urllib2 are not caught.
        """
        search_day = datetime.date(year, month, day)

        # There's some faffing around here: we need a cookie to say we have
        # agreed to some T&Cs before the search will answer.
        self._accept_terms()

        # Now (finally) get the search results.
        soup = self._fetch_results_soup(search_day)

        # Each application on the page is introduced by an "App. No.:" label;
        # everything else is located relative to that text node.
        for app_no_string in soup.findAll(text="App. No.:"):
            application = PlanningApplication()
            application.date_received = search_day

            reference_link = app_no_string.findNext("a")
            application.council_reference = self._text_of(reference_link)
            application.info_url = urlparse.urljoin(self.base_url, reference_link['href'])

            address_cell = app_no_string.findNext(text="Site Address:").findNext("td")
            application.address = self._direct_text(address_cell)
            application.postcode = getPostcodeFromText(application.address)

            comment_link = app_no_string.findNext(text="Comment on application").parent
            application.comment_url = urlparse.urljoin(self.base_url, comment_link['href'])

            description_cell = app_no_string.findNext(text="Description:").findNext("td")
            application.description = self._text_of(description_cell)

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as XML.

        day, month and year may be strings or integers; they are converted
        with int() before the search is run.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: scrape a single known date and dump the XML.
    eastbourne_parser = EastbourneParser()
    print(eastbourne_parser.getResults(1, 9, 2008))



# TODO - the results page currently paginates at 20; only the first page is scraped.

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv 파일 보기

@@ -57,3 +57,4 @@
"Hastings.py", "420" "Hastings.py", "420"
"Herefordshire.py", "420" "Herefordshire.py", "420"
"Exmoor.py", "420" "Exmoor.py", "420"
"Eastbourne.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv 파일 보기

@@ -261,3 +261,4 @@
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" "Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser" "Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"

+ 3
- 3
trunk/python_scrapers/WAM.py 파일 보기

@@ -172,15 +172,15 @@ if __name__ == '__main__':
#parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
#parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
#parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True) #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
#parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True) #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
#parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
#parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)


print parser.getResults(1,8,2008)
print parser.getResults(31,8,2008)


# Left to fix # Left to fix




불러오는 중...
취소
저장