瀏覽代碼

Add scraper for Eastbourne. The info and comment links won't work at first, since they require you to have a cookie; if you go back to them once you have the cookie, you're fine...
master
duncan.parkes 17 年之前
父節點
當前提交
92e757b8dd
共有 4 個文件被更改,包括 125 次插入3 次删除
  1. +120
    -0
      python_scrapers/Eastbourne.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv
  4. +3
    -3
      python_scrapers/WAM.py

+ 120
- 0
python_scrapers/Eastbourne.py 查看文件

@@ -0,0 +1,120 @@
import urllib2
import urllib
import urlparse

import datetime, time
import cgi

import BeautifulSoup

import cookielib
cookie_jar = cookielib.CookieJar()

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

date_format = "%d/%m/%Y"

class EastbourneParser:
    """Scraper for planning applications published by Eastbourne Borough Council.

    The council's search pages require a session cookie that is only issued
    after POSTing an agreement form, so getResultsByDayMonthYear first performs
    a two-step handshake to populate the module-level cookie_jar before running
    the actual search.
    """

    def __init__(self, *args):

        # Names reported in the generated results XML.
        self.authority_name = "Eastbourne Borough Council"
        self.authority_short_name = "Eastbourne"
#        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/search.asp"
        # Landing page: fetching this without the T&C cookie redirects to a
        # form whose submission grants the session cookie.
        self.first_url = "http://www.eastbourne.gov.uk/planningapplications/index.asp"
        # Search endpoint POSTed to once the cookie has been obtained.
        self.base_url = "http://www.eastbourne.gov.uk/planningapplications/results.asp"

        # Accumulates PlanningApplication objects across the scrape.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        """Fetch all applications received on the given date.

        Returns the shared PlanningAuthorityResults instance, populated with
        one PlanningApplication per result row found on the search page.
        """
        search_day = datetime.date(year, month, day)

        # There's going to be some faffing around here. We need a cookie to say we have agreed to some T&Cs.

        # First get the search page - we'll be redirected somewhere else for not having the cookie

        first_request = urllib2.Request(self.first_url)
        first_response = urllib2.urlopen(first_request)
        # Capture any cookies set on the redirect/landing response.
        cookie_jar.extract_cookies(first_response, first_request)

        first_page_soup = BeautifulSoup.BeautifulSoup(first_response.read())

        # The landing page contains a single form (the T&C agreement); POSTing
        # its one hidden input back to the form's action grants the cookie.
        first_page_action = urlparse.urljoin(self.first_url, first_page_soup.form['action'])
        the_input = first_page_soup.form.input

        second_page_post_data = urllib.urlencode(
            (
                (the_input['name'], the_input['value']),
                )
            )
        second_request = urllib2.Request(first_page_action, second_page_post_data)
        cookie_jar.add_cookie_header(second_request)
        second_response = urllib2.urlopen(second_request)
        # Harvest the session cookie issued after agreeing to the T&Cs.
        cookie_jar.extract_cookies(second_response, second_request)

        # Now (finally) get the search page

        # Example of a full query string the site sends, kept for reference:
        #ApplicationNumber=&AddressPrefix=&Postcode=&CaseOfficer=&WardMember=&DateReceivedStart=31%2F08%2F2008&DateReceivedEnd=31%2F08%2F2008&DateDecidedStart=&DateDecidedEnd=&Locality=&AgentName=&ApplicantName=&ShowDecided=&DecisionLevel=&Sort1=FullAddressPrefix&Sort2=DateReceived+DESC&Submit=Search

        # All fields must be present (even empty); the received-date range is
        # pinned to the single requested day.
        post_data = urllib.urlencode(
            (
                ("ApplicationNumber", ""),
                ("AddressPrefix", ""),
                ("Postcode", ""),
                ("CaseOfficer", ""),
                ("WardMember", ""),
                ("DateReceivedStart", search_day.strftime(date_format)),
                ("DateReceivedEnd", search_day.strftime(date_format)),
                ("DateDecidedStart", ""),
                ("DateDecidedEnd", ""),
                ("Locality", ""),
                ("AgentName", ""),
                ("ApplicantName", ""),
                ("ShowDecided", ""),
                ("DecisionLevel", ""),
                ("Sort1", "FullAddressPrefix"),
                ("Sort2", "DateReceived DESC"),
                ("Submit", "Search"),
                )
            )

        search_request = urllib2.Request(self.base_url)
        # The search only succeeds with the T&C cookie attached.
        cookie_jar.add_cookie_header(search_request)
        search_response = urllib2.urlopen(search_request, post_data)

        soup = BeautifulSoup.BeautifulSoup(search_response.read())

        # Each result row is anchored by an "App. No.:" label; the details we
        # want are in the elements that follow it in document order.
        app_no_strings = soup.findAll(text="App. No.:")

        for app_no_string in app_no_strings:
            application = PlanningApplication()
            # All results come from a single-day search, so the received date
            # is known without parsing it from the page.
            application.date_received = search_day

            application.council_reference = app_no_string.findNext("a").string.strip()
            application.info_url = urlparse.urljoin(self.base_url, app_no_string.findNext("a")['href'])

            # Join only the text nodes of the address cell, skipping tags
            # (e.g. <br>) interleaved with the address lines.
            application.address = ' '.join([x.strip() for x in app_no_string.findNext(text="Site Address:").findNext("td").contents if type(x) == BeautifulSoup.NavigableString])
            application.postcode = getPostcodeFromText(application.address)

            application.comment_url = urlparse.urljoin(self.base_url, app_no_string.findNext(text="Comment on application").parent['href'])

            application.description = app_no_string.findNext(text="Description:").findNext("td").string.strip()

            self._results.addApplication(application)
        return self._results

    def getResults(self, day, month, year):
        """Entry point used by the site generator: returns the day's results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    # Manual smoke test: print the XML for applications received 1 Sep 2008.
    parser = EastbourneParser()
    print parser.getResults(1,9,2008)



# TODO - currently paginates at 20

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv 查看文件

@@ -57,3 +57,4 @@
"Hastings.py", "420" "Hastings.py", "420"
"Herefordshire.py", "420" "Herefordshire.py", "420"
"Exmoor.py", "420" "Exmoor.py", "420"
"Eastbourne.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv 查看文件

@@ -261,3 +261,4 @@
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser"
"Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" "Herefordshire Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser"
"Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser" "Exmoor National Park", "Exmoor", "", "Exmoor", "ExmoorParser"
"Eastbourne Borough Council", "Eastbourne", "", "Eastbourne", "EastbourneParser"

+ 3
- 3
python_scrapers/WAM.py 查看文件

@@ -172,15 +172,15 @@ if __name__ == '__main__':
#parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Colchester", "Colchester", "http://www.planning.colchester.gov.uk/WAM/weeklyApplications.do", debug=True)
#parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("East Lothian", "East Lothian", "http://www.planning.eastlothian.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("North Somerset", "North Somerset", "http://wam.n-somerset.gov.uk/MULTIWAM/weeklyApplications.do", debug=True)
#parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
parser = WAMParser("Nottingham", "Nottingham", "http://plan4.nottinghamcity.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True) #parser = PooleParser("Poole long", "Poole", "http://wam.boroughofpoole.com/WAM/pas/searchApplications.do", debug=True)
#parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("Rother long", "Rother", "http://www.planning.rother.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True) #parser = BraintreeParser("South Gloucestershire", "South Gloucestershire", "http://planning.southglos.gov.uk/WAM/pas/WeeklyApplications.do", debug=True)
#parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True) #parser = WAMParser("South Norfolk", "South Norfolk", "http://wam.south-norfolk.gov.uk/WAM/pas/searchApplications.do", debug=True)
#parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True) #parser = BraintreeParser("Tower Hamlets", "Tower Hamlets", "http://194.201.98.213/WAM/weeklyApplications.do", debug=True)
parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)
#parser = WAMParser("Westminster", "Westminster", "http://idocs.westminster.gov.uk:8080/WAM/search/pas/index.htm", debug=True)


print parser.getResults(1,8,2008)
print parser.getResults(31,8,2008)


# Left to fix # Left to fix




Loading…
取消
儲存