
Add scraper for the Cairngorms National Park.

Branch: import/raw
duncan.parkes committed 16 years ago
Parent commit: 689474a703
3 changed files with 136 additions and 0 deletions
  1. trunk/python_scrapers/Cairngorms.py (+134, -0)
  2. trunk/python_scrapers/OtherFilesToCopy.csv (+1, -0)
  3. trunk/python_scrapers/SitesToGenerate.csv (+1, -0)

trunk/python_scrapers/Cairngorms.py (+134, -0)

@@ -0,0 +1,134 @@
"""
"""

import time

import urlparse
import pycurl
import StringIO

import datetime


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
    PlanningAuthorityResults, \
    getPostcodeFromText

class CairngormsParser:
    def __init__(self, *args):
        self.authority_name = "Cairngorms National Park"
        self.authority_short_name = "Cairngorms"
        self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"

        self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"

        # The timestamp here looks like the number of milliseconds since 1970
        self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"

        self.comments_email_address = "planning@cairngorms.co.uk"

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


    def getResultsByDayMonthYear(self, day, month, year):
        search_date = datetime.date(year, month, day)

        post_data = [
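            # Blank fields leave that criterion unrestricted; the "%" values
            # appear to act as wildcards ("any") for the drop-down fields.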
("CNPA_ref", ""),
("application_number", ""),
("LA_id", "%"),
("applicant_type", "%"),
("applicant_name", ""),
("development_address", ""),
("agent_name", ""),
("status", "%"),
("startDay", "%02d" %day),
("startMonth", "%02d" %month),
("startYear", "%d" %year),
("endDay", "%02d" %day),
("endMonth", "%02d" %month),
("endYear", "%d" %year),
]

        first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" % {"day": day, "month": month, "year": year}

        curlobj = pycurl.Curl()
        curlobj.setopt(pycurl.FOLLOWLOCATION, True)
        curlobj.setopt(pycurl.MAXREDIRS, 10)


        # First we do a normal post. This would happen as an AJAX query
        # from the browser and just returns the number of applications found.
        fakefile = StringIO.StringIO()

        curlobj.setopt(pycurl.URL, self.first_post_url % int(time.time() * 1000))
        curlobj.setopt(pycurl.POST, True)
        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
        curlobj.setopt(pycurl.POSTFIELDS, first_post_data)

        curlobj.perform()

        app_count = int(fakefile.getvalue())
        fakefile.close()

        if app_count:
            # Now we do another multipart form post
            # This gives us something to use as the callback
            fakefile = StringIO.StringIO()

            curlobj.setopt(pycurl.URL, self.base_url)
            curlobj.setopt(pycurl.HTTPPOST, post_data)
            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
            curlobj.setopt(pycurl.REFERER, self.referer)
            curlobj.perform()

            soup = BeautifulSoup(fakefile.getvalue())
            # We may as well free up the memory used by fakefile
            fakefile.close()

            for tr in soup.table.findAll("tr")[1:]:
                application = PlanningApplication()
                application.date_received = search_date
                application.comment_url = self.comments_email_address

                tds = tr.findAll("td")

                application.council_reference = tds[1].string.strip()
                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

                application.address = tds[2].string.strip()
                application.postcode = getPostcodeFromText(application.address)

                # We're going to need to get the info page in order to get the description.
                # We can't pass a unicode string to pycurl, so we'll have to encode it.
                curlobj.setopt(pycurl.URL, application.info_url.encode())
                curlobj.setopt(pycurl.HTTPGET, True)

                # This gives us something to use as the callback
                fakefile = StringIO.StringIO()
                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

                curlobj.perform()
                info_soup = BeautifulSoup(fakefile.getvalue())
                fakefile.close()

                application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
                application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
                application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
    parser = CairngormsParser()
    print parser.getResults(3, 10, 2008)


# TODO
# Is there pagination?

trunk/python_scrapers/OtherFilesToCopy.csv (+1, -0)

@@ -63,3 +63,4 @@
"Kirklees.py", "420"
"Lichfield.py", "420"
"Leicestershire.py", "420"
"Cairngorms.py", "420"

trunk/python_scrapers/SitesToGenerate.csv (+1, -0)

@@ -269,3 +269,4 @@
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"
