Browse Source

Add scraper for the Cairngorms National Park.

import/raw
duncan.parkes 16 years ago
parent
commit
689474a703
3 changed files with 136 additions and 0 deletions
  1. +134
    -0
      trunk/python_scrapers/Cairngorms.py
  2. +1
    -0
      trunk/python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      trunk/python_scrapers/SitesToGenerate.csv

+ 134
- 0
trunk/python_scrapers/Cairngorms.py View File

@@ -0,0 +1,134 @@
"""
"""

import time

import urlparse
import pycurl
import StringIO

import datetime


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class CairngormsParser:
def __init__(self, *args):
self.authority_name = "Cairngorms National Park"
self.authority_short_name = "Cairngorms"
self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"

self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"

# The timestamp here looks like the number of milliseconds since 1970
self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"

self.comments_email_address = "planning@cairngorms.co.uk"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

post_data = [
("CNPA_ref", ""),
("application_number", ""),
("LA_id", "%"),
("applicant_type", "%"),
("applicant_name", ""),
("development_address", ""),
("agent_name", ""),
("status", "%"),
("startDay", "%02d" %day),
("startMonth", "%02d" %month),
("startYear", "%d" %year),
("endDay", "%02d" %day),
("endMonth", "%02d" %month),
("endYear", "%d" %year),
]

first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" %{"day": day, "month": month, "year": year}

curlobj = pycurl.Curl()
curlobj.setopt(pycurl.FOLLOWLOCATION, True)
curlobj.setopt(pycurl.MAXREDIRS, 10)


# First we do a normal post, this would happen as an AJAX query
# from the browser and just returns the number of applications found.
fakefile = StringIO.StringIO()

curlobj.setopt(pycurl.URL, self.first_post_url %(int(time.time()*1000)))
curlobj.setopt(pycurl.POST, True)
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
curlobj.setopt(pycurl.POSTFIELDS, first_post_data)

curlobj.perform()

app_count = int(fakefile.getvalue())
fakefile.close()

if app_count:
# Now we do another multipart form post
# This gives us something to use as the callback
fakefile = StringIO.StringIO()

curlobj.setopt(pycurl.URL, self.base_url)
curlobj.setopt(pycurl.HTTPPOST, post_data)
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
curlobj.setopt(pycurl.REFERER, self.referer)
curlobj.perform()

soup = BeautifulSoup(fakefile.getvalue())
# We may as well free up the memory used by fakefile
fakefile.close()

for tr in soup.table.findAll("tr")[1:]:
application = PlanningApplication()
application.date_received = search_date
application.comment_url = self.comments_email_address

tds = tr.findAll("td")

application.council_reference = tds[1].string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

application.address = tds[2].string.strip()
application.postcode = getPostcodeFromText(application.address)

# We're going to need to get the info page in order to get the description
# We can't pass a unicode string to pycurl, so we'll have to encode it.
curlobj.setopt(pycurl.URL, application.info_url.encode())
curlobj.setopt(pycurl.HTTPGET, True)

# This gives us something to use as the callback
fakefile = StringIO.StringIO()
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

curlobj.perform()
info_soup = BeautifulSoup(fakefile.getvalue())
fakefile.close()

application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = CairngormsParser()
print parser.getResults(3,10,2008)


# TODO
# Is there pagination?

+ 1
- 0
trunk/python_scrapers/OtherFilesToCopy.csv View File

@@ -63,3 +63,4 @@
"Kirklees.py", "420" "Kirklees.py", "420"
"Lichfield.py", "420" "Lichfield.py", "420"
"Leicestershire.py", "420" "Leicestershire.py", "420"
"Cairngorms.py", "420"

+ 1
- 0
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -269,3 +269,4 @@
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser" "Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser" "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser" "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"

Loading…
Cancel
Save