Pārlūkot izejas kodu

Add scraper for the Cairngorms National Park.

master
duncan.parkes pirms 16 gadiem
vecāks
revīzija
1cd560ace6
3 mainītis faili ar 136 papildinājumiem un 0 dzēšanām
  1. +134
    -0
      python_scrapers/Cairngorms.py
  2. +1
    -0
      python_scrapers/OtherFilesToCopy.csv
  3. +1
    -0
      python_scrapers/SitesToGenerate.csv

+ 134
- 0
python_scrapers/Cairngorms.py Parādīt failu

@@ -0,0 +1,134 @@
"""
"""

import time

import urlparse
import pycurl
import StringIO

import datetime


from BeautifulSoup import BeautifulSoup

from PlanningUtils import PlanningApplication, \
PlanningAuthorityResults, \
getPostcodeFromText

class CairngormsParser:
def __init__(self, *args):
self.authority_name = "Cairngorms National Park"
self.authority_short_name = "Cairngorms"
self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"

self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"

# The timestamp here looks like the number of milliseconds since 1970
self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"

self.comments_email_address = "planning@cairngorms.co.uk"

self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)


def getResultsByDayMonthYear(self, day, month, year):
search_date = datetime.date(year, month, day)

post_data = [
("CNPA_ref", ""),
("application_number", ""),
("LA_id", "%"),
("applicant_type", "%"),
("applicant_name", ""),
("development_address", ""),
("agent_name", ""),
("status", "%"),
("startDay", "%02d" %day),
("startMonth", "%02d" %month),
("startYear", "%d" %year),
("endDay", "%02d" %day),
("endMonth", "%02d" %month),
("endYear", "%d" %year),
]

first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" %{"day": day, "month": month, "year": year}

curlobj = pycurl.Curl()
curlobj.setopt(pycurl.FOLLOWLOCATION, True)
curlobj.setopt(pycurl.MAXREDIRS, 10)


# First we do a normal post, this would happen as an AJAX query
# from the browser and just returns the number of applications found.
fakefile = StringIO.StringIO()

curlobj.setopt(pycurl.URL, self.first_post_url %(int(time.time()*1000)))
curlobj.setopt(pycurl.POST, True)
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
curlobj.setopt(pycurl.POSTFIELDS, first_post_data)

curlobj.perform()

app_count = int(fakefile.getvalue())
fakefile.close()

if app_count:
# Now we do another multipart form post
# This gives us something to use as the callback
fakefile = StringIO.StringIO()

curlobj.setopt(pycurl.URL, self.base_url)
curlobj.setopt(pycurl.HTTPPOST, post_data)
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
curlobj.setopt(pycurl.REFERER, self.referer)
curlobj.perform()

soup = BeautifulSoup(fakefile.getvalue())
# We may as well free up the memory used by fakefile
fakefile.close()

for tr in soup.table.findAll("tr")[1:]:
application = PlanningApplication()
application.date_received = search_date
application.comment_url = self.comments_email_address

tds = tr.findAll("td")

application.council_reference = tds[1].string.strip()
application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])

application.address = tds[2].string.strip()
application.postcode = getPostcodeFromText(application.address)

# We're going to need to get the info page in order to get the description
# We can't pass a unicode string to pycurl, so we'll have to encode it.
curlobj.setopt(pycurl.URL, application.info_url.encode())
curlobj.setopt(pycurl.HTTPGET, True)

# This gives us something to use as the callback
fakefile = StringIO.StringIO()
curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)

curlobj.perform()
info_soup = BeautifulSoup(fakefile.getvalue())
fakefile.close()

application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()

self._results.addApplication(application)

return self._results

def getResults(self, day, month, year):
return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()

if __name__ == '__main__':
parser = CairngormsParser()
print parser.getResults(3,10,2008)


# TODO
# Is there pagination?

+ 1
- 0
python_scrapers/OtherFilesToCopy.csv Parādīt failu

@@ -63,3 +63,4 @@
"Kirklees.py", "420"
"Lichfield.py", "420"
"Leicestershire.py", "420"
"Cairngorms.py", "420"

+ 1
- 0
python_scrapers/SitesToGenerate.csv Parādīt failu

@@ -269,3 +269,4 @@
"Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
"Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
"Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"

Notiek ielāde…
Atcelt
Saglabāt