From 1cd560ace623ffa09c2c611dc7572aba37b34f36 Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Wed, 15 Oct 2008 22:10:05 +0000
Subject: [PATCH] Add scraper for the Cairngorms National Park.

---
 python_scrapers/Cairngorms.py        | 136 ++++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/SitesToGenerate.csv  |   1 +
 3 files changed, 138 insertions(+)
 create mode 100644 python_scrapers/Cairngorms.py

diff --git a/python_scrapers/Cairngorms.py b/python_scrapers/Cairngorms.py
new file mode 100644
index 0000000..f1c1fc4
--- /dev/null
+++ b/python_scrapers/Cairngorms.py
@@ -0,0 +1,136 @@
+"""Scraper for planning applications in the Cairngorms National Park.
+"""
+
+import time
+
+import urlparse
+import pycurl
+import StringIO
+
+import datetime
+
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+    PlanningAuthorityResults, \
+    getPostcodeFromText
+
+class CairngormsParser:
+    def __init__(self, *args):
+        self.authority_name = "Cairngorms National Park"
+        self.authority_short_name = "Cairngorms"
+        self.referer = "http://www.cairngorms.co.uk/planning/e-planning/index.php"
+
+        self.base_url = "http://www.cairngorms.co.uk/planning/e-planning/holding.php"
+
+        # The timestamp here looks like the number of milliseconds since 1970
+        self.first_post_url = "http://www.cairngorms.co.uk/planning/e-planning/search.php?timeStamp=%d"
+
+        self.comments_email_address = "planning@cairngorms.co.uk"
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        search_date = datetime.date(year, month, day)
+
+        post_data = [
+            ("CNPA_ref", ""),
+            ("application_number", ""),
+            ("LA_id", "%"),
+            ("applicant_type", "%"),
+            ("applicant_name", ""),
+            ("development_address", ""),
+            ("agent_name", ""),
+            ("status", "%"),
+            ("startDay", "%02d" % day),
+            ("startMonth", "%02d" % month),
+            ("startYear", "%d" % year),
+            ("endDay", "%02d" % day),
+            ("endMonth", "%02d" % month),
+            ("endYear", "%d" % year),
+            ]
+
+        first_post_data = "CNPA_ref=&application_number=&applicant_name=&development_address=&agent_name=&applicant_type=%%&LA_id=%%&status=%%&startYear=%(year)d&startMonth=%(month)02d&startDay=%(day)02d&endYear=%(year)d&endMonth=%(month)02d&endDay=%(day)02d" % {"day": day, "month": month, "year": year}
+
+        curlobj = pycurl.Curl()
+        curlobj.setopt(pycurl.FOLLOWLOCATION, True)
+        curlobj.setopt(pycurl.MAXREDIRS, 10)
+
+
+        # First we do a normal POST. In the browser this would happen as an
+        # AJAX query, and it just returns the number of applications found.
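+        # (The timeStamp parameter on the URL below is presumably just a
+        # cache-buster, mimicking what the site's JavaScript would send.)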
+        fakefile = StringIO.StringIO()
+
+        curlobj.setopt(pycurl.URL, self.first_post_url % (int(time.time() * 1000)))
+        curlobj.setopt(pycurl.POST, True)
+        curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+        curlobj.setopt(pycurl.POSTFIELDS, first_post_data)
+
+        curlobj.perform()
+
+        app_count = int(fakefile.getvalue())
+        fakefile.close()
+
+        if app_count:
+            # Now we do another POST, this time as a multipart form.
+            # This StringIO gives us something to use as the write callback.
+            fakefile = StringIO.StringIO()
+
+            curlobj.setopt(pycurl.URL, self.base_url)
+            curlobj.setopt(pycurl.HTTPPOST, post_data)
+            curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+            curlobj.setopt(pycurl.REFERER, self.referer)
+            curlobj.perform()
+
+            soup = BeautifulSoup(fakefile.getvalue())
+            # We may as well free up the memory used by fakefile.
+            fakefile.close()
+
+            for tr in soup.table.findAll("tr")[1:]:
+                application = PlanningApplication()
+                application.date_received = search_date
+                application.comment_url = self.comments_email_address
+
+                tds = tr.findAll("td")
+
+                application.council_reference = tds[1].string.strip()
+                application.info_url = urlparse.urljoin(self.base_url, tds[0].a['href'])
+
+                application.address = tds[2].string.strip()
+                application.postcode = getPostcodeFromText(application.address)
+
+                # We need to fetch the info page in order to get the description.
+                # We can't pass a unicode string to pycurl, so we'll have to encode it.
+                curlobj.setopt(pycurl.URL, application.info_url.encode())
+                curlobj.setopt(pycurl.HTTPGET, True)
+
+                # This gives us something to use as the write callback.
+                fakefile = StringIO.StringIO()
+                curlobj.setopt(pycurl.WRITEFUNCTION, fakefile.write)
+
+                curlobj.perform()
+                info_soup = BeautifulSoup(fakefile.getvalue())
+                fakefile.close()
+
+                application.description = info_soup.find(text="Development Details").findNext("td").string.strip()
+                application.osgb_x = info_soup.find(text="Grid Ref East").findNext("td").string.strip()
+                application.osgb_y = info_soup.find(text="Grid Ref North").findNext("td").string.strip()
+
+                self._results.addApplication(application)
+
+        return self._results
+
+    def getResults(self, day, month, year):
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+if __name__ == '__main__':
+    parser = CairngormsParser()
+    print parser.getResults(3, 10, 2008)
+
+
+# TODO
+# Is there pagination?
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index 193b34c..bc3bfb8 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -63,3 +63,4 @@
 "Kirklees.py", "420"
 "Lichfield.py", "420"
 "Leicestershire.py", "420"
+"Cairngorms.py", "420"
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index 7d7429f..adb9a18 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -269,3 +269,4 @@
 "Kirklees Council", "Kirklees", "", "Kirklees", "KirkleesParser"
 "Lichfield District Council", "Lichfield", "", "Lichfield", "LichfieldParser"
 "Leicestershire County Council", "Leicestershire", "", "Leicestershire", "LeicestershireParser"
+"Cairngorms National Park", "Cairngorms", "", "Cairngorms", "CairngormsParser"