From e74b8cb63130d71ee302c51efdcbf0b782d9b45a Mon Sep 17 00:00:00 2001
From: "duncan.parkes"
Date: Sat, 24 Nov 2007 16:03:45 +0000
Subject: [PATCH] Add a parser for sites with urls ending in searchPageLoad.do

This includes:

Cumbria
Lincolnshire
West Sussex

It should also include:

Dorset
Somerset

but they are both down.

All of these sites are county councils which are only responsible for
a small number of apps, many of which are not at sites which have
postcodes, so don't expect too much from these scrapers!
---
 python_scrapers/AtriumePlanning.py   | 144 +++++++++++++++++++++++++++
 python_scrapers/OtherFilesToCopy.csv |   1 +
 python_scrapers/PlanningUtils.py     |  15 +++
 python_scrapers/SitesToGenerate.csv  |   3 +
 4 files changed, 163 insertions(+)
 create mode 100644 python_scrapers/AtriumePlanning.py

diff --git a/python_scrapers/AtriumePlanning.py b/python_scrapers/AtriumePlanning.py
new file mode 100644
index 0000000..dda04a0
--- /dev/null
+++ b/python_scrapers/AtriumePlanning.py
@@ -0,0 +1,144 @@
+import urllib2
+import urllib
+import urlparse
+
+import datetime
+#import re
+
+from BeautifulSoup import BeautifulSoup
+
+from PlanningUtils import PlanningApplication, \
+                          PlanningAuthorityResults, \
+                          getPostcodeFromText
+
+
+info_path = "loadFullDetails.do"
+comment_path = "loadRepresentation.do"
+
+class AtriumePlanningParser:
+    """Scraper for planning sites searched by posting to loadResults.do.
+
+    base_url is the url the search is posted to. The info and comment
+    urls for each application are built relative to it.
+    """
+
+    def __init__(self,
+                 authority_name,
+                 authority_short_name,
+                 base_url,
+                 debug=False):
+
+        self.authority_name = authority_name
+        self.authority_short_name = authority_short_name
+        self.base_url = base_url
+
+        self.info_url = urlparse.urljoin(base_url, info_path)
+        self.comment_url = urlparse.urljoin(base_url, comment_path)
+
+        self.debug = debug
+
+        # The application currently being assembled from the results
+        # table, or None when we are between applications.
+        self._current_application = None
+
+        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)
+
+
+    def getResultsByDayMonthYear(self, day, month, year):
+        """Search for the applications registered on the given date.
+
+        day, month and year are ints. Each complete application found
+        is added to self._results, which is returned.
+        """
+
+        # The end date for the search needs to be one day after the start
+        # date - presumably the date is used as a timestamp at midnight
+        search_start_date = datetime.date(year, month, day)
+        search_end_date = search_start_date + datetime.timedelta(1)
+
+        # NOTE(review): %b is locale dependent - the sites presumably want
+        # English month abbreviations, so check the locale we run under.
+        search_data = urllib.urlencode({"dayRegStart": search_start_date.strftime("%d"),
+                                        "monthRegStart": search_start_date.strftime("%b"),
+                                        "yearRegStart": search_start_date.strftime("%Y"),
+                                        "dayRegEnd": search_end_date.strftime("%d"),
+                                        "monthRegEnd": search_end_date.strftime("%b"),
+                                        "yearRegEnd": search_end_date.strftime("%Y"),
+                                        "searchType": "current",
+                                        "dispatch": "Search"
+                                        })
+
+        response = urllib2.urlopen(self.base_url, search_data)
+        try:
+            html = response.read()
+        finally:
+            # Don't leak the connection, even if the read fails.
+            response.close()
+
+        soup = BeautifulSoup(html)
+
+        # Don't carry a half built application over from an earlier search.
+        self._current_application = None
+
+        # Get a list of the tds in the results table
+        results_heading = soup.find(text="Results")
+        if results_heading:
+
+            tds = results_heading.parent.findNext("table").findAll("td")
+
+            for td in tds:
+                if td.string:
+                    label = td.string.strip()
+
+                    if label == "Date Registered":
+                        # We are starting a new App
+                        self._current_application = PlanningApplication()
+                        self._current_application.date_received = datetime.datetime.strptime(td.findNext("td").string.strip(), "%d-%m-%Y")
+                    elif self._current_application is None:
+                        # We haven't seen the start of an application
+                        # yet, so there is nothing to fill in.
+                        continue
+                    elif label == "Application Number":
+                        self._current_application.council_reference = td.findNext("td").string
+                    elif label == "Location":
+                        location = td.findNext("td").string
+                        self._current_application.address = location
+
+                        postcode = getPostcodeFromText(location)
+                        if postcode:
+                            self._current_application.postcode = postcode
+                    elif label == "Proposal":
+                        self._current_application.description = td.findNext("td").string
+                elif self._current_application is not None \
+                        and td.a and td.a.string \
+                        and td.a.string.strip() == "View Full Details":
+                    # The info url is td.a
+                    messy_info_url = td.a["href"]
+
+                    # We need to get an id out of this url
+                    query_str = urlparse.urlsplit(messy_info_url)[3]
+
+                    self._current_application.info_url = self.info_url + "?" + query_str
+                    self._current_application.comment_url = self.comment_url + "?" + query_str
+
+                    if self._current_application.is_ready():
+                        self._results.addApplication(self._current_application)
+                        # Done with this one - make sure it can't be added twice.
+                        self._current_application = None
+
+        return self._results
+
+
+    def getResults(self, day, month, year):
+        """Return displayXML() of the results for the given date (ints or digit strings)."""
+        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
+
+
+#if __name__ == '__main__':
+#    cumbria_parser = AtriumePlanningParser("Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do")
+
+#    print cumbria_parser.getResults(22,11,2007)
+#    lincolnshire_parser = AtriumePlanningParser("Lincolnshire County Council", "Lincolnshire", "http://apps1.lincolnshire.gov.uk/ePlanning/loadResults.do")
+
+#    print lincolnshire_parser.getResults(22,11,2007)
+
diff --git a/python_scrapers/OtherFilesToCopy.csv b/python_scrapers/OtherFilesToCopy.csv
index b95ed11..2883758 100644
--- a/python_scrapers/OtherFilesToCopy.csv
+++ b/python_scrapers/OtherFilesToCopy.csv
@@ -12,3 +12,4 @@
 "NorthHerts.cgi", "493"
 "Enfield.cgi", "493"
 "RutlandLike.py", "420"
+"AtriumePlanning.py", "420"
\ No newline at end of file
diff --git a/python_scrapers/PlanningUtils.py b/python_scrapers/PlanningUtils.py
index 9210446..273a652 100644
--- a/python_scrapers/PlanningUtils.py
+++ b/python_scrapers/PlanningUtils.py
@@ -88,6 +88,21 @@ class PlanningApplication:
 
     def __repr__(self):
         return self.displayXML()
+
+    def is_ready(self):
+        """Return True if the application is complete.
+
+        Because of the postcode default, we can't really
+        check the postcode - make sure it is filled in when
+        you do the address.
+        """
+        return bool(self.council_reference
+                    and self.address
+                    and self.description
+                    and self.info_url
+                    and self.comment_url
+                    and self.date_received)
+
     def displayXML(self):
         #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received
 
diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv
index c11b358..12a7f1f 100644
--- a/python_scrapers/SitesToGenerate.csv
+++ b/python_scrapers/SitesToGenerate.csv
@@ -141,3 +141,6 @@
 "Melton Borough Council", "Melton", "http://www.meltononline.co.uk/planning/searchparam.asp", "RutlandLike", "RutlandLikeParser"
 "Harborough District Council", "Harborough", "http://pa.harborough.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
 "East Northamptonshire Council", "East Northants", "http://publicaccesssrv.east-northamptonshire.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
+"Cumbria County Council", "Cumbria", "http://217.114.50.149:7778/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
+"Lincolnshire County Council", "Lincolnshire", "http://apps1.lincolnshire.gov.uk/ePlanning/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"
+"West Sussex County Council", "West Sussex", "http://eplanning.westsussex.gov.uk/ePlanningOPS/loadResults.do", "AtriumePlanning", "AtriumePlanningParser"