Alter PlanningUtils to wrap everything in CDATA, scrapping the xmlQuote function.
@@ -0,0 +1,85 @@ | |||||
import urllib2 | |||||
import urllib | |||||
import urlparse | |||||
import datetime, time | |||||
import cgi | |||||
import re | |||||
from BeautifulSoup import BeautifulSoup | |||||
from PlanningUtils import PlanningApplication, \ | |||||
PlanningAuthorityResults, \ | |||||
getPostcodeFromText | |||||
date_format = "%d/%m/%Y" | |||||
class HerefordshireParser:
    """Scraper for the Herefordshire planning application search pages.

    One search request is made for the requested date; every application
    listed in the results is then followed to its own info page to pick up
    the full description and the comment link. Applications are collected
    in a single PlanningAuthorityResults object owned by the parser.
    """

    # NOTE(review): this is a Barnsley address, which looks like a copy-paste
    # slip from another scraper. It is left unchanged because the correct
    # Herefordshire address cannot be determined from this file - confirm.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        """Set up the authority names, the search URL and the result store.

        Any positional arguments are accepted and ignored.
        """
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"
        # %(date)s is filled in twice so that the search covers a single day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"

        # No comment url template is kept here: as the info page of every
        # application is downloaded anyway, the comment link is read from it.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications listed for one day.

        day, month and year are integers. Returns the parser's
        PlanningAuthorityResults object, to which one PlanningApplication
        has been added per row of the results table. When the page reports
        that there were no matches, nothing is added.
        """
        search_day = datetime.date(year, month, day)

        # Fetch the search results page for the requested date.
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        if soup.find(text=re.compile("Sorry, no matches found")):
            # No applications for this date.
            return self._results

        # The first two rows of the table are skipped - presumably headers.
        trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

        for tr in trs:
            application = PlanningApplication()
            application.date_received = search_day
            application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
            application.council_reference = tr.a.string

            tds = tr.findAll("td")
            application.address = tds[1].string
            application.postcode = getPostcodeFromText(application.address)

            # The results table only carries an initial segment of the
            # description, so the info page has to be downloaded to get the
            # whole of it. urllib2 is used here, as for the search page, so
            # that an HTTP error status raises instead of handing an error
            # page to the parser.
            info_response = urllib2.urlopen(application.info_url)
            info_soup = BeautifulSoup(info_response.read())

            application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
            application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

            self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the XML for one day's applications.

        day, month and year may be anything int() accepts (e.g. strings).
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Manual smoke test: scrape one fixed date and dump the resulting XML.
    herefordshire = HerefordshireParser()
    print(herefordshire.getResults(31, 8, 2008))
@@ -55,3 +55,4 @@ | |||||
"Halton.py", "420" | "Halton.py", "420" | ||||
"Hampshire.py", "420" | "Hampshire.py", "420" | ||||
"Hastings.py", "420" | "Hastings.py", "420" | ||||
"Herefordshire.py", "420" |
@@ -631,7 +631,7 @@ if __name__ == '__main__': | |||||
# NOTE - 04/11/2007 is a sunday | # NOTE - 04/11/2007 is a sunday | ||||
# I'm using it to test that the scrapers behave on days with no apps. | # I'm using it to test that the scrapers behave on days with no apps. | ||||
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||||
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||||
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | ||||
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | ||||
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | ||||
@@ -639,7 +639,7 @@ if __name__ == '__main__': | |||||
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | ||||
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | ||||
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | ||||
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||||
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||||
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | ||||
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | ||||
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | ||||
@@ -655,7 +655,7 @@ if __name__ == '__main__': | |||||
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | ||||
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | ||||
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | ||||
print parser.getResults(3, 7, 2008) | |||||
print parser.getResults(4, 9, 2008) | |||||
# To Do | # To Do | ||||
@@ -4,13 +4,6 @@ import re | |||||
date_format = "%d/%m/%Y" | date_format = "%d/%m/%Y" | ||||
def xmlQuote(text):
    """Return *text* with every ampersand replaced by ``&amp;``.

    Only ampersands are handled; ``<`` and ``>`` are passed through
    untouched. The standard library's xml.sax.saxutils.escape covers
    those as well, should fuller quoting ever be wanted.
    """
    # Change &s to &amp;s
    return text.replace('&', '&amp;')
def fixNewlines(text):
    """Return *text* with each Windows line ending (CR LF) turned into LF."""
    # Cutting the text at every "\r\n" pair and gluing the pieces back
    # together with "\n" leaves a lone "\r" or "\n" exactly as it was.
    unix_newline = "\n"
    pieces = text.split("\r\n")
    return unix_newline.join(pieces)
@@ -112,13 +105,13 @@ class PlanningApplication: | |||||
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | ||||
contents = [ | contents = [ | ||||
u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference), | |||||
u"<address>%s</address>" %xmlQuote(self.address), | |||||
u"<postcode>%s</postcode>" %self.postcode, | |||||
u"<description>%s</description>" %xmlQuote(self.description), | |||||
u"<info_url>%s</info_url>" %xmlQuote(self.info_url), | |||||
u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url), | |||||
u"<date_received>%s</date_received>" %self.date_received.strftime(date_format), | |||||
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference), | |||||
u"<address><![CDATA[%s]]></address>" %(self.address), | |||||
u"<postcode><![CDATA[%s]]></postcode>" %self.postcode, | |||||
u"<description><![CDATA[%s]]></description>" %(self.description), | |||||
u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url), | |||||
u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url), | |||||
u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format), | |||||
] | ] | ||||
if self.osgb_x: | if self.osgb_x: | ||||
contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | ||||
@@ -259,3 +259,4 @@ | |||||
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | ||||
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | ||||
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | ||||
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" |