Alter PlanningUtils to wrap everything in CDATA, scrapping the xmlQuote function. [import/raw]
@@ -0,0 +1,85 @@ | |||
import urllib2 | |||
import urllib | |||
import urlparse | |||
import datetime, time | |||
import cgi | |||
import re | |||
from BeautifulSoup import BeautifulSoup | |||
from PlanningUtils import PlanningApplication, \ | |||
PlanningAuthorityResults, \ | |||
getPostcodeFromText | |||
date_format = "%d/%m/%Y" | |||
class HerefordshireParser:
    """Scraper for Herefordshire County Council planning applications.

    For a given day it downloads the council's search-results page, then
    follows each application's info link to pick up the full proposal
    text and the comment-page URL.  Applications are collected in a
    PlanningAuthorityResults object created once per parser instance.
    """

    # NOTE(review): this is a Barnsley address and it is not referenced
    # anywhere in this class -- looks like a copy/paste leftover.  Left
    # unchanged because the correct Herefordshire address is not known here.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        """Set up the authority names, the search URL template and the
        (initially empty) results container.  Positional arguments are
        accepted but ignored.
        """
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"

        # %(date)s is filled in (twice) with the search day, so the query
        # covers exactly one day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"

        # As we are going to the info page anyway, the comment url is
        # picked up from there rather than built from a template such as
        # http://www.herefordshire.gov.uk/gis/planDetailCommentAddress.aspx?ApplicationId=%s

        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def _fetch_soup(self, url):
        """Download *url* and return its body parsed with BeautifulSoup.

        The response is always closed, even if reading or parsing fails.
        """
        response = urllib2.urlopen(url)
        try:
            return BeautifulSoup(response.read())
        finally:
            response.close()

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape the applications received on the given date.

        day, month and year are integers.  Each application found is
        added to self._results, which is then returned; because that
        container lives on the instance, repeated calls accumulate
        applications in the same object.
        """
        search_day = datetime.date(year, month, day)

        # The search is a plain GET; the date goes into the query string.
        soup = self._fetch_soup(self.base_url % {"date": search_day.strftime(date_format)})

        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were apps for this date.
            # The first two rows of the table are skipped (presumably
            # header rows -- confirm against the live page).
            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day

                # The first link in the row points at the info page and
                # its text is the council reference.
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string

                tds = tr.findAll("td")
                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)

                # The results table only carries an initial segment of
                # the description, so the info page has to be downloaded
                # to get the full text.
                info_soup = self._fetch_soup(application.info_url)

                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """Return the results for the given date as produced by
        displayXML().  day, month and year are passed through int(), so
        numeric strings are accepted.
        """
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Manual check when run as a script: print what getResults returns
    # for 31 August 2008.
    herefordshire = HerefordshireParser()
    print(herefordshire.getResults(31, 8, 2008))
@@ -55,3 +55,4 @@ | |||
"Halton.py", "420" | |||
"Hampshire.py", "420" | |||
"Hastings.py", "420" | |||
"Herefordshire.py", "420" |
@@ -631,7 +631,7 @@ if __name__ == '__main__': | |||
# NOTE - 04/11/2007 is a sunday | |||
# I'm using it to test that the scrapers behave on days with no apps. | |||
parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||
# parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||
# parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | |||
# parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | |||
# parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | |||
@@ -639,7 +639,7 @@ if __name__ == '__main__': | |||
# parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | |||
# parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | |||
# parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | |||
# parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||
parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||
# parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | |||
# parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | |||
# parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | |||
@@ -655,7 +655,7 @@ if __name__ == '__main__': | |||
# parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | |||
# parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | |||
# parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | |||
print parser.getResults(3, 7, 2008) | |||
print parser.getResults(4, 9, 2008) | |||
# To Do | |||
@@ -4,13 +4,6 @@ import re | |||
date_format = "%d/%m/%Y" | |||
def xmlQuote(text):
    """Return *text* with every ampersand replaced by ``&amp;``.

    Only ``&`` is escaped; ``<`` and ``>`` are left untouched.  The
    standard library's xml.sax.saxutils.escape() covers all three if
    fuller escaping is ever needed.
    """
    # The replacement has to be the entity itself: replacing '&' with
    # '&' is a no-op and leaves bare ampersands in the XML output.
    return text.replace('&', '&amp;')
def fixNewlines(text):
    """Return *text* with every Windows line ending (CRLF) turned into LF."""
    # Splitting on the CRLF pair and re-joining with a bare newline gives
    # the same result as a direct substring replacement.
    pieces = text.split("\r\n")
    return "\n".join(pieces)
@@ -112,13 +105,13 @@ class PlanningApplication: | |||
#print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | |||
contents = [ | |||
u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference), | |||
u"<address>%s</address>" %xmlQuote(self.address), | |||
u"<postcode>%s</postcode>" %self.postcode, | |||
u"<description>%s</description>" %xmlQuote(self.description), | |||
u"<info_url>%s</info_url>" %xmlQuote(self.info_url), | |||
u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url), | |||
u"<date_received>%s</date_received>" %self.date_received.strftime(date_format), | |||
u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference), | |||
u"<address><![CDATA[%s]]></address>" %(self.address), | |||
u"<postcode><![CDATA[%s]]></postcode>" %self.postcode, | |||
u"<description><![CDATA[%s]]></description>" %(self.description), | |||
u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url), | |||
u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url), | |||
u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format), | |||
] | |||
if self.osgb_x: | |||
contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | |||
@@ -259,3 +259,4 @@ | |||
"Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | |||
"Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | |||
"Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | |||
"Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" |