Alter PlanningUtils to CDATA everything, scrapping the xmlQuote function.
| @@ -0,0 +1,85 @@ | |||||
| import urllib2 | |||||
| import urllib | |||||
| import urlparse | |||||
| import datetime, time | |||||
| import cgi | |||||
| import re | |||||
| from BeautifulSoup import BeautifulSoup | |||||
| from PlanningUtils import PlanningApplication, \ | |||||
| PlanningAuthorityResults, \ | |||||
| getPostcodeFromText | |||||
# strftime format for dates in the council's query-string URLs (dd/mm/yyyy).
date_format = "%d/%m/%Y"
class HerefordshireParser:
    """Scraper for planning applications listed on Herefordshire Council's
    GIS search site.  Results are accumulated in a PlanningAuthorityResults
    and can be rendered to XML via getResults()."""

    # NOTE(review): this is a barnsley.gov.uk address in the Herefordshire
    # scraper -- looks like a copy-paste from another parser.  Confirm the
    # correct comments address for Herefordshire before relying on it.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"
        # The same date is substituted for both startdate and enddate so a
        # single request returns exactly one day's applications.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
        # As we are going to the info page anyway, we pick up the comment url
        # from there rather than building it from a template.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Scrape all applications received on the given date.

        Fetches the day's results list, then follows each application's info
        page for the full description and comment url.  Returns the
        accumulated PlanningAuthorityResults."""
        search_day = datetime.date(year, month, day)

        # Plain GET of the search results page for this day.
        # (A previously-built post_data payload was never sent -- removed.)
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())

        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were apps for this date; skip the two header rows.
            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]

            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string

                tds = tr.findAll("td")
                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)

                # The list page only carries an initial segment of the
                # description, so we must download the info page for the
                # full text (and the comment url).
                info_response = urllib2.urlopen(application.info_url)
                info_soup = BeautifulSoup(info_response.read())
                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])

                self._results.addApplication(application)

        return self._results

    def getResults(self, day, month, year):
        """String-friendly wrapper: coerce arguments to int and return the
        day's results rendered as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Smoke test: scrape a known date and print the resulting XML.
    parser = HerefordshireParser()
    print parser.getResults(31,8,2008)
| @@ -55,3 +55,4 @@ | |||||
| "Halton.py", "420" | "Halton.py", "420" | ||||
| "Hampshire.py", "420" | "Hampshire.py", "420" | ||||
| "Hastings.py", "420" | "Hastings.py", "420" | ||||
| "Herefordshire.py", "420" | |||||
| @@ -631,7 +631,7 @@ if __name__ == '__main__': | |||||
| # NOTE - 04/11/2007 is a sunday | # NOTE - 04/11/2007 is a sunday | ||||
| # I'm using it to test that the scrapers behave on days with no apps. | # I'm using it to test that the scrapers behave on days with no apps. | ||||
| parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||||
| # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||||
| # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | ||||
| # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | ||||
| # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | ||||
| @@ -639,7 +639,7 @@ if __name__ == '__main__': | |||||
| # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | ||||
| # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | ||||
| # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | ||||
| # parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||||
| parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||||
| # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | ||||
| # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | ||||
| # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | ||||
| @@ -655,7 +655,7 @@ if __name__ == '__main__': | |||||
| # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | ||||
| # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | ||||
| # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | ||||
| print parser.getResults(3, 7, 2008) | |||||
| print parser.getResults(4, 9, 2008) | |||||
| # To Do | # To Do | ||||
| @@ -4,13 +4,6 @@ import re | |||||
| date_format = "%d/%m/%Y" | date_format = "%d/%m/%Y" | ||||
| def xmlQuote(text): | |||||
| # Change &s to &s | |||||
| # I suspect there is probably some standard python | |||||
| # function I should be using for this... | |||||
| return text.replace('&', '&') | |||||
| def fixNewlines(text): | def fixNewlines(text): | ||||
| # This can be used to sort out windows newlines | # This can be used to sort out windows newlines | ||||
| return text.replace("\r\n","\n") | return text.replace("\r\n","\n") | ||||
| @@ -112,13 +105,13 @@ class PlanningApplication: | |||||
| #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | ||||
| contents = [ | contents = [ | ||||
| u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference), | |||||
| u"<address>%s</address>" %xmlQuote(self.address), | |||||
| u"<postcode>%s</postcode>" %self.postcode, | |||||
| u"<description>%s</description>" %xmlQuote(self.description), | |||||
| u"<info_url>%s</info_url>" %xmlQuote(self.info_url), | |||||
| u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url), | |||||
| u"<date_received>%s</date_received>" %self.date_received.strftime(date_format), | |||||
| u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference), | |||||
| u"<address><![CDATA[%s]]></address>" %(self.address), | |||||
| u"<postcode><![CDATA[%s]]></postcode>" %self.postcode, | |||||
| u"<description><![CDATA[%s]]></description>" %(self.description), | |||||
| u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url), | |||||
| u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url), | |||||
| u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format), | |||||
| ] | ] | ||||
| if self.osgb_x: | if self.osgb_x: | ||||
| contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | ||||
| @@ -259,3 +259,4 @@ | |||||
| "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | ||||
| "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | ||||
| "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | ||||
| "Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" | |||||