Alter PlanningUtils to CDATA everything, scrapping the xmlQuote function. (branch: master)
| @@ -0,0 +1,85 @@ | |||
| import urllib2 | |||
| import urllib | |||
| import urlparse | |||
| import datetime, time | |||
| import cgi | |||
| import re | |||
| from BeautifulSoup import BeautifulSoup | |||
| from PlanningUtils import PlanningApplication, \ | |||
| PlanningAuthorityResults, \ | |||
| getPostcodeFromText | |||
| date_format = "%d/%m/%Y" | |||
class HerefordshireParser:
    """Scraper for planning applications published by Herefordshire Council.

    Fetches the council's GIS search page for a single day, then follows
    each result's info page to pick up the full description and the
    comment URL.
    """

    # NOTE(review): this looks copy-pasted from the Barnsley scraper --
    # confirm the correct comments address for Herefordshire.
    comments_email_address = "Developmentcontrol@barnsley.gov.uk"

    def __init__(self, *args):
        self.authority_name = "Herefordshire County Council"
        self.authority_short_name = "Herefordshire"
        # The search URL takes the same date as both startdate and enddate,
        # so one request returns all applications received on a single day.
        self.base_url = "http://www.herefordshire.gov.uk/gis/planListResults.aspx?pc=&address=&querytype=current&startdate=%(date)s&enddate=%(date)s&startrecord=0"
        # As we are going to the info page anyway, the comment url is picked
        # up from there rather than built from a template here.
        self._results = PlanningAuthorityResults(self.authority_name, self.authority_short_name)

    def getResultsByDayMonthYear(self, day, month, year):
        """Collect all applications received on the given date.

        Returns the accumulated PlanningAuthorityResults object.
        """
        search_day = datetime.date(year, month, day)
        # The search is a plain GET; the previously-built (and unused)
        # POST data has been removed as dead code.
        response = urllib2.urlopen(self.base_url % {"date": search_day.strftime(date_format)})
        soup = BeautifulSoup(response.read())
        if not soup.find(text=re.compile("Sorry, no matches found")):
            # There were applications for this date.
            # The first two rows of the results table are headers - skip them.
            trs = soup.find("table", {"class": "gis_table"}).findAll("tr")[2:]
            for tr in trs:
                application = PlanningApplication()
                application.date_received = search_day
                application.info_url = urlparse.urljoin(self.base_url, tr.a['href'])
                application.council_reference = tr.a.string
                tds = tr.findAll("td")
                application.address = tds[1].string
                application.postcode = getPostcodeFromText(application.address)
                # The results table only carries a truncated description,
                # so fetch the full text (and the comment url) from the
                # application's info page.
                info_response = urllib.urlopen(application.info_url)
                info_soup = BeautifulSoup(info_response.read())
                application.description = info_soup.find(text="Proposal:").findNext("td").string.strip()
                application.comment_url = urlparse.urljoin(self.base_url, info_soup.find("a", title="Link to Planning Application Comment page")['href'])
                self._results.addApplication(application)
        return self._results

    def getResults(self, day, month, year):
        """Convenience wrapper: scrape one day and return the results as XML."""
        return self.getResultsByDayMonthYear(int(day), int(month), int(year)).displayXML()
if __name__ == '__main__':
    # Smoke test: scrape a single known day's applications and print the XML.
    parser = HerefordshireParser()
    print parser.getResults(31,8,2008)
| @@ -55,3 +55,4 @@ | |||
| "Halton.py", "420" | |||
| "Hampshire.py", "420" | |||
| "Hastings.py", "420" | |||
| "Herefordshire.py", "420" | |||
| @@ -631,7 +631,7 @@ if __name__ == '__main__': | |||
| # NOTE - 04/11/2007 is a sunday | |||
| # I'm using it to test that the scrapers behave on days with no apps. | |||
| parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||
| # parser = BlackburnParser("Blackburn With Darwen Borough Council", "Blackburn", "http://195.8.175.6/") | |||
| # parser = BroadlandParser("Broadland Council", "Broadland", "http://www.broadland.gov.uk/") | |||
| # parser = CamdenParser("London Borough of Camden", "Camden", "http://planningrecords.camden.gov.uk/") | |||
| # parser = CharnwoodParser("Charnwood Borough Council", "Charnwood", "http://portal.charnwoodbc.gov.uk/") | |||
| @@ -639,7 +639,7 @@ if __name__ == '__main__': | |||
| # parser = EastStaffsParser("East Staffordshire Borough Council", "East Staffs", "http://www2.eaststaffsbc.gov.uk/") | |||
| # parser = EppingForestParser("Epping Forest District Council", "Epping Forest", "http://plan1.eppingforestdc.gov.uk/") | |||
| # parser = ForestHeathParser("Forest Heath District Council", "Forest Heath", "http://195.171.177.73/") | |||
| # parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||
| parser = HackneyParser("London Borough of Hackney", "Hackney", "http://www.hackney.gov.uk/servapps/") | |||
| # parser = KennetParser("Kennet District Council", "Kennet", "http://mvm-planning.kennet.gov.uk/") | |||
| # parser = LincolnParser("Lincoln City Council", "Lincoln", "http://online.lincoln.gov.uk/") | |||
| # parser = LiverpoolParser("Liverpool City Council", "Liverpool", "http://www.liverpool.gov.uk/") | |||
| @@ -655,7 +655,7 @@ if __name__ == '__main__': | |||
| # parser = WalthamForestParser("Waltham Forest", "Waltham Forest", "http://planning.walthamforest.gov.uk/") | |||
| # parser = ConwyParser("Conwy County Borough Council", "Conwy", "http://www.conwy.gov.uk/") | |||
| # parser = MertonParser("London Borough of Merton", "Merton", "http://planning.merton.gov.uk") | |||
| print parser.getResults(3, 7, 2008) | |||
| print parser.getResults(4, 9, 2008) | |||
| # To Do | |||
| @@ -4,13 +4,6 @@ import re | |||
| date_format = "%d/%m/%Y" | |||
def xmlQuote(text):
    """Escape text for safe inclusion in XML character data.

    Uses the standard library's escape (the "standard python function"
    the original comment was looking for), which handles '&', '<' and
    '>'.  As transcribed, the original body replaced '&' with itself
    and so did nothing at all.
    """
    from xml.sax.saxutils import escape
    return escape(text)
def fixNewlines(text):
    """Normalise Windows CRLF line endings to Unix LF."""
    return "\n".join(text.split("\r\n"))
| @@ -112,13 +105,13 @@ class PlanningApplication: | |||
| #print self.council_reference, self.address, self.postcode, self.description, self.info_url, self.comment_url, self.date_received | |||
| contents = [ | |||
| u"<council_reference>%s</council_reference>" %xmlQuote(self.council_reference), | |||
| u"<address>%s</address>" %xmlQuote(self.address), | |||
| u"<postcode>%s</postcode>" %self.postcode, | |||
| u"<description>%s</description>" %xmlQuote(self.description), | |||
| u"<info_url>%s</info_url>" %xmlQuote(self.info_url), | |||
| u"<comment_url>%s</comment_url>" %xmlQuote(self.comment_url), | |||
| u"<date_received>%s</date_received>" %self.date_received.strftime(date_format), | |||
| u"<council_reference><![CDATA[%s]]></council_reference>" %(self.council_reference), | |||
| u"<address><![CDATA[%s]]></address>" %(self.address), | |||
| u"<postcode><![CDATA[%s]]></postcode>" %self.postcode, | |||
| u"<description><![CDATA[%s]]></description>" %(self.description), | |||
| u"<info_url><![CDATA[%s]]></info_url>" %(self.info_url), | |||
| u"<comment_url><![CDATA[%s]]></comment_url>" %(self.comment_url), | |||
| u"<date_received><![CDATA[%s]]></date_received>" %self.date_received.strftime(date_format), | |||
| ] | |||
| if self.osgb_x: | |||
| contents.append(u"<osgb_x>%s</osgb_x>" %(self.osgb_x)) | |||
| @@ -259,3 +259,4 @@ | |||
| "Halton Borough Council", "Halton", "", "Halton", "HaltonParser" | |||
| "Hampshire County Council", "Hampshire", "", "Hampshire", "HampshireParser" | |||
| "Hastings Borough Council", "Hastings", "", "Hastings", "HastingsParser" | |||
| "Herefordshire County Council", "Herefordshire", "", "Herefordshire", "HerefordshireParser" | |||