From 1bc23b0b9ce1c20f888ef897fb2d7babdf04edd5 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sun, 30 Nov 2008 18:23:20 +0000 Subject: [PATCH] Fix Lichfield scraper. --- trunk/python_scrapers/Carmarthenshire.py | 2 +- trunk/python_scrapers/Dacorum.cgi | 1 - trunk/python_scrapers/Lichfield.py | 13 ++++++++++--- trunk/python_scrapers/Ocella.py | 6 +++--- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/trunk/python_scrapers/Carmarthenshire.py b/trunk/python_scrapers/Carmarthenshire.py index b0b3d46..9fc98fd 100644 --- a/trunk/python_scrapers/Carmarthenshire.py +++ b/trunk/python_scrapers/Carmarthenshire.py @@ -74,5 +74,5 @@ class CarmarthenshireParser: if __name__ == '__main__': parser = CarmarthenshireParser() - print parser.getResults(8,8,2008) + print parser.getResults(20,11,2008) diff --git a/trunk/python_scrapers/Dacorum.cgi b/trunk/python_scrapers/Dacorum.cgi index 48eb689..865cbf3 100644 --- a/trunk/python_scrapers/Dacorum.cgi +++ b/trunk/python_scrapers/Dacorum.cgi @@ -82,7 +82,6 @@ my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid"); # Process each row of the results foreach my $row ($table->look_down("_tag" => "tr")) { - $Writer->dataElement("test", "in for loop"); my @cells = $row->look_down("_tag" => "td"); if ($cells[0]->attr("class") eq "FormGridDataItem" || diff --git a/trunk/python_scrapers/Lichfield.py b/trunk/python_scrapers/Lichfield.py index 326572b..507fc2d 100644 --- a/trunk/python_scrapers/Lichfield.py +++ b/trunk/python_scrapers/Lichfield.py @@ -7,6 +7,7 @@ ignoring the date passed in. import urllib2 import urlparse +import re import datetime @@ -18,6 +19,8 @@ from PlanningUtils import PlanningApplication, \ date_format = "%d/%m/%Y" +date_received_re = re.compile("(\d\d?)[a-z]{2} ([a-zA-Z]*) (\d{4})") + class LichfieldParser: def __init__(self, *args): @@ -51,8 +54,12 @@ class LichfieldParser: info_response = urllib2.urlopen(application.info_url) info_soup = BeautifulSoup.BeautifulSoup(info_response.read()) - application.description = info_soup.find(text="Proposal:").findPrevious("div").contents[1].strip() - application.date_received = datetime.datetime.strptime(info_soup.find(text="Date Application Valid:").findNext("span").string.strip(), date_format).date() + application.description = info_soup.find(text="Proposal").findNext(text=True).strip() + date_received_str = info_soup.find(text="Date Application Valid").findNext(text=True).split(",")[1].strip() + + # This is a nasty botch, but the easiest way I can see to get a date out of this is to make another string and use strptime + better_date_str = "%s %s %s" %date_received_re.match(date_received_str).groups() + application.date_received = datetime.datetime.strptime(better_date_str, "%d %B %Y").date() application.comment_url = info_soup.find("a", title="Comment on this planning application.")['href'] self._results.addApplication(application) @@ -64,4 +71,4 @@ class LichfieldParser: if __name__ == '__main__': parser = LichfieldParser() - print parser.getResults(12,10,2008) + print parser.getResults(20,11,2008) diff --git a/trunk/python_scrapers/Ocella.py b/trunk/python_scrapers/Ocella.py index 61ffa99..94ff9e0 100644 --- a/trunk/python_scrapers/Ocella.py +++ b/trunk/python_scrapers/Ocella.py @@ -198,13 +198,13 @@ if __name__ == '__main__': # parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL") # parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly") -# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") + parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL") - parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") +# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly") - print parser.getResults(21,5,2008) + print parser.getResults(21,11,2008) #TODO