Browse Source

Fix Lichfield scraper.

import/raw
duncan.parkes 16 years ago
parent
commit
1bc23b0b9c
4 changed files with 14 additions and 8 deletions
  1. +1
    -1
      trunk/python_scrapers/Carmarthenshire.py
  2. +0
    -1
      trunk/python_scrapers/Dacorum.cgi
  3. +10
    -3
      trunk/python_scrapers/Lichfield.py
  4. +3
    -3
      trunk/python_scrapers/Ocella.py

+ 1
- 1
trunk/python_scrapers/Carmarthenshire.py View File

@@ -74,5 +74,5 @@ class CarmarthenshireParser:


 if __name__ == '__main__':
 parser = CarmarthenshireParser()
-print parser.getResults(8,8,2008)
+print parser.getResults(20,11,2008)



+ 0
- 1
trunk/python_scrapers/Dacorum.cgi View File

@@ -82,7 +82,6 @@ my $table = $page->look_down("_tag" => "table", "class" => "FormDataGrid");
 # Process each row of the results
 foreach my $row ($table->look_down("_tag" => "tr"))
 {
-$Writer->dataElement("test", "in for loop");
 my @cells = $row->look_down("_tag" => "td");


 if ($cells[0]->attr("class") eq "FormGridDataItem" ||


+ 10
- 3
trunk/python_scrapers/Lichfield.py View File

@@ -7,6 +7,7 @@ ignoring the date passed in.


 import urllib2
 import urlparse
+import re


 import datetime


@@ -18,6 +19,8 @@ from PlanningUtils import PlanningApplication, \


 date_format = "%d/%m/%Y"


+date_received_re = re.compile("(\d\d?)[a-z]{2} ([a-zA-Z]*) (\d{4})")
+
 class LichfieldParser:
 def __init__(self, *args):


@@ -51,8 +54,12 @@ class LichfieldParser:
 info_response = urllib2.urlopen(application.info_url)
 info_soup = BeautifulSoup.BeautifulSoup(info_response.read())


-application.description = info_soup.find(text="Proposal:").findPrevious("div").contents[1].strip()
-application.date_received = datetime.datetime.strptime(info_soup.find(text="Date Application Valid:").findNext("span").string.strip(), date_format).date()
+application.description = info_soup.find(text="Proposal").findNext(text=True).strip()
+date_received_str = info_soup.find(text="Date Application Valid").findNext(text=True).split(",")[1].strip()
+
+# This is a nasty botch, but the easiest way I can see to get a date out of this is to make another string and use strptime
+better_date_str = "%s %s %s" %date_received_re.match(date_received_str).groups()
+application.date_received = datetime.datetime.strptime(better_date_str, "%d %B %Y").date()
 application.comment_url = info_soup.find("a", title="Comment on this planning application.")['href']


 self._results.addApplication(application)
@@ -64,4 +71,4 @@ class LichfieldParser:


 if __name__ == '__main__':
 parser = LichfieldParser()
-print parser.getResults(12,10,2008)
+print parser.getResults(20,11,2008)

+ 3
- 3
trunk/python_scrapers/Ocella.py View File

@@ -198,13 +198,13 @@ if __name__ == '__main__':
 # parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4178&_dad=portal&_schema=PORTAL")
 # parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,64104&_dad=portal&_schema=PORTAL")
 # parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page/portal/plan/weekly")
-# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
+parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL")
 # parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL")
 # parser = OcellaParser("Castle Point", "Castle Point", "http://planning.castlepoint.gov.uk/portal/page?_pageid=35,38205&_dad=portal&_schema=PORTAL")
-parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")
+# parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/weekly")




-print parser.getResults(21,5,2008)
+print parser.getResults(21,11,2008)


 #TODO




Loading…
Cancel
Save