From fff87cdfb31a8eb94d2e2ef7b018bb4dbf54a5d8 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Mon, 9 Jun 2008 16:15:55 +0000 Subject: [PATCH] Working scrapers for 8 Ocella sites. --- trunk/python_scrapers/Ocella.py | 26 +++++++++++++++------- trunk/python_scrapers/OtherFilesToCopy.csv | 3 ++- trunk/python_scrapers/SitesToGenerate.csv | 8 +++++++ 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/trunk/python_scrapers/Ocella.py b/trunk/python_scrapers/Ocella.py index 2c4585d..4709f67 100644 --- a/trunk/python_scrapers/Ocella.py +++ b/trunk/python_scrapers/Ocella.py @@ -67,7 +67,13 @@ class OcellaParser: # We need to find where the post action goes action = get_soup.form['action'] - session_id = get_soup.find('input', {'name': 'p_session_id'})['value'] + + try: + session_id = get_soup.find('input', {'name': 'p_session_id'})['value'] + except TypeError: + # In the case of Middlesbrough, there is no session cookie, + # but it seems we don't need it... + session_id = None # # From Breckland @@ -176,18 +182,22 @@ if __name__ == '__main__': # parser = OcellaParser("Arun", "Arun", "http://www.arun.gov.uk/iplanning/portal/page?_pageid=33,4139&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Ellesmere Port", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL") -# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Fareham", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL") # parser = OcellaParser("Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("North East Lincolnshire", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Middlesbrough", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL") +# parser = OcellaParser("Uttlesford", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL") - # Bad status line? -# parser = BrecklandParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") -# parser = ArunParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") - # Can't find the URL similar to the others, even though it is clearly Ocella - parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://www.great-yarmouth.gov.uk/wmplan_application_search-6.htm") + # Bad status line? Try changing browser id string? +# parser = OcellaParser("Bridgend", "Bridgend", "http://eplan.bridgend.gov.uk:7778/portal/page?_pageid=55,31779&_dad=portal&_schema=PORTAL") +# Post never comes back +# parser = OcellaParser("Havering", "Havering", "http://planning.havering.gov.uk/portal/page?_pageid=33,1026&_dad=portal&_schema=PORTAL") + + # Can't find the URL similar to the others, even though it is clearly Ocella + # We get a 406 at the moment. Try browser id string? + parser = OcellaParser("Great Yarmouth", "Great Yarmouth", "http://planning.great-yarmouth.gov.uk/portal/page/portal/plan/search") print parser.getResults(21,5,2008) diff --git a/trunk/python_scrapers/OtherFilesToCopy.csv b/trunk/python_scrapers/OtherFilesToCopy.csv index 222cc80..d2ead61 100644 --- a/trunk/python_scrapers/OtherFilesToCopy.csv +++ b/trunk/python_scrapers/OtherFilesToCopy.csv @@ -19,4 +19,5 @@ "WestDorset.cgi", "493" "Christchurch.cgi", "493" "WAM.py", "420" -"Planet.py", "420" \ No newline at end of file +"Planet.py", "420" +"Ocella.py", "420" diff --git a/trunk/python_scrapers/SitesToGenerate.csv b/trunk/python_scrapers/SitesToGenerate.csv index 6a802c0..808ea09 100644 --- a/trunk/python_scrapers/SitesToGenerate.csv +++ b/trunk/python_scrapers/SitesToGenerate.csv @@ -208,3 +208,11 @@ "Tewkesbury Borough Council", "Tewkesbury", "http://planning.tewkesbury.gov.uk/Planet/ispforms.asp?serviceKey=07WCC04163103430", "Planet", "PlanetParser" "Worcester City Council", "Worcester", "http://www.worcester.gov.uk:8080/planet/ispforms.asp?serviceKey=SysDoc-PlanetApplicationEnquiry", "Planet", "PlanetParser" "London Borough of Barnet", "Barnet", "http://194.75.183.100/planning-cases/acolnetcgi.exe?ACTION=UNWRAP&RIPNAME=Root.pgesearch", "AcolnetParser", "AcolnetParser" +"Arun District Council", "Arun", "http://www.arun.gov.uk/iplanning/portal/page?_pageid=33,4139&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Breckland Council", "Breckland", "http://wplan01.intranet.breckland.gov.uk:7778/portal/page?_pageid=33,30988&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Ellesmere Port and Neston Borough Council", "Ellesmere Port", "http://ocella.epnbc.gov.uk/portal/page?_pageid=33,38205&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Fareham Borough Council", "Fareham", "http://eocella.fareham.gov.uk/portal/page?_pageid=33,31754&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"London Borough of Hillingdon", "Hillingdon", "http://w09.hillingdon.gov.uk/portal/page?_pageid=33,82093&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"North East Lincolnshire Council", "North East Lincolnshire", "http://planning.nelincs.gov.uk/portal/page?_pageid=33,68034&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Middlesbrough Borough Council", "Middlesbrough", "http://planserv.middlesbrough.gov.uk/portal/page?_pageid=33,4166&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser" +"Uttlesford District Council", "Uttlesford", "http://planning.uttlesford.gov.uk/portal/page?_pageid=33,35447&_dad=portal&_schema=PORTAL", "Ocella", "OcellaParser"