From 47654fd2b3850737b726b8645b654ee160ba3dc5 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Sun, 16 Sep 2007 14:56:48 +0000 Subject: [PATCH] fix regex in php publicaccess scraper --- cgi-bin/AcolnetParser.py | 17 +++++++++++++---- docs/include/scraper_support.php | 15 ++++++++++++--- docs/scrapers/stafford.php | 2 +- python_scrapers/AcolnetParser.py | 17 +++++++++++++---- python_scrapers/SitesToGenerate.csv | 4 +++- 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cgi-bin/AcolnetParser.py b/cgi-bin/AcolnetParser.py index a700188..40dcba1 100644 --- a/cgi-bin/AcolnetParser.py +++ b/cgi-bin/AcolnetParser.py @@ -433,6 +433,14 @@ class SuffolkCoastalParser(AcolnetParser): comments_email_address = "d.c.admin@suffolkcoastal.gov.uk" +class GuildfordParser(AcolnetParser): + case_number_tr = 1 + reg_date_tr = 7 + location_tr = 2 + proposal_tr = 3 + + #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch + class SurreyHeathParser(AcolnetParser): # This is not working yet. # _getSearchResponse is an attempt to work around @@ -476,9 +484,9 @@ class SurreyHeathParser(AcolnetParser): if __name__ == '__main__': - day = 31 - month = 8 - year = 2007 + day = 22 + month = 2 + year = 2005 # returns error 400 - bad request #parser = BridgenorthParser() @@ -489,7 +497,8 @@ if __name__ == '__main__': # canterbury # results as columns of one table - parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch") print parser.getResults(day, month, year) diff --git a/docs/include/scraper_support.php b/docs/include/scraper_support.php index f98ea34..45d229e 100644 --- a/docs/include/scraper_support.php +++ b/docs/include/scraper_support.php @@ -10,7 +10,8 @@ require_once('phpcoord.php'); function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){ $applications = array(); - $application_pattern = "/([0-9]*)<\/th>([^;]*)([^<]*)/"; + //$application_pattern = "/([0-9]*)<\/th>([^;]*)([^<]*)/"; + $application_pattern = "/([0-9]*)<\/th>.*(?=<\/tr)/U"; //grab the page $html = safe_scrape_page($search_url); @@ -21,6 +22,10 @@ function scrape_applications_publicaccess ($search_url, $info_url_base, $comment preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER); foreach ($application_matches[0] as $application_match){ + //START Duncan's debug + //print_r($application_match); + //print_r("END"); + // END Duncan's debug $detail_pattern = "/([^<])*/"; preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER); @@ -36,12 +41,16 @@ function scrape_applications_publicaccess ($search_url, $info_url_base, $comment //match case number $casenumber_pattern = "/caseno=([^&]*)/"; preg_match($casenumber_pattern, $application_match, $casenumber_matches); - + //START Duncan's debug + //print_r($application_match); + //var_dump($casenumber_matches); + //END Duncan's debug + $case_number =""; if(sizeof($casenumber_matches)>0){ $case_number = str_replace("caseno=","", $casenumber_matches[0]); } - + //if weve found a caase number, then get the details if($case_number !=""){ //Comment and info urls diff --git a/docs/scrapers/stafford.php b/docs/scrapers/stafford.php index 8ed896c..46ad0b7 100644 --- a/docs/scrapers/stafford.php +++ b/docs/scrapers/stafford.php @@ -47,4 +47,4 @@ if (isset($_GET['year'])){ $smarty->display("xml.tpl"); -?> \ No newline at end of file +?> diff --git a/python_scrapers/AcolnetParser.py b/python_scrapers/AcolnetParser.py index a700188..40dcba1 100644 --- a/python_scrapers/AcolnetParser.py +++ b/python_scrapers/AcolnetParser.py @@ -433,6 +433,14 @@ class SuffolkCoastalParser(AcolnetParser): comments_email_address = "d.c.admin@suffolkcoastal.gov.uk" +class GuildfordParser(AcolnetParser): + case_number_tr = 1 + reg_date_tr = 7 + location_tr = 2 + proposal_tr = 3 + + #http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch + class SurreyHeathParser(AcolnetParser): # This is not working yet. # _getSearchResponse is an attempt to work around @@ -476,9 +484,9 @@ class SurreyHeathParser(AcolnetParser): if __name__ == '__main__': - day = 31 - month = 8 - year = 2007 + day = 22 + month = 2 + year = 2005 # returns error 400 - bad request #parser = BridgenorthParser() @@ -489,7 +497,8 @@ if __name__ == '__main__': # canterbury # results as columns of one table - parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + #parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch") + parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch") print parser.getResults(day, month, year) diff --git a/python_scrapers/SitesToGenerate.csv b/python_scrapers/SitesToGenerate.csv index f8eef74..59250d3 100644 --- a/python_scrapers/SitesToGenerate.csv +++ b/python_scrapers/SitesToGenerate.csv @@ -131,4 +131,6 @@ "Bracknell Forest Borough Council", "Bracknell Forest", "https://my.bracknell-forest.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" "Hinkley and Bosworth Borough Council", "Hinkley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" -"Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" \ No newline at end of file +"Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" +"Argyl And Bute Council", "Argyl and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser" +"Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser" \ No newline at end of file