Browse Source

fix regex in php publicaccess scraper

import/raw
duncan.parkes 17 years ago
parent
commit
698eabb414
5 changed files with 42 additions and 13 deletions
  1. +13
    -4
      trunk/cgi-bin/AcolnetParser.py
  2. +12
    -3
      trunk/docs/include/scraper_support.php
  3. +1
    -1
      trunk/docs/scrapers/stafford.php
  4. +13
    -4
      trunk/python_scrapers/AcolnetParser.py
  5. +3
    -1
      trunk/python_scrapers/SitesToGenerate.csv

+ 13
- 4
trunk/cgi-bin/AcolnetParser.py View File

@@ -433,6 +433,14 @@ class SuffolkCoastalParser(AcolnetParser):

comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"

class GuildfordParser(AcolnetParser):
case_number_tr = 1
reg_date_tr = 7
location_tr = 2
proposal_tr = 3
#http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch

class SurreyHeathParser(AcolnetParser):
# This is not working yet.
# _getSearchResponse is an attempt to work around
@@ -476,9 +484,9 @@ class SurreyHeathParser(AcolnetParser):
if __name__ == '__main__':
day = 31
month = 8
year = 2007
day = 22
month = 2
year = 2005

# returns error 400 - bad request
#parser = BridgenorthParser()
@@ -489,7 +497,8 @@ if __name__ == '__main__':
# canterbury
# results as columns of one table

parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch")
print parser.getResults(day, month, year)

+ 12
- 3
trunk/docs/include/scraper_support.php View File

@@ -10,7 +10,8 @@ require_once('phpcoord.php');
function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){

$applications = array();
$application_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
//$application_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
$application_pattern = "/<tr><th>([0-9]*)<\/th>.*(?=<\/tr)/U";

//grab the page
$html = safe_scrape_page($search_url);
@@ -21,6 +22,10 @@ function scrape_applications_publicaccess ($search_url, $info_url_base, $comment
preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);

foreach ($application_matches[0] as $application_match){
//START Duncan's debug
//print_r($application_match);
//print_r("END");
// END Duncan's debug

$detail_pattern = "/<td>([^<])*/";
preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
@@ -36,12 +41,16 @@ function scrape_applications_publicaccess ($search_url, $info_url_base, $comment
//match case number
$casenumber_pattern = "/caseno=([^&]*)/";
preg_match($casenumber_pattern, $application_match, $casenumber_matches);
//START Duncan's debug
//print_r($application_match);
//var_dump($casenumber_matches);
//END Duncan's debug

$case_number ="";
if(sizeof($casenumber_matches)>0){
$case_number = str_replace("caseno=","", $casenumber_matches[0]);
}
//if we've found a case number, then get the details
if($case_number !=""){
//Comment and info urls


+ 1
- 1
trunk/docs/scrapers/stafford.php View File

@@ -47,4 +47,4 @@ if (isset($_GET['year'])){
$smarty->display("xml.tpl");

?>
?>

+ 13
- 4
trunk/python_scrapers/AcolnetParser.py View File

@@ -433,6 +433,14 @@ class SuffolkCoastalParser(AcolnetParser):

comments_email_address = "d.c.admin@suffolkcoastal.gov.uk"

class GuildfordParser(AcolnetParser):
case_number_tr = 1
reg_date_tr = 7
location_tr = 2
proposal_tr = 3
#http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch

class SurreyHeathParser(AcolnetParser):
# This is not working yet.
# _getSearchResponse is an attempt to work around
@@ -476,9 +484,9 @@ class SurreyHeathParser(AcolnetParser):
if __name__ == '__main__':
day = 31
month = 8
year = 2007
day = 22
month = 2
year = 2005

# returns error 400 - bad request
#parser = BridgenorthParser()
@@ -489,7 +497,8 @@ if __name__ == '__main__':
# canterbury
# results as columns of one table

parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")
#parser = SurreyHeathParser("Surrey Heath", "Surrey Heath", "https://www.public.surreyheath-online.gov.uk/whalecom60b1ef305f59f921/whalecom0/Scripts/PlanningPagesOnline/acolnetcgi.gov?ACTION=UNWRAP&RIPNAME=Root.pgesearch")

parser = GuildfordParser("Guildford", "Guildford", "http://www.guildford.gov.uk/acolnet/acolnetcgi.gov?ACTION=UNWRAP&Root=PgeSearch")
print parser.getResults(day, month, year)

+ 3
- 1
trunk/python_scrapers/SitesToGenerate.csv View File

@@ -131,4 +131,6 @@
"Bracknell Forest Borough Council", "Bracknell Forest", "https://my.bracknell-forest.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Chiltern District Council", "Chiltern", "https://isa.chiltern.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Hinkley and Bosworth Borough Council", "Hinkley and Bosworth", "https://cx.hinckley-bosworth.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Tendring District Council", "Tendring", "http://195.99.151.54/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"
"Argyl And Bute Council", "Argyl and Bute", "http://www.argyll-bute.gov.uk/PublicAccess/tdc/", "PublicAccess", "PublicAccessParser"
"Oxford City Council", "Oxford", "http://uniformpublicaccess.oxford.gov.uk/publicaccess/tdc/", "PublicAccess", "PublicAccessParser"

Loading…
Cancel
Save