From 2f8a9842cfe274087d5957afe62daea86b680432 Mon Sep 17 00:00:00 2001 From: "duncan.parkes" Date: Thu, 28 Aug 2008 18:28:23 +0000 Subject: [PATCH] Add Matt Ford's scraper for wealden. --- trunk/docs/scrapers/wealden.php | 153 ++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 trunk/docs/scrapers/wealden.php diff --git a/trunk/docs/scrapers/wealden.php b/trunk/docs/scrapers/wealden.php new file mode 100644 index 0000000..0d8bdf2 --- /dev/null +++ b/trunk/docs/scrapers/wealden.php @@ -0,0 +1,153 @@ + 0 && $_GET['day'] < 32) ? $_GET['day'] : 1; + +//Check a month is set and is valid +$month = (isset($_GET['month']) && !empty($_GET['month']) && $_GET['month'] > 0 && $_GET['month'] < 13) ? $_GET['month'] : 1; + +//Check a year is set and is valid +$year = (isset($_GET['year']) && !empty($_GET['year']) && $_GET['year'] > 2003 && $_GET['year'] <= gmdate('Y')) ? $_GET['year'] : gmdate('Y'); + +//Do you want all information or only the common stuff? +$all = (isset($_GET['all']) && $_GET['all'] != false) ? true : false; + +$date = $day.'/'.$month.'/'.$year; + +$applications = array(); + +function fetch_page($url,$post_string = false,$post_count = false) { + if(!isset($ch)) { + $ch = curl_init(); + } + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_REFERER, $url); + curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiejar); + curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiejar); + if($post_count > 0) { + curl_setopt($ch, CURLOPT_POST, $post_count); + curl_setopt($ch, CURLOPT_POSTFIELDS,$post_string); + } + $data = curl_exec($ch); + return $data; +} + +function extract_data($string) { + list($junk,$return) = explode('',$string); + return trim(strip_tags($return)); +} + +function parse_search($date,$page_no=1,$AppRef='') { + global $applications; + + $url = 'http://www.planning.wealden.gov.uk/aspxpages/SearchResults.aspx?pageno='.$page_no.'&QueryType=9&WeekNo=&WeekStart=&WeekEnd=&CaseNo=&Add=&ShowInd=&DocId=&AppRef='.$AppRef.'&Category=DC&DateType=R&StartDate='.$date.'&EndDate='.$date.'&Agent=&ParishCode=&WardCode=&Parish=&Ward=&AdvAppNo=&AdvAdd=&AdvProposal=&DecisionCode=&Det='; + //echo 'Loading page '.$page_no.' of data for '.$date.' URL:'.$url.'
'; + + $data = fetch_page($url); + if(strpos($data,"Wealden District Council's applications online - Copyright, disclaimer & personal data")) { + //Accept their terms + list($junk,$viewstate) = explode('',$viewstate,2); + //echo 'Attempting to bypass copyright page...
'; + $url = 'http://www.planning.wealden.gov.uk/aspxpages/Copyright.aspx?pageno='.$page_no.'&QueryType=9&WeekNo=&WeekStart=&WeekEnd=&CaseNo=&Add=&ShowInd=&DocId=&AppRef='.$AppRef.'&Category=DC&DateType=R&StartDate='.$date.'&EndDate='.$date.'&Agent=&ParishCode=&WardCode=&Parish=&Ward=&AdvAppNo=&AdvAdd=&AdvProposal=&DecisionCode=&Det='; + $data = fetch_page($url,'btnCopyrightAccept=Accept&__VIEWSTATE='.urlencode($viewstate).'',2); + } + list($junk,$data) = explode('', $data); + list($data,$next_page) = explode('
',$data); + $data = explode('',$data); + unset($data[10]); + foreach($data as $application) { + $application = explode('',$application); + $AppNo = extract_data($application[0]); + if(!empty($AppNo)) { + $applications[$AppNo]['AppNo'] = $AppNo; + $Loc = extract_data($application[1]); + $applications[$AppNo]['Address'] = $Loc; + preg_match("/([A-Z]{1,2}[0-9][0-9A-Z]?\s?[0-9][A-Z]{2})/",$Loc,$PostCode); + if(isset($PostCode[1])) { + $applications[$AppNo]['PostCode'] = $PostCode[1]; + } else { + $applications[$AppNo]['PostCode'] = false; + } + $applications[$AppNo]['Info'] = extract_data($application[2]); + parse_detail($AppNo); + } + } + if(strpos($next_page,'Next

')) { + $page_no++; + //echo "Loading next page..."; + if($page_no < 6) { + parse_search($date,$page_no,$AppNo); + } + } +} + +function parse_detail($AppNo) { + global $applications; + $url = 'http://www.planning.wealden.gov.uk/aspxpages/ResultsDetail.aspx?appref='.$AppNo.'&Category=DC'; + list($junk,$data) = explode('',fetch_page($url),2); + list($data,$junk) = explode('
',$data,2); + $data = explode('',$data); + $applications[$AppNo]['AppType'] = extract_data($data[1]); + $applications[$AppNo]['DateRec'] = extract_data($data[2]); + $applications[$AppNo]['DateExp'] = extract_data($data[3]); + $applications[$AppNo]['Parish'] = extract_data($data[6]); + $applications[$AppNo]['GridRef'] = extract_data($data[7]); + $applications[$AppNo]['UPRN'] = extract_data($data[8]); + list($status,$junk) = explode(' - ',extract_data($data[9])); + $applications[$AppNo]['Status'] = trim($status); + $applications[$AppNo]['DateConExp'] = extract_data($data[10]); + $applications[$AppNo]['DateComDel'] = extract_data($data[11]); + $applications[$AppNo]['Decision'] = extract_data($data[12]); + $applications[$AppNo]['DateDec'] = extract_data($data[13]); + $applications[$AppNo]['CaseOfficer'] = extract_data($data[14]); +} +parse_search($date); + +header("Content-Type: text/xml"); +echo "\n"; +echo "\n"; +echo "\tWealden\n"; +echo "\tWealden\n"; +echo "\t\n"; +foreach($applications as $application) { + echo "\t\t\n"; + echo "\t\t\t".$application['AppNo']."\n"; + echo "\t\t\t
".$application['Address']."
\n"; + echo "\t\t\t".$application['PostCode']."\n"; + echo "\t\t\t".$application['Info']."\n"; + echo "\t\t\t\n"; + echo "\t\t\tplanning@wealden.gov.uk\n"; + echo "\t\t\t".$application['DateRec']."\n"; + if($all) { + echo "\t\t\t".$application['AppType']."\n"; + echo "\t\t\t".$application['DateExp']."\n"; + echo "\t\t\t".$application['Parish']."\n"; + echo "\t\t\t".$application['GridRef']."\n"; + echo "\t\t\t".$application['UPRN']."\n"; + echo "\t\t\t".$application['Status']."\n"; + echo "\t\t\t".$application['DateConExp']."\n"; + echo "\t\t\t".$application['DateComDel']."\n"; + echo "\t\t\t".$application['Decision']."\n"; + echo "\t\t\t".$application['DateDec']."\n"; + echo "\t\t\t".$application['CaseOfficer']."\n"; + } + echo "\t\t
\n"; +} +echo "\t
\n"; +echo "
"; +?> +