Преглед на файлове

Add Matt Ford's scraper for wealden.

master
duncan.parkes преди 16 години
родител
ревизия
d132672a37
променени са 1 файла, в които са добавени 153 реда и са изтрити 0 реда
  1. +153
    -0
      docs/scrapers/wealden.php

+ 153
- 0
docs/scrapers/wealden.php Целия файл

@@ -0,0 +1,153 @@
<?php
//
// Scraper for Wealden District Council Planning Website
// Created by Matt Ford on Sunday 24th August 2008
//
// The script works according to requirements of PlanningAlerts.com
// The default output is according to PlanningAlerts requirements,
// to get all the data add 'all=true' to the end of the query string
//
// You need to set the location of the 'cookie jar' for the scraper to work
$cookiejar = '/tmp/wealden_cookies.txt';

/**
 * Read a whole-number query-string parameter, falling back to a default.
 *
 * Only plain decimal digit strings are accepted, so values such as "5abc",
 * "1.5" or an array never reach the URL that is built from the result.
 *
 * @param string $name    Key to look up in $_GET
 * @param int    $min     Smallest accepted value (inclusive)
 * @param int    $max     Largest accepted value (inclusive)
 * @param int    $default Value returned when the parameter is missing or invalid
 * @return int
 */
function wealden_int_param($name, $min, $max, $default) {
    if (!isset($_GET[$name]) || !is_string($_GET[$name])) {
        return $default;
    }
    // \z (not $) so a trailing newline does not slip through.
    if (!preg_match('/^[0-9]{1,4}\z/', $_GET[$name])) {
        return $default;
    }
    $value = (int) $_GET[$name];
    if ($value < $min || $value > $max) {
        return $default;
    }
    return $value;
}

// Day of the month to search for (1-31); defaults to 1
$day = wealden_int_param('day', 1, 31, 1);

// Month to search for (1-12); defaults to 1
$month = wealden_int_param('month', 1, 12, 1);

// Year to search for: later than 2003 and not in the future; defaults to the current year
$current_year = (int) gmdate('Y');
$year = wealden_int_param('year', 2004, $current_year, $current_year);

// Do you want all information or only the common stuff?
// Any non-empty value other than "0" switches the extra fields on.
$all = !empty($_GET['all']);

// The search form takes dates as day/month/year
$date = $day.'/'.$month.'/'.$year;

// Scraped applications, keyed by application number; filled in by parse_search()
$applications = array();

/**
 * Fetch a URL with cURL and return the response body.
 *
 * Redirects are followed and the requested URL is also sent as the referer.
 * Cookies are loaded from and saved to the file named by the file-scope
 * $cookiejar variable, so that a session cookie set by one request is
 * available to the next one.
 *
 * @param string       $url         Address to request
 * @param string|false $post_string URL-encoded request body; only sent when $post_count > 0
 * @param int|false    $post_count  Number of POST fields; any value above 0 makes this a POST
 * @return string|false Response body, or false when the transfer fails
 */
function fetch_page($url,$post_string = false,$post_count = false) {
    // $cookiejar lives at file scope; without this import it is undefined
    // here and no cookie file is ever read or written.
    global $cookiejar;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_REFERER, $url);
    if (!empty($cookiejar)) {
        curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiejar);
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiejar);
    } else {
        // No jar configured: an empty file name still turns on in-memory
        // cookie handling for the redirects within this one request.
        curl_setopt($ch, CURLOPT_COOKIEFILE, '');
    }
    if ($post_count > 0) {
        // CURLOPT_POST is an on/off switch, not a field count.
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post_string);
    }
    return curl_exec($ch);
}

/**
 * Pull the plain-text value that follows an <h5> label in an HTML fragment.
 *
 * The fragment is split on every closing </h5> tag and the second piece
 * (the text after the first label) is returned with tags stripped and
 * surrounding whitespace trimmed.
 *
 * @param string $string HTML fragment, e.g. "<li><h5>Parish</h5>Hailsham"
 * @return string The value, or an empty string when there is no </h5> in the fragment
 */
function extract_data($string) {
    $pieces = explode('</h5>', (string) $string);
    if (!isset($pieces[1])) {
        // No label found: return an empty value instead of reading a missing offset.
        return '';
    }
    return trim(strip_tags($pieces[1]));
}

/**
 * Scrape one page of search results for a date and store each application.
 *
 * Every application found is added to the global $applications array (keyed
 * by application number) with its AppNo, Address, PostCode and Info, and
 * parse_detail() is called to add the remaining fields. When the page
 * advertises a "Next" link the function calls itself for the following page,
 * stopping after page 5.
 *
 * @param string $date    Date to search, used as both start and end date (day/month/year)
 * @param int    $page_no Result page to request
 * @param string $AppRef  Value sent as the AppRef query parameter
 * @return void
 */
function parse_search($date,$page_no=1,$AppRef='') {
    global $applications;

    $base = 'http://www.planning.wealden.gov.uk/aspxpages/';
    // The results page and the terms page take exactly the same query string.
    $query = '?pageno='.$page_no.'&QueryType=9&WeekNo=&WeekStart=&WeekEnd=&CaseNo=&Add=&ShowInd=&DocId=&AppRef='.$AppRef.'&Category=DC&DateType=R&StartDate='.$date.'&EndDate='.$date.'&Agent=&ParishCode=&WardCode=&Parish=&Ward=&AdvAppNo=&AdvAdd=&AdvProposal=&DecisionCode=&Det=';

    $data = fetch_page($base.'SearchResults.aspx'.$query);
    if (!is_string($data)) {
        // Transfer failed; there is nothing to parse.
        return;
    }

    if (strpos($data, "<title>Wealden District Council's applications online - Copyright, disclaimer & personal data</title>") !== false) {
        // We were shown the terms page instead of results: accept the terms
        // by posting the form back together with its __VIEWSTATE value.
        $viewstate = '';
        $pieces = explode('<input type="hidden" name="__VIEWSTATE" value="', $data, 2);
        if (isset($pieces[1])) {
            $pieces = explode('" />', $pieces[1], 2);
            $viewstate = $pieces[0];
        }
        $data = fetch_page($base.'Copyright.aspx'.$query, 'btnCopyrightAccept=Accept&__VIEWSTATE='.urlencode($viewstate), 2);
        if (!is_string($data)) {
            return;
        }
    }

    // Keep only what sits between the results container and the pager.
    $pieces = explode('<span id="lblSearchResults">', $data);
    if (!isset($pieces[1])) {
        // No results container on the page.
        return;
    }
    $pieces = explode('<div id="pagenumbers">', $pieces[1]);
    $next_page = isset($pieces[1]) ? $pieces[1] : '';

    // Each application is rendered as one <ul>.
    $chunks = explode('</ul>', $pieces[0]);
    // Drop the 11th piece.
    // NOTE(review): presumably the trailing markup after ten results per page - confirm.
    unset($chunks[10]);

    $AppNo = '';
    foreach ($chunks as $chunk) {
        $fields = explode('</li>', $chunk);
        $AppNo = extract_data($fields[0]);
        if (empty($AppNo)) {
            continue;
        }
        $Loc = extract_data(isset($fields[1]) ? $fields[1] : '');
        $applications[$AppNo]['AppNo'] = $AppNo;
        $applications[$AppNo]['Address'] = $Loc;
        // Look for something shaped like a UK postcode inside the address.
        preg_match("/([A-Z]{1,2}[0-9][0-9A-Z]?\s?[0-9][A-Z]{2})/", $Loc, $PostCode);
        $applications[$AppNo]['PostCode'] = isset($PostCode[1]) ? $PostCode[1] : false;
        $applications[$AppNo]['Info'] = extract_data(isset($fields[2]) ? $fields[2] : '');
        parse_detail($AppNo);
    }

    if (strpos($next_page, 'Next</a></div></span> <br />') !== false) {
        $page_no++;
        // Never go beyond page 5.
        if ($page_no < 6) {
            // NOTE(review): the next page is requested with AppRef set to the
            // application number from the last piece processed above; kept
            // as-is - confirm this is what the site expects.
            parse_search($date, $page_no, $AppNo);
        }
    }
}

/**
 * Scrape the detail page of one application and store its extra fields.
 *
 * The list items between the details container and the link section are
 * read by position and written to the global $applications array under the
 * given application number. A field whose list item is missing (for example
 * because the page could not be fetched) is stored as an empty string.
 *
 * @param string $AppNo Application number, used both in the URL and as the array key
 * @return void
 */
function parse_detail($AppNo) {
    global $applications;

    $url = 'http://www.planning.wealden.gov.uk/aspxpages/ResultsDetail.aspx?appref='.$AppNo.'&Category=DC';
    $page = fetch_page($url);

    $items = array();
    if (is_string($page)) {
        $pieces = explode('<span id="lblSearchDetails">', $page, 2);
        if (isset($pieces[1])) {
            $pieces = explode('<div class="linkborder">', $pieces[1], 2);
            $items = explode('</li>', $pieces[0]);
        }
    }

    // Field name => position of its list item on the detail page.
    $positions = array(
        'AppType'     => 1,
        'DateRec'     => 2,
        'DateExp'     => 3,
        'Parish'      => 6,
        'GridRef'     => 7,
        'UPRN'        => 8,
        'Status'      => 9,
        'DateConExp'  => 10,
        'DateComDel'  => 11,
        'Decision'    => 12,
        'DateDec'     => 13,
        'CaseOfficer' => 14,
    );

    foreach ($positions as $field => $position) {
        $value = isset($items[$position]) ? extract_data($items[$position]) : '';
        if ($field === 'Status') {
            // Keep only the part of the status before the first " - ".
            $status_parts = explode(' - ', $value, 2);
            $value = trim($status_parts[0]);
        }
        $applications[$AppNo][$field] = $value;
    }
}
/**
 * Prepare scraped text for use as the content of an XML element.
 *
 * The text comes from HTML with the tags stripped, so it can still hold HTML
 * entities (such as &amp; or &nbsp;) as well as bare & and < characters.
 * Entities are decoded first and the XML-special characters are then escaped,
 * which avoids both undefined entities and double escaping.
 *
 * @param mixed $value Scraped value; false and null become an empty string
 * @return string
 */
function wealden_xml_text($value) {
    $text = html_entity_decode((string) $value, ENT_QUOTES, 'UTF-8');
    // '&' must be replaced first so the entities added for < and > stay intact.
    return str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
}

// Scrape every result page for the requested date into $applications
parse_search($date);

header("Content-Type: text/xml");
echo "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
echo "<planning>\n";
echo "\t<authority_name>Wealden</authority_name>\n";
echo "\t<authority_short_name>Wealden</authority_short_name>\n";
echo "\t<applications>\n";

// Extra elements that are only written when 'all' was requested: element name => field
$extra_fields = array(
    'application_type'         => 'AppType',
    'date_expires'             => 'DateExp',
    'parish'                   => 'Parish',
    'grid_reference'           => 'GridRef',
    'uprn'                     => 'UPRN',
    'status'                   => 'Status',
    'consultation_expiry_date' => 'DateConExp',
    'committee_delegated_date' => 'DateComDel',
    'decision'                 => 'Decision',
    'decision_date'            => 'DateDec',
    'case_officer'             => 'CaseOfficer',
);

foreach($applications as $application) {
    echo "\t\t<application>\n";
    echo "\t\t\t<council_reference>".wealden_xml_text($application['AppNo'])."</council_reference>\n";
    echo "\t\t\t<address>".wealden_xml_text($application['Address'])."</address>\n";
    echo "\t\t\t<postcode>".wealden_xml_text($application['PostCode'])."</postcode>\n";
    echo "\t\t\t<description>".wealden_xml_text($application['Info'])."</description>\n";
    echo "\t\t\t<info_url><![CDATA[http://www.planning.wealden.gov.uk/aspxpages/ResultsDetail.aspx?appref=".$application['AppNo']."&Category=DC]]></info_url>\n";
    echo "\t\t\t<comment_url>planning@wealden.gov.uk</comment_url>\n";
    echo "\t\t\t<date_received>".wealden_xml_text($application['DateRec'])."</date_received>\n";
    if($all) {
        foreach ($extra_fields as $element => $field) {
            echo "\t\t\t<".$element.">".wealden_xml_text($application[$field])."</".$element.">\n";
        }
    }
    echo "\t\t</application>\n";
}
echo "\t</applications>\n";
echo "</planning>";
?>


Зареждане…
Отказ
Запис