Automatically exported from code.google.com/p/planningalerts
 
 
 
 
 
 

158 linhas
7.2 KiB

  1. <?php
  2. //
  3. // Scraper for Wealden District Council Planning Website
  4. // Created by Matt Ford on Sunday 24th August 2008
  5. //
  6. // The script works according to requirements of PlanningAlerts.com
  7. // The default output is according to PlanningAlerts requirements,
  8. // to get all the data add 'all=true' to the end of the query string
  9. //
  // The cookie jar is a scratch file through which successive cURL requests
  // share the council site's session cookies. The system temporary directory
  // is requested explicitly: relying on tempnam() falling back from a
  // non-existent directory raises a notice on PHP 7.1+, and that notice would
  // be emitted ahead of the XML document.
  $cookiejar = tempnam(sys_get_temp_dir(), 'wealden');
  if ($cookiejar !== false) {
      // tempnam() creates the file, so remove it again when the request ends.
      register_shutdown_function('unlink', $cookiejar);
  }

  // Reads $_GET[$name] and returns it unchanged when it is a string of one to
  // four digits whose numeric value lies within [$min, $max]; returns $default
  // in every other case. Insisting on digits only keeps stray characters such
  // as '&' or '=' out of the URLs that are later built from these values.
  function bounded_get_param($name, $min, $max, $default) {
      if (!isset($_GET[$name]) || !is_string($_GET[$name])) {
          return $default;
      }
      $raw = $_GET[$name];
      if (!preg_match('/^[0-9]{1,4}$/D', $raw)) {
          return $default;
      }
      $number = (int) $raw;
      return ($number >= $min && $number <= $max) ? $raw : $default;
  }

  //Check a day is set and is valid
  $day = bounded_get_param('day', 1, 31, 1);
  //Check a month is set and is valid
  $month = bounded_get_param('month', 1, 12, 1);
  //Check a year is set and is valid
  $year = bounded_get_param('year', 2004, (int) gmdate('Y'), gmdate('Y'));
  //Do you want all information or only the common stuff?
  // Any non-empty value other than '0' switches the extra fields on.
  $all = !empty($_GET['all']);
  $date = $day.'/'.$month.'/'.$year;
  $applications = array();
  // Fetches $url with cURL and returns the response body.
  //
  // $url         - address to request; it is also sent as the Referer header.
  // $post_string - URL-encoded request body, only sent when $post_count > 0.
  // $post_count  - any value above zero turns the request into a POST.
  //
  // Returns the body as a string, or false when curl_exec() fails.
  function fetch_page($url,$post_string = false,$post_count = false) {
      // $cookiejar is assigned at file scope, so it has to be imported here.
      // Without this it is undefined inside the function and no cookies are
      // carried from one request to the next.
      global $cookiejar;
      $jar = is_string($cookiejar) ? $cookiejar : '';

      // A new handle is created on every call; cookies travel between calls
      // through the jar file rather than through a shared handle.
      $ch = curl_init();
      curl_setopt($ch, CURLOPT_URL, $url);
      curl_setopt($ch, CURLOPT_HEADER, 0);
      curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
      curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
      curl_setopt($ch, CURLOPT_REFERER, $url);
      // An empty COOKIEFILE still switches cookie handling on for redirects
      // followed within this one request.
      curl_setopt($ch, CURLOPT_COOKIEFILE, $jar);
      if ($jar !== '') {
          curl_setopt($ch, CURLOPT_COOKIEJAR, $jar);
      }
      if ($post_count > 0) {
          // CURLOPT_POST is an on/off switch, not a field count.
          curl_setopt($ch, CURLOPT_POST, true);
          curl_setopt($ch, CURLOPT_POSTFIELDS, $post_string);
      }
      return curl_exec($ch);
  }
  // Returns the plain-text value that follows the first closing h5 tag in an
  // HTML fragment of the form "<h5>label</h5>value".
  //
  // $string - HTML fragment; anything that is not a string is treated as text.
  //
  // Returns the text between the first and second closing h5 tag (or the end
  // of the fragment), with tags stripped and surrounding whitespace trimmed.
  // Returns '' when the fragment contains no closing h5 tag at all, instead of
  // reading an offset that explode() never produced.
  function extract_data($string) {
      $parts = explode('</h5>', (string) $string);
      if (!isset($parts[1])) {
          return '';
      }
      return trim(strip_tags($parts[1]));
  }
  // Scrapes one page of search results for applications received on $date and
  // follows the "Next" link, requesting at most five pages in total.
  //
  // $date    - date string used as both the StartDate and EndDate parameter.
  // $page_no - results page to request (the pageno parameter).
  // $AppRef  - value sent as the AppRef parameter.
  //
  // Nothing is returned. Each application found is stored in the global
  // $applications array under its application number, and parse_detail() is
  // called to add the fields from the detail page.
  function parse_search($date,$page_no=1,$AppRef='') {
      global $applications;

      // The search page and the copyright page take exactly the same query.
      $query = '?pageno='.$page_no.'&QueryType=9&WeekNo=&WeekStart=&WeekEnd=&CaseNo=&Add=&ShowInd=&DocId=&AppRef='.$AppRef.'&Category=DC&DateType=R&StartDate='.$date.'&EndDate='.$date.'&Agent=&ParishCode=&WardCode=&Parish=&Ward=&AdvAppNo=&AdvAdd=&AdvProposal=&DecisionCode=&Det=';

      $html = fetch_page('http://www.planning.wealden.gov.uk/aspxpages/SearchResults.aspx'.$query);
      if (!is_string($html)) {
          // The request failed, so there is nothing to parse.
          return;
      }

      if (strpos($html, "<title>Wealden District Council's applications online - Copyright, disclaimer & personal data</title>") !== false) {
          //Accept their terms
          $pieces = explode('<input type="hidden" name="__VIEWSTATE" value="', $html, 2);
          $viewstate = isset($pieces[1]) ? $pieces[1] : '';
          $pieces = explode('" />', $viewstate, 2);
          $viewstate = $pieces[0];

          $html = fetch_page(
              'http://www.planning.wealden.gov.uk/aspxpages/Copyright.aspx'.$query,
              'btnCopyrightAccept=Accept&__VIEWSTATE='.urlencode($viewstate).'',
              2
          );
          if (!is_string($html)) {
              return;
          }
      }

      // Keep only the markup between the results marker and the pager.
      $pieces = explode('<span id="lblSearchResults">', $html);
      if (!isset($pieces[1])) {
          // No results container on the page: nothing to record or follow.
          return;
      }
      $pieces = explode('<div id="pagenumbers">', $pieces[1]);
      $listing = $pieces[0];
      $next_page = isset($pieces[1]) ? $pieces[1] : '';

      $chunks = explode('</ul>', $listing);
      // Presumably a page holds ten results and the eleventh chunk is trailing
      // markup - TODO confirm against the live page.
      unset($chunks[10]);

      $AppNo = '';
      foreach ($chunks as $chunk) {
          $items = explode('</li>', $chunk);
          $AppNo = extract_data($items[0]);
          if (empty($AppNo)) {
              continue;
          }

          $Loc = extract_data(isset($items[1]) ? $items[1] : '');
          $applications[$AppNo]['AppNo'] = $AppNo;
          $applications[$AppNo]['Address'] = $Loc;
          // Take the first thing in the address that looks like a UK postcode.
          if (preg_match("/([A-Z]{1,2}[0-9][0-9A-Z]?\s?[0-9][A-Z]{2})/", $Loc, $PostCode)) {
              $applications[$AppNo]['PostCode'] = $PostCode[1];
          } else {
              $applications[$AppNo]['PostCode'] = false;
          }
          $applications[$AppNo]['Info'] = extract_data(isset($items[2]) ? $items[2] : '');
          parse_detail($AppNo);
      }

      if (strpos($next_page, 'Next</a></div></span> <br />') !== false) {
          $page_no++;
          if ($page_no < 6) {
              // NOTE(review): the value read from the last chunk of this page
              // is passed on as AppRef for the next page - confirm that this
              // is what the site expects.
              parse_search($date, $page_no, $AppNo);
          }
      }
  }
  // Fetches the detail page for one application and copies its fields into
  // the global $applications array under the key $AppNo.
  //
  // $AppNo - application number, sent as the appref query parameter.
  //
  // Nothing is returned. Every field listed below is always set; a field whose
  // row is missing from the page (or all of them, when the request fails) is
  // stored as an empty string.
  function parse_detail($AppNo) {
      global $applications;

      $url = 'http://www.planning.wealden.gov.uk/aspxpages/ResultsDetail.aspx?appref='.$AppNo.'&Category=DC';
      $html = fetch_page($url);

      // Cut the page down to the part between the details marker and the links box.
      $pieces = explode('<span id="lblSearchDetails">', is_string($html) ? $html : '', 2);
      $details = isset($pieces[1]) ? $pieces[1] : '';
      $pieces = explode('<div class="linkborder">', $details, 2);
      $rows = explode('</li>', $pieces[0]);

      // Field name => position of its row among the list items of the page.
      $layout = array(
          'AppType' => 1,
          'DateRec' => 2,
          'DateExp' => 3,
          'Parish' => 6,
          'GridRef' => 7,
          'UPRN' => 8,
          'Status' => 9,
          'DateConExp' => 10,
          'DateComDel' => 11,
          'Decision' => 12,
          'DateDec' => 13,
          'CaseOfficer' => 14,
      );

      foreach ($layout as $field => $position) {
          $value = isset($rows[$position]) ? extract_data($rows[$position]) : '';
          if ($field === 'Status') {
              // Only the text in front of the first ' - ' is kept.
              $pieces = explode(' - ', $value);
              $value = trim($pieces[0]);
          }
          $applications[$AppNo][$field] = $value;
      }
  }
  // Run the scrape for the requested date, then write every collected
  // application out as the PlanningAlerts XML feed.
  parse_search($date);

  header("Content-Type: text/xml");
  echo "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
  echo "<planning>\n";
  echo "\t<authority_name>Wealden District Council</authority_name>\n";
  echo "\t<authority_short_name>Wealden</authority_short_name>\n";
  echo "\t<applications>\n";

  // Element name => field name for the elements that are only written when
  // the 'all' option is on.
  $extra_elements = array(
      'application_type' => 'AppType',
      'date_expires' => 'DateExp',
      'parish' => 'Parish',
      'grid_reference' => 'GridRef',
      'uprn' => 'UPRN',
      'status' => 'Status',
      'consultation_expiry_date' => 'DateConExp',
      'committee_delegated_date' => 'DateComDel',
      'decision' => 'Decision',
      'decision_date' => 'DateDec',
      'case_officer' => 'CaseOfficer',
  );

  foreach ($applications as $application) {
      // The values are text lifted from HTML pages, so they can hold HTML
      // entities or a bare '&'. Decode the entities first and then escape the
      // characters XML reserves, otherwise one such value makes the whole
      // document malformed.
      $xml = array();
      foreach ($application as $field => $value) {
          $text = html_entity_decode((string) $value, ENT_QUOTES, 'UTF-8');
          $xml[$field] = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
      }

      echo "\t\t<application>\n";
      echo "\t\t\t<council_reference>".$xml['AppNo']."</council_reference>\n";
      echo "\t\t\t<address>".$xml['Address']."</address>\n";
      echo "\t\t\t<postcode>".$xml['PostCode']."</postcode>\n";
      echo "\t\t\t<description>".$xml['Info']."</description>\n";
      // The URL sits inside a CDATA section, so it takes the unescaped value.
      echo "\t\t\t<info_url><![CDATA[http://www.planning.wealden.gov.uk/aspxpages/ResultsDetail.aspx?appref=".$application['AppNo']."&Category=DC]]></info_url>\n";
      echo "\t\t\t<comment_url>planning@wealden.gov.uk</comment_url>\n";
      echo "\t\t\t<date_received>".$xml['DateRec']."</date_received>\n";
      if ($all) {
          foreach ($extra_elements as $element => $field) {
              echo "\t\t\t<".$element.">".$xml[$field]."</".$element.">\n";
          }
      }
      echo "\t\t</application>\n";
  }

  echo "\t</applications>\n";
  echo "</planning>";
  142. ?>