Automatically exported from code.google.com/p/planningalerts
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

scraper_support.php 14 KiB

18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
18 年前
15 年前
15 年前
18 年前
15 年前
15 年前
18 年前
15 年前
18 年前
18 年前
18 年前
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  /**
   * Scrape a PublicAccess-style search results page into Application objects.
   *
   * Every result row (<tr><th>n</th>...</tr>) is read for its first three <td>
   * cells: council reference, date received and address. The "caseno="
   * parameter found in the row is appended to the two URL bases to build the
   * info and comment URLs, and the info page is fetched to extract the
   * description. Only applications whose address yields a valid postcode are
   * returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page.
   * @param string $comment_url_base Prefix the case number is appended to for the comment page.
   * @return array List of Application objects (may be empty).
   */
  function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
      $applications = array();

      $application_pattern = "/<tr><th>([0-9]*)<\/th>.*(?=<\/tr)/U";
      $detail_pattern = "/<td>([^<])*/";
      $casenumber_pattern = "/caseno=([^&]*)/";
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
      $full_detail_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';

      //grab the page and strip line breaks so each row sits on one line
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n","", $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);

      foreach ($application_matches[0] as $application_match){
          preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
          $cells = $detail_matches[0];

          //a row without reference, date and address cells cannot produce a usable application
          if (count($cells) < 3){
              error_log("Unable to find application details for a row at " . $search_url);
              continue;
          }

          $application = new Application();

          //match the basic details
          $application->council_reference = str_replace("<td>", "", $cells[0]);
          $application->date_received = str_replace("<td>", "", $cells[1]);
          $application->address = str_replace("<td>", "", $cells[2]);

          //match case number
          $case_number = "";
          if (preg_match($casenumber_pattern, $application_match, $casenumber_matches)){
              $case_number = str_replace("caseno=","", $casenumber_matches[0]);
          }

          //without a case number there is no detail page to fetch
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          //Get the postcode
          if (preg_match($postcode_pattern, $application->address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //get full details
          $details_html = safe_scrape_page($info_url_base . $case_number);
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              //group 1 is the text that follows the closing ">" of the matched tag
              $application->description = $full_detail_matches[1];
          }

          //only add it if we have a postcode (bit useless otherwise)
          if (isset($application->postcode) && is_postcode($application->postcode)){
              array_push($applications, $application);
          }
      }

      return $applications;
  }
  /**
   * Scrape a WAM-style search results table into Application objects.
   *
   * Each matched table row supplies date received, reference (also used as
   * the case number), address and status. Rows whose status is not "Current"
   * are skipped. The detail page for each remaining row is fetched to extract
   * the "Development:" description. Only applications whose address yields a
   * valid postcode are returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page.
   * @param string $comment_url_base Prefix the case number is appended to for the comment page.
   * @param int    $detail_mode      Which detail-page layout to expect: 1 or 2. Any
   *                                 other value leaves the description unset.
   * @return array List of Application objects (may be empty).
   */
  function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
      $applications = array();

      $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //Sites vary a tiny bit in their html, so the description pattern depends on the mode.
      //Chosen once up front; an unknown mode leaves it null instead of an undefined variable.
      $full_detail_pattern = null;
      if ($detail_mode == 1){
          $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
      } elseif ($detail_mode == 2){
          $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
      } else {
          error_log("Unknown detail mode '" . $detail_mode . "' when scraping " . $search_url);
      }

      //grab the page and strip line breaks so each row sits on one line
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n","", $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);

      foreach ($application_matches as $application_match){
          if ($application_match[4] != 'Current') { continue; }

          $application = new Application();

          //match the basic details; the reference doubles as the case number
          $case_number = $application_match[2];
          $application->council_reference = $case_number;
          $application->date_received = $application_match[1];
          $application->address = $application_match[3];

          //without a case number there is no detail page to fetch
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          //Get the postcode
          if (preg_match($postcode_pattern, $application->address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //get full details
          $details_html = safe_scrape_page($info_url_base . $case_number);
          $details_html = str_replace("\r\n","",$details_html);
          if ($full_detail_pattern !== null
              && preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[1];
          }

          //only add it if we have a postcode (bit useless otherwise).
          //x/y and lat/lon are deliberately not looked up here: it slowed the
          //scrape down and they get added when the app is parsed anyway (Richard)
          if (isset($application->postcode) && is_postcode($application->postcode)){
              array_push($applications, $application);
          }
      }

      return $applications;
  }
  135. // Council specific scrapers
  /**
   * Scrape the Islington search results page into Application objects.
   *
   * The council reference is taken from the text following the
   * 'Search Results</a>">' marker in each matched result. The detail page
   * (info URL base + reference) is then fetched for the "Proposal:"
   * description and the "Main location:" address. Only applications whose
   * address yields a valid postcode are returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the council reference is appended to for the detail page.
   * @param string $comment_url_base Prefix the council reference is appended to for the comment page.
   * @return array List of Application objects (may be empty).
   */
  function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
      $applications = array();

      $application_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
      $reference_pattern = '/Search Results<\/a>">([^<]*)/';
      $full_detail_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $address_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //grab the page
      $html = safe_scrape_page($search_url);
      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);

      foreach ($application_matches[0] as $application_match){
          $application_string = str_replace("\n","", $application_match);

          //match the application number; without it no detail URL can be built
          if (!preg_match($reference_pattern, $application_string, $reference_matches)){
              error_log("Unable to find council reference for an application at " . $search_url);
              continue;
          }

          $application = new Application();
          $application->council_reference = $reference_matches[1];

          //Comment and info urls
          $application->info_url = $info_url_base . $application->council_reference;
          $application->comment_url = $comment_url_base . $application->council_reference;

          //get full details
          $details_html = safe_scrape_page($info_url_base . $application->council_reference);
          $details_html = str_replace("\r\n","",$details_html);

          //Details
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[2];
          }

          //Address
          if (preg_match($address_pattern, $details_html, $address_matches)){
              $application->address = $address_matches[2];
          }

          //postcode (only possible when an address was found)
          if (isset($application->address)
              && preg_match($postcode_pattern, $application->address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //only add it if we have a postcode (bit useless otherwise)
          if (isset($application->postcode) && is_postcode($application->postcode)){
              array_push($applications, $application);
          }
      }

      return $applications;
  }
  /**
   * Validate the shape of a UK postcode.
   *
   * Spaces are removed first, then the remainder must be: 1-2 letters,
   * 1-2 digits, an optional letter, one digit and two letters (any case).
   * Uses preg_match because the ereg() family was removed in PHP 7.
   *
   * @param string $postcode Candidate postcode, with or without spaces.
   * @return bool True when the value has a valid postcode shape.
   */
  function is_postcode ($postcode){
      $postcode = str_replace(" ", "", (string) $postcode);
      // "D" anchors "$" to the very end of the string, as ereg's "$" did
      return preg_match('/^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]?[0-9][a-zA-Z]{2}$/D', $postcode) === 1;
  }
  /**
   * Normalise a postcode to "OUTWARD INWARD" form with a single space.
   *
   * The inward part is the trailing digit plus two letters; everything
   * before it is the outward part. Matching is case-insensitive so that
   * upper-case input is handled as well as lower-case input.
   *
   * @param string $postcode Postcode in any spacing or case.
   * @param bool   $upper    When true (default) the result is upper-cased.
   * @return string The normalised postcode, or the trimmed input unchanged
   *                (apart from optional upper-casing) when it does not end
   *                in a digit followed by two letters.
   */
  function clean_postcode ($postcode, $upper = true) {
      $postcode = trim($postcode);
      $reg = array();
      if (preg_match('/^(.+?)([0-9][a-z]{2})$/i', $postcode, $reg)){
          $clean_postcode = trim($reg[1]) . ' ' . trim($reg[2]);
      } else {
          // nothing recognisable to split: hand the input back rather than a lone space
          $clean_postcode = $postcode;
      }
      if ($upper){
          $clean_postcode = strtoupper($clean_postcode);
      }
      return $clean_postcode;
  }
  /**
   * Shorten a URL through the tinyurl.com API when it is long enough to bother.
   *
   * @param string $url    The URL to shorten.
   * @param int    $length URLs shorter than this many characters are returned unchanged.
   * @return string The API response body, or the original URL when it is
   *                short enough or the request fails.
   */
  function tiny_url($url,$length=30){
      if (strlen($url) < $length){
          return $url;
      }
      // The target must be encoded, otherwise any "&" or "#" inside it would
      // cut the value of the "url" query parameter short.
      // Errors are suppressed on purpose: shortening is best-effort.
      $tinyurl = @file_get_contents("http://tinyurl.com/api-create.php?url=" . urlencode($url));
      if ($tinyurl === false){
          return $url;
      }
      return $tinyurl;
  }
  /**
   * Build a Google Maps (UK) search URL for a postcode.
   *
   * Spaces in the postcode become "+" and the result is lower-cased
   * before being placed in the "q" parameter.
   *
   * @param string $postcode Postcode to search for.
   * @param int    $zoom     Value for the "z" parameter (default 15).
   * @return string The map URL.
   */
  function googlemap_url_from_postcode($postcode, $zoom = 15){
      $query = str_replace(" ", "+", $postcode);
      $query = strtolower($query);
      return "http://maps.google.co.uk/maps?q=" . $query . "&z=" . $zoom;
  }
  /**
   * Convert a postcode to an OS grid easting/northing pair.
   *
   * The postcode is looked up at ernestmarples.com, which is expected to
   * answer with "lat,lng" in CSV form; that pair is converted with
   * LatLng::toOSRef().
   *
   * @param string $postcode Postcode to look up.
   * @return array|null array(easting, northing), or null (after raising a
   *                    notice via trigger_error) when no lat/long pair
   *                    could be obtained.
   */
  function postcode_to_location($postcode){
      $clean_postcode = str_replace(" ", "+", strtolower($postcode));
      $url = "http://ernestmarples.com/?p=" . $clean_postcode . "&f=csv";

      $result = file_get_contents($url);
      // explode() replaces split(), which was removed in PHP 7
      $parts = ($result === false) ? array() : explode(",", $result);

      if (count($parts) != 2){
          trigger_error("No lat/long could be found");
          return null;
      }

      // trim so a trailing line break in the response does not end up in the longitude
      $lat = trim($parts[0]);
      $lng = trim($parts[1]);

      $LatLng = new LatLng($lat, $lng);
      $OSBG = $LatLng->toOSRef();
      return array($OSBG->easting, $OSBG->northing);
  }
  /**
   * Look up the nearest postcode for an OS grid reference via streetmap.co.uk.
   *
   * The response has its tags stripped and is searched for the text
   * "Nearest Post Code" followed by two whitespace-separated tokens.
   *
   * @param int $easting  Grid easting (formatted with %d).
   * @param int $northing Grid northing (formatted with %d).
   * @return string|null The two tokens that follow "Nearest Post Code", or
   *                     NULL when the text is not found.
   */
  function location_to_postcode($easting, $northing) {
      $lookup_url = sprintf(
          "http://streetmap.co.uk/streetmap.dll?GridConvert?name=%d,%d&type=OSGrid",
          $easting, $northing);

      // Kinda ghetto. Would be nice to have a nicer regex for postcodes.
      $lines = @file($lookup_url);
      $text = strip_tags(is_array($lines) ? implode("\n", $lines) : $lines);

      $found = preg_match('/Nearest\s+Post\s+Code\s+(\S+\s+\S+)/i', $text, $groups);
      return $found ? $groups[1] : NULL;
  }
  /**
   * Check that a string has the basic shape of an email address.
   *
   * Shape required: local part, "@", a first domain label without dots,
   * a ".", then the rest of the domain (which may contain further dots).
   * The permitted characters are the same sets the original POSIX pattern
   * used, including a literal backslash; the check is ported to PCRE
   * because ereg() was removed in PHP 7.
   *
   * @param string $string Candidate email address.
   * @return bool True when the string matches.
   */
  function valid_email ($string) {
      // characters allowed everywhere; "\\\\" is a literal backslash, "\/" the delimiter-safe slash
      $atom = '-!#$%&\'*+\\\\\/0-9=?A-Z^_`a-z{|}~';
      // the local part and the domain tail additionally allow "."
      $pattern = '/^[' . $atom . '.]+@[' . $atom . ']+\.[' . $atom . '.]+$/D';
      return preg_match($pattern, (string) $string) === 1;
  }
  /**
   * Translate an alert area size code into a distance.
   *
   * @param string $alert_area_size "s", "m" or "l".
   * @return mixed SMALL_ZONE_SIZE, MEDIUM_ZONE_SIZE or LARGE_ZONE_SIZE for
   *               the matching code; 0 for anything else.
   */
  function alert_size_to_meters($alert_area_size){
      // switch compares loosely, exactly as the == chain it replaces did
      switch ($alert_area_size){
          case "s":
              return SMALL_ZONE_SIZE;
          case "m":
              return MEDIUM_ZONE_SIZE;
          case "l":
              return LARGE_ZONE_SIZE;
          default:
              return 0;
      }
  }
  /**
   * Send a plain-text (iso-8859-1) email through mail().
   *
   * @param string $to         Recipient address.
   * @param string $from_name  Display name for the From header.
   * @param string $from_email Address for the From header.
   * @param string $subject    Message subject.
   * @param string $body       Message body.
   * @return void
   */
  function send_text_email($to, $from_name, $from_email, $subject, $body){
      $header_lines = array(
          'MIME-Version: 1.0',
          'Content-type: text/plain; charset=iso-8859-1',
          'From: ' . $from_name . ' <' . $from_email . ">"
      );
      // every header line, including the last, is terminated by CRLF
      $headers = implode("\r\n", $header_lines) . "\r\n";
      mail($to, $subject, $body, $headers);
  }
  /**
   * Format a Unix timestamp as a MySQL DATETIME string ("YYYY-MM-DD HH:MM:SS").
   *
   * @param int $date Unix timestamp.
   * @return string The formatted date, in the current default timezone.
   */
  function mysql_date($date){
      // single colons throughout: "H::i:s" produced e.g. "12::30:00"
      return date("Y-m-d H:i:s", $date);
  }
  /**
   * Fetch a page, trying up to three times until a non-empty result comes back.
   *
   * The fetch goes through scrape_page_pear() when SCRAPE_METHOD is "PEAR",
   * otherwise through scrape_page_curl().
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method handed to the fetcher (default "GET").
   * @return mixed The result of the last attempt made; this is still
   *               loosely false when all three attempts failed.
   */
  function safe_scrape_page($url, $method = "GET"){
      $page = "";
      $attempts_left = 3;
      // stop as soon as an attempt yields something that is not loosely false
      while ($attempts_left > 0 && $page == false){
          $attempts_left--;
          $page = (SCRAPE_METHOD == "PEAR")
              ? scrape_page_pear($url, $method)
              : scrape_page_curl($url, $method);
      }
      return $page;
  }
  /**
   * Fetch a page with PEAR's HTTP_Request and return the response body.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method passed to HTTP_Request (default "GET").
   * @return mixed Whatever HTTP_Request::getResponseBody() returns.
   */
  function scrape_page_pear($url, $method = "GET"){
      $options = array("method" => $method);
      $http = new HTTP_Request($url, $options);
      $http->sendRequest();
      return $http->getResponseBody();
  }
  /**
   * Fetch a page with cURL, following redirects, and return the body.
   *
   * Only $url is accepted: the request uses cURL's default method.
   *
   * @param string $url URL to fetch.
   * @return mixed The value returned by curl_exec() with
   *               CURLOPT_RETURNTRANSFER enabled.
   */
  function scrape_page_curl($url) {
      $handle = curl_init($url);
      curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);
      $response = curl_exec($handle);
      return $response;
  }
  /**
   * Render a list of applications through the "xml.tpl" Smarty template.
   *
   * The "applications" template variable is only assigned when the list
   * is non-empty.
   *
   * @param array  $applications         Applications to output.
   * @param string $authority_name       Assigned to the template as "authority_name".
   * @param string $authority_short_name Assigned to the template as "authority_short_name".
   * @return void
   */
  function display_applications($applications, $authority_name, $authority_short_name){
      //smarty
      $view = new Smarty;
      $view->force_compile = true;
      $view->compile_dir = SMARTY_COMPILE_DIRECTORY;
      $view->template_dir = "../templates";

      $template_vars = array(
          "authority_name" => $authority_name,
          "authority_short_name" => $authority_short_name
      );
      if (count($applications) > 0){
          $template_vars["applications"] = $applications;
      }
      foreach ($template_vars as $name => $value){
          $view->assign($name, $value);
      }

      $view->display("xml.tpl");
  }
  /**
   * Build a midnight timestamp from the "day", "month" and "year" GET parameters.
   *
   * @return int|false Result of mktime(0, 0, 0, month, day, year).
   * @throws Exception (via throw_error) when any of the three parameters is
   *                   missing; the message names the missing parameter.
   */
  function get_time_from_get(){
      if (!isset($_GET['day'])){
          throw_error("No day set in get string");
      }
      if (!isset($_GET['month'])){
          // this message used to say "year", hiding which parameter was actually missing
          throw_error("No month set in get string");
      }
      if (!isset($_GET['year'])){
          throw_error("No year set in get string");
      }
      return mktime(0, 0, 0, $_GET['month'], $_GET['day'], $_GET['year']);
  }
  /**
   * Raise an Exception carrying the given message.
   *
   * @param string $message Text for the exception.
   * @return void This function never returns normally.
   * @throws Exception Always.
   */
  function throw_error($message){
      $error = new Exception($message);
      throw $error;
  }
  /**
   * Send a "Location" header pointing at the given URL.
   *
   * Execution continues after the header is sent; the function does not exit.
   *
   * @param string $url Target of the redirect.
   * @return void
   */
  function redirect ($url){
      header("Location: " . $url);
  }
  351. ?>