Automatically exported from code.google.com/p/planningalerts
Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

scraper_support.php 14 KiB

18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
18 anos atrás
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  8. function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
  9. $applications = array();
  10. //$application_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
  11. $application_pattern = "/<tr><th>([0-9]*)<\/th>.*(?=<\/tr)/U";
  12. //grab the page
  13. $html = safe_scrape_page($search_url);
  14. //clean html
  15. $html = str_replace("\r\n","", $html);
  16. preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
  17. foreach ($application_matches[0] as $application_match){
  18. //START Duncan's debug
  19. //print_r($application_match);
  20. //print_r("END");
  21. // END Duncan's debug
  22. $detail_pattern = "/<td>([^<])*/";
  23. preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
  24. $application = new Application();
  25. //match the basic details
  26. $application->council_reference = str_replace("<td>", "", $detail_matches[0][0]);
  27. $application->date_received = str_replace("<td>", "", $detail_matches[0][1]);
  28. $application->address = str_replace("<td>", "", $detail_matches[0][2]);
  29. //$application->status = str_replace("<td>", "", $detail_matches[0][4]);
  30. //match case number
  31. $casenumber_pattern = "/caseno=([^&]*)/";
  32. preg_match($casenumber_pattern, $application_match, $casenumber_matches);
  33. //START Duncan's debug
  34. //print_r($application_match);
  35. //var_dump($casenumber_matches);
  36. //END Duncan's debug
  37. $case_number ="";
  38. if(sizeof($casenumber_matches)>0){
  39. $case_number = str_replace("caseno=","", $casenumber_matches[0]);
  40. }
  41. //if weve found a caase number, then get the details
  42. if($case_number !=""){
  43. //Comment and info urls
  44. $application->info_url = $info_url_base . $case_number;
  45. $application->comment_url = $comment_url_base . $case_number;
  46. //Get the postcode
  47. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  48. preg_match($postcode_pattern, $application->address, $postcode_matches);
  49. if(isset($postcode_matches[0])){
  50. $application->postcode = $postcode_matches[0];
  51. }
  52. //get full details
  53. $details_html = "";
  54. $details_html = safe_scrape_page($info_url_base . $case_number);
  55. //regular expresion and clean
  56. $full_detail_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';
  57. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  58. if (isset($full_detail_matches[0])){
  59. $application->description = substr($full_detail_matches[0], strpos($full_detail_matches[0], ">") + 1);
  60. }
  61. //only add it if we have a postcode (bit useless otherwise)
  62. if(is_postcode($application->postcode)){
  63. array_push($applications, $application);
  64. }
  65. }else{
  66. error_log("Unable to find case number for an application at " . $search_url);
  67. }
  68. }
  69. //return
  70. return $applications;
  71. }
  72. function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
  73. $applications = array();
  74. $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
  75. //grab the page
  76. $html = safe_scrape_page($search_url);
  77. //clean html
  78. $html = str_replace("\r\n","", $html);
  79. preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);
  80. foreach ($application_matches as $application_match){
  81. if ($application_match[4] != 'Current') { continue; }
  82. $application = new Application();
  83. //match the basic details
  84. $application->council_reference = $application_match[2];
  85. $case_number = $application_match[2];
  86. $application->date_received = $application_match[1];
  87. $application->address = $application_match[3];
  88. //$application->status = $application_match[4];
  89. //if weve found a caase number, then get the details
  90. if($case_number !=""){
  91. //Comment and info urls
  92. $application->info_url = $info_url_base . $case_number;
  93. $application->comment_url = $comment_url_base . $case_number;
  94. //Get the postcode
  95. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  96. preg_match($postcode_pattern, $application->address, $postcode_matches);
  97. if(isset($postcode_matches[0])){
  98. $application->postcode = $postcode_matches[0];
  99. }
  100. //get full details
  101. $details_html = "";
  102. $details_html = safe_scrape_page($info_url_base . $case_number);
  103. $details_html = str_replace("\r\n","",$details_html);
  104. //regular expresion and clean. SItes vary a tiny bit in their html, so there's a bit of a hack here
  105. if ($detail_mode == 1){
  106. $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
  107. }
  108. if ($detail_mode == 2){
  109. $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
  110. }
  111. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  112. if (isset($full_detail_matches[1])){
  113. $application->description = $full_detail_matches[1];
  114. }
  115. //only add it if we have a postcode (bit useless otherwise)
  116. if(is_postcode($application->postcode)){
  117. //removed the xy for the moment. It is slowing down the scrape and will be added when the app is parsed anyway (Richard)
  118. /* $xy = postcode_to_location($application->postcode);
  119. $application->x = $xy[0];
  120. $application->y = $xy[1];
  121. $os = new OSRef($xy[0],$xy[1]);
  122. $latlon = $os->toLatLng();
  123. $application->lat = $latlon->lat;
  124. $application->lon = $latlon->lng;
  125. */
  126. array_push($applications, $application);
  127. }
  128. }else{
  129. error_log("Unable to find case number for an application at " . $search_url);
  130. }
  131. }
  132. //return
  133. return $applications;
  134. }
  135. // Council specific scapers
  136. function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
  137. $applications = array();
  138. $application_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
  139. //grab the page
  140. $html = safe_scrape_page($search_url);
  141. preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
  142. foreach ($application_matches[0] as $application_match){
  143. $application_string = str_replace("\n","", $application_match);
  144. $reference_pattern = '/Search Results<\/a>">([^<]*)/';
  145. preg_match_all($reference_pattern, $application_string, $reference_matches, PREG_PATTERN_ORDER);
  146. $application = new Application();
  147. //match the applicaiton number
  148. $application->council_reference = str_replace('Search Results</a>">', "", $reference_matches[0][0]);
  149. //Comment and info urls
  150. $application->info_url = $info_url_base . $application->council_reference;
  151. $application->comment_url = $comment_url_base . $application->council_reference;
  152. //get full details
  153. $details_html = "";
  154. $details_html = safe_scrape_page($info_url_base . $application->council_reference);
  155. $details_html = str_replace("\r\n","",$details_html);
  156. //Details
  157. $full_detail_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
  158. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  159. if (isset($full_detail_matches[2])){
  160. $application->description = $full_detail_matches[2];
  161. }
  162. //Address
  163. $address_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
  164. $address = "";
  165. preg_match($address_pattern, $details_html, $address_matches);
  166. if(isset($address_matches[2])){
  167. $application->address = $address_matches[2];
  168. }
  169. //postcode
  170. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  171. preg_match($postcode_pattern, $application->address, $postcode_matches);
  172. if(isset($postcode_matches[0])){
  173. $application->postcode = $postcode_matches[0];
  174. }
  175. //only add it if we have a postcode (bit useless otherwise)
  176. if(is_postcode($application->postcode)){
  177. array_push($applications, $application);
  178. }
  179. }
  180. //return
  181. return $applications;
  182. }
  183. //validate postcode
  184. function is_postcode ($postcode){
  185. $valid = false;
  186. $postcode=str_replace(" ","",$postcode);
  187. if(ereg ('^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]{0,1}[0-9]{1}[a-zA-Z]{2}$', $postcode)){
  188. $valid = true;
  189. }
  190. return $valid;
  191. }
  192. function clean_postcode ($postcode, $upper = true) {
  193. $reg = array();
  194. $postcode = trim($postcode);
  195. preg_match('/^(.+?)([0-9][a-z]{2})$/',$postcode, $reg);
  196. $clean_postcode = trim($reg[1]) . ' ' . trim($reg[2]);
  197. if($upper){
  198. $clean_postcode = strtoupper($clean_postcode);
  199. }
  200. return $clean_postcode;
  201. }
  202. //Tiny url
  203. function tiny_url($url,$length=30){
  204. // make nasty big url all small
  205. if (strlen($url) >= $length){
  206. $tinyurl = @file ("http://tinyurl.com/api-create.php?url=$url");
  207. if (is_array($tinyurl)){
  208. $tinyurl = join ('', $tinyurl);
  209. } else {
  210. $tinyurl = $url;
  211. }
  212. } else {
  213. $tinyurl = $url;
  214. }
  215. return $tinyurl;
  216. }
  217. //Google maps url
  218. function googlemap_url_from_postcode($postcode, $zoom = 15){
  219. $postcode = strtolower(str_replace(" ", "+", $postcode));
  220. return "http://maps.google.co.uk/maps?q=$postcode&z=$zoom";
  221. }
  222. //postcode to location
  223. function postcode_to_location($postcode){
  224. $x = 0;
  225. $y = 0;
  226. $clean_postcode = strtolower($postcode);
  227. $clean_postcode = str_replace(" ","+", $clean_postcode);
  228. $url = "http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $clean_postcode;
  229. $html = file_get_contents($url);
  230. $x_pattern = "/var _LocationX=\d*;/";
  231. $y_pattern = "/var _LocationY=\d*;/";
  232. //X
  233. preg_match($x_pattern, $html, $matches);
  234. if(sizeof($matches) >0){
  235. $x = $matches[0];
  236. $x = str_replace('var _LocationX=',"", $x);
  237. $x = str_replace(";","", $x);
  238. }
  239. //Y
  240. preg_match($y_pattern, $html, $matches);
  241. if(sizeof($matches) >0){
  242. $y = str_replace("var _LocationY=","", $matches[0]);
  243. $y = str_replace(";","", $y);
  244. }
  245. $return = array();
  246. $return[0] = $x;
  247. $return[1] = $y;
  248. return $return;
  249. }
  250. function valid_email ($string) {
  251. $valid = false;
  252. if (!ereg('^[-!#$%&\'*+\\./0-9=?A-Z^_`a-z{|}~]+'.
  253. '@'.
  254. '[-!#$%&\'*+\\/0-9=?A-Z^_`a-z{|}~]+\.'.
  255. '[-!#$%&\'*+\\./0-9=?A-Z^_`a-z{|}~]+$', $string)) {
  256. $valid = false;
  257. } else {
  258. $valid = true;
  259. }
  260. return $valid;
  261. }
  262. function alert_size_to_meters($alert_area_size){
  263. $area_size_meters = 0;
  264. if ($alert_area_size == "s"){
  265. $area_size_meters = SMALL_ZONE_SIZE;
  266. }elseif ($alert_area_size == "m"){
  267. $area_size_meters = MEDIUM_ZONE_SIZE;
  268. }elseif ($alert_area_size == "l"){
  269. $area_size_meters = LARGE_ZONE_SIZE;
  270. }
  271. return $area_size_meters;
  272. }
  273. //Send a text email
  274. function send_text_email($to, $from_name, $from_email, $subject, $body){
  275. $headers = 'MIME-Version: 1.0' . "\r\n";
  276. $headers .= 'Content-type: text/plain; charset=iso-8859-1' . "\r\n";
  277. $headers .= 'From: ' . $from_name. ' <' . $from_email . ">\r\n";
  278. mail($to, $subject, $body, $headers);
  279. }
  280. // Format a date to mysql format
  281. function mysql_date($date){
  282. return date("Y-m-d H::i:s", $date);
  283. }
  284. function safe_scrape_page($url, $method = "GET"){
  285. $page = "";
  286. for ($i=0; $i < 3; $i++){
  287. if($page == false){
  288. if (SCRAPE_METHOD == "PEAR"){
  289. $page = scrape_page_pear($url, $method);
  290. }else{
  291. $page = scrape_page_curl($url, $method);
  292. }
  293. }
  294. }
  295. return $page;
  296. }
  297. function scrape_page_pear($url, $method = "GET"){
  298. $page = "";
  299. $request = new HTTP_Request($url, array("method" => $method));
  300. $request->sendRequest();
  301. $page = $request->getResponseBody();
  302. return $page;
  303. }
  304. function scrape_page_curl($url) {
  305. $ch = curl_init($url);
  306. curl_setopt($ch,CURLOPT_RETURNTRANSFER,TRUE);
  307. curl_setopt($ch,CURLOPT_FOLLOWLOCATION,TRUE);
  308. return curl_exec($ch);
  309. }
  310. function display_applications($applications, $authority_name, $authority_short_name){
  311. //smarty
  312. $smarty = new Smarty;
  313. $smarty->force_compile = true;
  314. $smarty->compile_dir = SMARTY_COMPILE_DIRECTORY;
  315. $smarty->template_dir = "../templates";
  316. $smarty->assign("authority_name", $authority_name);
  317. $smarty->assign("authority_short_name", $authority_short_name);
  318. if (sizeof($applications) > 0){
  319. $smarty->assign("applications", $applications);
  320. }
  321. $smarty->display("xml.tpl");
  322. }
  323. function get_time_from_get(){
  324. //if any get params were passed, overwrite the default date
  325. if (isset($_GET['day'])){
  326. $day = $_GET['day'];
  327. }else{
  328. throw_error("No day set in get string");
  329. }
  330. if (isset($_GET['month'])){
  331. $month = $_GET['month'];
  332. }else{
  333. throw_error("No year set in get string");
  334. }
  335. if (isset($_GET['year'])){
  336. $year = $_GET['year'];
  337. }else{
  338. throw_error("No year set in get string");
  339. }
  340. return mktime(0,0,0,$month,$day,$year);
  341. }
  342. function throw_error($message){
  343. throw new exception($message);
  344. }
  345. function redirect ($url){
  346. header("Location: $url");
  347. }
  348. ?>