Automatically exported from code.google.com/p/planningalerts
25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper_support.php 14 KiB

18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  /**
   * Scrape planning applications from a PublicAccess-style search results page.
   *
   * Each result row matched on the search page becomes an Application. When a
   * case number can be found in the row, the application's detail page is
   * fetched to read the description. Rows with no case number are logged and
   * skipped; applications whose postcode fails is_postcode() are not returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the info URL.
   * @param string $comment_url_base Prefix the case number is appended to for the comment URL.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
      $row_pattern      = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
      $cell_pattern     = "/<td>([^<])*/";
      $case_pattern     = "/caseno=([^&]*)/";
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
      $desc_pattern     = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';

      $found = array();

      // Fetch the results page and remove every "\r\n" sequence before matching
      $page = str_replace("\r\n","", safe_scrape_page($search_url));
      preg_match_all($row_pattern, $page, $rows, PREG_PATTERN_ORDER);

      foreach ($rows[0] as $row){
          preg_match_all($cell_pattern, $row, $cells, PREG_PATTERN_ORDER);

          // The first three <td> matches are used as reference, date received and address
          $application = new Application();
          $application->council_reference = str_replace("<td>", "", $cells[0][0]);
          $application->date_received = str_replace("<td>", "", $cells[0][1]);
          $application->address = str_replace("<td>", "", $cells[0][2]);
          //$application->status = str_replace("<td>", "", $detail_matches[0][4]);

          // The case number is the value of the caseno= parameter found in the row
          $case_number = "";
          preg_match($case_pattern, $row, $case_hit);
          if (count($case_hit) > 0){
              $case_number = str_replace("caseno=","", $case_hit[0]);
          }

          // Without a case number there is no detail page to fetch
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          // Info and comment urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          // Postcode is taken from the address text
          preg_match($postcode_pattern, $application->address, $postcode_hit);
          if (isset($postcode_hit[0])){
              $application->postcode = $postcode_hit[0];
          }

          // Description is everything after the first ">" of the matched textarea fragment
          $detail_page = safe_scrape_page($info_url_base . $case_number);
          preg_match($desc_pattern, $detail_page, $desc_hit);
          if (isset($desc_hit[0])){
              $application->description = substr($desc_hit[0], strpos($desc_hit[0], ">") + 1);
          }

          // Only keep applications with a postcode (bit useless otherwise)
          if (is_postcode($application->postcode)){
              $found[] = $application;
          }
      }

      return $found;
  }
  /**
   * Scrape planning applications from a WAM-style search results page.
   *
   * Only rows whose fourth column is exactly 'Current' are considered. For each
   * one, the detail page is fetched to read the description. Rows with an
   * empty reference are logged and skipped; applications whose postcode fails
   * is_postcode() are not returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the info URL.
   * @param string $comment_url_base Prefix the case number is appended to for the comment URL.
   * @param int    $detail_mode      Selects the description pattern for the detail page
   *                                 (1 or 2). Any other value leaves the description
   *                                 unset instead of running an undefined pattern.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
      $applications = array();
      $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      // Sites vary a tiny bit in their html, so the description pattern depends on the mode.
      // It stays null for an unsupported mode so that no match is attempted later.
      $full_detail_pattern = null;
      if ($detail_mode == 1){
          $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
      } elseif ($detail_mode == 2){
          $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
      }

      // Grab the page and remove every "\r\n" sequence before matching
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n","", $html);
      preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);

      foreach ($application_matches as $application_match){
          // Columns: 1 = date received, 2 = reference, 3 = address, 4 = status
          if ($application_match[4] != 'Current') { continue; }

          $application = new Application();
          $application->council_reference = $application_match[2];
          $application->date_received = $application_match[1];
          $application->address = $application_match[3];

          // The council reference doubles as the case number in the urls
          $case_number = $application_match[2];
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          // Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          // Postcode is taken from the address text
          preg_match($postcode_pattern, $application->address, $postcode_matches);
          if (isset($postcode_matches[0])){
              $application->postcode = $postcode_matches[0];
          }

          // Full details come from the application's own page
          $details_html = safe_scrape_page($info_url_base . $case_number);
          $details_html = str_replace("\r\n","",$details_html);
          if ($full_detail_pattern !== null){
              preg_match($full_detail_pattern, $details_html, $full_detail_matches);
              if (isset($full_detail_matches[1])){
                  $application->description = $full_detail_matches[1];
              }
          }

          // Only add it if we have a postcode (bit useless otherwise).
          // The x/y and lat/lon lookup is deliberately not done here: it slowed
          // the scrape down and is added when the application is parsed anyway (Richard).
          if (is_postcode($application->postcode)){
              array_push($applications, $application);
          }
      }

      return $applications;
  }
  126. // Council specific scrapers
  /**
   * Scrape planning applications from the Islington search results page.
   *
   * The reference is read from each matched result row; description and
   * address are then read from the application's detail page. Applications
   * whose postcode fails is_postcode() are not returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the council reference is appended to for the info URL.
   * @param string $comment_url_base Prefix the council reference is appended to for the comment URL.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
      $row_pattern       = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
      $reference_pattern = '/Search Results<\/a>">([^<]*)/';
      $proposal_pattern  = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $location_pattern  = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $postcode_pattern  = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      $found = array();

      // The search page is matched as fetched; newlines are only removed per row below
      $page = safe_scrape_page($search_url);
      preg_match_all($row_pattern, $page, $rows, PREG_PATTERN_ORDER);

      foreach ($rows[0] as $row){
          $row_text = str_replace("\n","", $row);
          preg_match_all($reference_pattern, $row_text, $reference_hits, PREG_PATTERN_ORDER);

          $application = new Application();

          // Application number: the matched text with its leading marker removed
          $application->council_reference = str_replace('Search Results</a>">', "", $reference_hits[0][0]);

          // Info and comment urls
          $application->info_url = $info_url_base . $application->council_reference;
          $application->comment_url = $comment_url_base . $application->council_reference;

          // Detail page, with every "\r\n" sequence removed
          $detail_page = safe_scrape_page($info_url_base . $application->council_reference);
          $detail_page = str_replace("\r\n","",$detail_page);

          // Description (second capture group of the Proposal row)
          preg_match($proposal_pattern, $detail_page, $proposal_hit);
          if (isset($proposal_hit[2])){
              $application->description = $proposal_hit[2];
          }

          // Address (second capture group of the Main location row)
          preg_match($location_pattern, $detail_page, $location_hit);
          if (isset($location_hit[2])){
              $application->address = $location_hit[2];
          }

          // Postcode is taken from the address text
          preg_match($postcode_pattern, $application->address, $postcode_hit);
          if (isset($postcode_hit[0])){
              $application->postcode = $postcode_hit[0];
          }

          // Only keep applications with a postcode (bit useless otherwise)
          if (is_postcode($application->postcode)){
              $found[] = $application;
          }
      }

      return $found;
  }
  174. //validate postcode
  /**
   * Check whether a string has the shape of a UK postcode.
   *
   * Spaces are ignored and letters may be in either case. The check is purely
   * structural (1-2 letters, 1-2 digits, optional letter, digit, 2 letters);
   * it does not verify that the postcode exists.
   *
   * Uses preg_match() rather than ereg(), which is not available from PHP 7 on.
   *
   * @param mixed $postcode Value to test; it is converted to a string first.
   * @return bool True when the value matches the postcode shape.
   */
  function is_postcode ($postcode){
      $postcode = str_replace(" ", "", (string) $postcode);

      // \z anchors at the very end, so a trailing newline is not accepted
      $pattern = '/^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]{0,1}[0-9]{1}[a-zA-Z]{2}\z/';

      return preg_match($pattern, $postcode) === 1;
  }
  /**
   * Normalise a postcode to "OUTWARD INWARD" form with a single space.
   *
   * The last three characters (digit + two letters) are treated as the inward
   * part and everything before them as the outward part. Matching is
   * case-insensitive, so upper-case input is handled as well as lower-case.
   * Input that does not end in digit + two letters is returned trimmed (and
   * upper-cased when requested) rather than being reduced to a lone space.
   *
   * @param string $postcode Postcode as entered.
   * @param bool   $upper    Upper-case the result when true.
   * @return string The normalised postcode.
   */
  function clean_postcode ($postcode, $upper = true) {
      $postcode = trim($postcode);

      $parts = array();
      if (preg_match('/^(.+?)([0-9][a-z]{2})$/i', $postcode, $parts) === 1) {
          $clean_postcode = trim($parts[1]) . ' ' . trim($parts[2]);
      } else {
          // Not postcode-shaped: hand back what we were given instead of reading missing groups
          $clean_postcode = $postcode;
      }

      if ($upper){
          $clean_postcode = strtoupper($clean_postcode);
      }
      return $clean_postcode;
  }
  193. //Tiny url
  /**
   * Shorten a URL through the tinyurl.com API when it is long enough to bother.
   *
   * URLs shorter than $length are returned unchanged without any network
   * call. The lookup is best-effort: if the service cannot be reached or
   * returns nothing, the original URL is returned.
   *
   * @param string $url    URL to shorten.
   * @param int    $length Minimum length at which shortening is attempted.
   * @return string The shortened URL, or the original URL.
   */
  function tiny_url($url,$length=30){
      if (strlen($url) < $length){
          return $url;
      }

      // The target must be encoded, otherwise any "&" or "#" in it cuts the url parameter short
      $lines = @file("http://tinyurl.com/api-create.php?url=" . urlencode($url));
      if (!is_array($lines)){
          return $url;
      }

      // file() keeps line endings, so trim them off the response
      $tinyurl = trim(join('', $lines));
      if ($tinyurl == ''){
          return $url;
      }
      return $tinyurl;
  }
  208. //Google maps url
  /**
   * Build a Google Maps (UK) search URL for a postcode.
   *
   * @param string $postcode Postcode to search for; spaces become "+" and the text is lower-cased.
   * @param int    $zoom     Value placed in the z parameter.
   * @return string The map URL.
   */
  function googlemap_url_from_postcode($postcode, $zoom = 15){
      $postcode = str_replace(" ", "+", $postcode);
      $postcode = strtolower($postcode);

      return "http://maps.google.co.uk/maps?q=$postcode&z=$zoom";
  }
  213. //postcode to location
  /**
   * Look up the x/y location of a postcode by scraping streetmap.co.uk.
   *
   * The values are read from the "var _LocationX=...;" and
   * "var _LocationY=...;" assignments in the returned page. A coordinate that
   * cannot be found is left as 0.
   *
   * @param string $postcode Postcode to look up.
   * @return array Two elements: index 0 is x, index 1 is y.
   */
  function postcode_to_location($postcode){
      $location = array(0, 0);

      // Lower-case the postcode and use "+" for spaces in the query string
      $query = str_replace(" ","+", strtolower($postcode));
      $page = file_get_contents("http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $query);

      // X: keep only what sits between the variable name and the semicolon
      preg_match("/var _LocationX=\d*;/", $page, $hit);
      if (count($hit) > 0){
          $location[0] = str_replace(array('var _LocationX=', ";"), "", $hit[0]);
      }

      // Y: same treatment
      preg_match("/var _LocationY=\d*;/", $page, $hit);
      if (count($hit) > 0){
          $location[1] = str_replace(array("var _LocationY=", ";"), "", $hit[0]);
      }

      return $location;
  }
  /**
   * Check whether a string looks like an email address.
   *
   * The address must be local-part, "@", a domain label, ".", then the rest
   * of the domain. The accepted character sets are carried over unchanged
   * from the previous POSIX pattern (including the literal backslash that a
   * POSIX bracket expression allowed), but matching now uses preg_match()
   * because ereg() is not available from PHP 7 on.
   *
   * @param string $string Candidate address.
   * @return bool True when the string matches the expected shape.
   */
  function valid_email ($string) {
      // Characters allowed in the local part and in the domain after the first dot
      $dotted = '[-!#$%&\'*+\\\\.\/0-9=?A-Z^_`a-z{|}~]+';
      // Characters allowed in the first domain label (same set without the dot)
      $label  = '[-!#$%&\'*+\\\\\/0-9=?A-Z^_`a-z{|}~]+';

      // \z anchors at the very end, so a trailing newline is not accepted
      $pattern = '/^' . $dotted . '@' . $label . '\.' . $dotted . '\z/';

      return preg_match($pattern, (string) $string) === 1;
  }
  /**
   * Translate an alert area size code into a distance.
   *
   * @param string $alert_area_size "s", "m" or "l".
   * @return mixed SMALL_ZONE_SIZE, MEDIUM_ZONE_SIZE or LARGE_ZONE_SIZE for the
   *               matching code; 0 for any other value.
   */
  function alert_size_to_meters($alert_area_size){
      // switch compares loosely, the same way the == chain it replaces did
      switch ($alert_area_size) {
          case "s":
              return SMALL_ZONE_SIZE;
          case "m":
              return MEDIUM_ZONE_SIZE;
          case "l":
              return LARGE_ZONE_SIZE;
          default:
              return 0;
      }
  }
  264. //Send a text email
  /**
   * Send a plain-text (iso-8859-1) email through mail().
   *
   * Carriage returns and line feeds are removed from the sender name and
   * address before they are placed in the From header, so that neither value
   * can add extra header lines.
   *
   * @param string $to         Recipient address.
   * @param string $from_name  Display name for the From header.
   * @param string $from_email Address for the From header.
   * @param string $subject    Subject line.
   * @param string $body       Message text.
   * @return bool The result of mail(): true when the message was accepted for delivery.
   */
  function send_text_email($to, $from_name, $from_email, $subject, $body){
      $line_breaks = array("\r", "\n");
      $from_name = str_replace($line_breaks, '', $from_name);
      $from_email = str_replace($line_breaks, '', $from_email);

      $headers = 'MIME-Version: 1.0' . "\r\n";
      $headers .= 'Content-type: text/plain; charset=iso-8859-1' . "\r\n";
      $headers .= 'From: ' . $from_name. ' <' . $from_email . ">\r\n";

      return mail($to, $subject, $body, $headers);
  }
  271. // Format a date to mysql format
  /**
   * Format a Unix timestamp as a MySQL DATETIME string (YYYY-MM-DD HH:MM:SS).
   *
   * The time is rendered in PHP's default timezone, as date() does.
   *
   * @param int $date Unix timestamp.
   * @return string The formatted date and time.
   */
  function mysql_date($date){
      // A single colon between hours and minutes; the previous "H::i" produced "HH::MM:SS"
      return date("Y-m-d H:i:s", $date);
  }
  /**
   * Fetch a page, trying up to three times until something non-empty comes back.
   *
   * The fetch goes through scrape_page_pear() when SCRAPE_METHOD is "PEAR" and
   * through scrape_page_curl() otherwise. A result that compares loosely equal
   * to false (for example "" or false) counts as a failed attempt.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method handed on to the fetch function.
   * @return mixed The page body, or the last falsy result if all attempts failed.
   */
  function safe_scrape_page($url, $method = "GET"){
      $page = "";
      $attempts_left = 3;

      while ($attempts_left > 0 && $page == false){
          $page = (SCRAPE_METHOD == "PEAR")
              ? scrape_page_pear($url, $method)
              : scrape_page_curl($url, $method);
          $attempts_left--;
      }

      return $page;
  }
  /**
   * Fetch a page using PEAR's HTTP_Request.
   *
   * The outcome of sendRequest() is not inspected; whatever response body the
   * request object holds afterwards is returned.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method passed to HTTP_Request as its "method" option.
   * @return mixed The value of getResponseBody().
   */
  function scrape_page_pear($url, $method = "GET"){
      $options = array("method" => $method);

      $request = new HTTP_Request($url, $options);
      $request->sendRequest();

      return $request->getResponseBody();
  }
  /**
   * Fetch a page using cURL, following redirects.
   *
   * Accepts the same optional $method argument as scrape_page_pear(), so the
   * method passed by safe_scrape_page() is honoured instead of being silently
   * dropped. Plain GET requests behave exactly as before.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method; "GET" by default.
   * @return mixed The response body, or false if the handle could not be
   *               created or the transfer failed.
   */
  function scrape_page_curl($url, $method = "GET") {
      $ch = curl_init($url);
      if ($ch === false){
          return false;
      }

      curl_setopt($ch,CURLOPT_RETURNTRANSFER,TRUE);
      curl_setopt($ch,CURLOPT_FOLLOWLOCATION,TRUE);

      $method = strtoupper($method);
      if ($method == "POST"){
          // An explicit empty body keeps cURL from waiting for upload data
          curl_setopt($ch,CURLOPT_POST,TRUE);
          curl_setopt($ch,CURLOPT_POSTFIELDS,'');
      } elseif ($method != "" && $method != "GET"){
          curl_setopt($ch,CURLOPT_CUSTOMREQUEST,$method);
      }

      return curl_exec($ch);
  }
  /**
   * Render a list of applications through the Smarty template "xml.tpl".
   *
   * The "applications" template variable is only assigned when the list is
   * not empty; the two authority names are always assigned.
   *
   * @param array  $applications         Applications to output.
   * @param string $authority_name       Assigned to the template as authority_name.
   * @param string $authority_short_name Assigned to the template as authority_short_name.
   * @return void
   */
  function display_applications($applications, $authority_name, $authority_short_name){
      // Smarty set-up: templates are recompiled on every call
      $view = new Smarty();
      $view->force_compile = true;
      $view->compile_dir = SMARTY_COMPILE_DIRECTORY;
      $view->template_dir = "../templates";

      $view->assign("authority_name", $authority_name);
      $view->assign("authority_short_name", $authority_short_name);

      $has_applications = count($applications) > 0;
      if ($has_applications){
          $view->assign("applications", $applications);
      }

      $view->display("xml.tpl");
  }
  /**
   * Build a timestamp for midnight on the date given by the day, month and
   * year query-string parameters.
   *
   * Each parameter must be present and numeric. A missing or non-numeric value
   * is reported through throw_error(), naming the parameter concerned (a
   * missing month used to be reported as a missing year).
   *
   * @return int Timestamp from mktime(0, 0, 0, month, day, year).
   */
  function get_time_from_get(){
      $parts = array();

      // Checked in this order so the first problem reported is day, then month, then year
      foreach (array('day', 'month', 'year') as $name){
          if (!isset($_GET[$name])){
              throw_error("No $name set in get string");
          }
          if (!is_numeric($_GET[$name])){
              throw_error("Invalid $name set in get string");
          }
          $parts[$name] = (int) $_GET[$name];
      }

      return mktime(0,0,0,$parts['month'],$parts['day'],$parts['year']);
  }
  /**
   * Raise an Exception carrying the given message.
   *
   * @param string $message Text for the exception.
   * @return void This function always throws.
   */
  function throw_error($message){
      $error = new Exception($message);
      throw $error;
  }
  /**
   * Send a Location header pointing at the given URL.
   *
   * Only the header is sent; the script keeps running after this call.
   *
   * @param string $url Address to redirect to.
   * @return void
   */
  function redirect ($url){
      $location_header = "Location: $url";
      header($location_header);
  }
  339. ?>