Automatically exported from code.google.com/p/planningalerts
25개 이상의 토픽을 선택하실 수 없습니다. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper_support.php 14 KiB

18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
18 년 전
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  8. function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
  9. $applications = array();
  10. //$application_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
  11. $application_pattern = "/<tr><th>([0-9]*)<\/th>.*(?=<\/tr)/U";
  12. //grab the page
  13. $html = safe_scrape_page($search_url);
  14. //clean html
  15. $html = str_replace("\r\n","", $html);
  16. preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
  17. foreach ($application_matches[0] as $application_match){
  18. //START Duncan's debug
  19. //print_r($application_match);
  20. //print_r("END");
  21. // END Duncan's debug
  22. $detail_pattern = "/<td>([^<])*/";
  23. preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
  24. $application = new Application();
  25. //match the basic details
  26. $application->council_reference = str_replace("<td>", "", $detail_matches[0][0]);
  27. $application->date_received = str_replace("<td>", "", $detail_matches[0][1]);
  28. $application->address = str_replace("<td>", "", $detail_matches[0][2]);
  29. //$application->status = str_replace("<td>", "", $detail_matches[0][4]);
  30. //match case number
  31. $casenumber_pattern = "/caseno=([^&]*)/";
  32. preg_match($casenumber_pattern, $application_match, $casenumber_matches);
  33. //START Duncan's debug
  34. //print_r($application_match);
  35. //var_dump($casenumber_matches);
  36. //END Duncan's debug
  37. $case_number ="";
  38. if(sizeof($casenumber_matches)>0){
  39. $case_number = str_replace("caseno=","", $casenumber_matches[0]);
  40. }
  41. //if weve found a caase number, then get the details
  42. if($case_number !=""){
  43. //Comment and info urls
  44. $application->info_url = $info_url_base . $case_number;
  45. $application->comment_url = $comment_url_base . $case_number;
  46. //Get the postcode
  47. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  48. preg_match($postcode_pattern, $application->address, $postcode_matches);
  49. if(isset($postcode_matches[0])){
  50. $application->postcode = $postcode_matches[0];
  51. }
  52. //get full details
  53. $details_html = "";
  54. $details_html = safe_scrape_page($info_url_base . $case_number);
  55. //regular expresion and clean
  56. $full_detail_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';
  57. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  58. if (isset($full_detail_matches[0])){
  59. $application->description = substr($full_detail_matches[0], strpos($full_detail_matches[0], ">") + 1);
  60. }
  61. //only add it if we have a postcode (bit useless otherwise)
  62. if(is_postcode($application->postcode)){
  63. array_push($applications, $application);
  64. }
  65. }else{
  66. error_log("Unable to find case number for an application at " . $search_url);
  67. }
  68. }
  69. //return
  70. return $applications;
  71. }
  72. function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
  73. $applications = array();
  74. $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
  75. //grab the page
  76. $html = safe_scrape_page($search_url);
  77. //clean html
  78. $html = str_replace("\r\n","", $html);
  79. preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);
  80. foreach ($application_matches as $application_match){
  81. if ($application_match[4] != 'Current') { continue; }
  82. $application = new Application();
  83. //match the basic details
  84. $application->council_reference = $application_match[2];
  85. $case_number = $application_match[2];
  86. $application->date_received = $application_match[1];
  87. $application->address = $application_match[3];
  88. //$application->status = $application_match[4];
  89. //if weve found a caase number, then get the details
  90. if($case_number !=""){
  91. //Comment and info urls
  92. $application->info_url = $info_url_base . $case_number;
  93. $application->comment_url = $comment_url_base . $case_number;
  94. //Get the postcode
  95. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  96. preg_match($postcode_pattern, $application->address, $postcode_matches);
  97. if(isset($postcode_matches[0])){
  98. $application->postcode = $postcode_matches[0];
  99. }
  100. //get full details
  101. $details_html = "";
  102. $details_html = safe_scrape_page($info_url_base . $case_number);
  103. $details_html = str_replace("\r\n","",$details_html);
  104. //regular expresion and clean. SItes vary a tiny bit in their html, so there's a bit of a hack here
  105. if ($detail_mode == 1){
  106. $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
  107. }
  108. if ($detail_mode == 2){
  109. $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
  110. }
  111. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  112. if (isset($full_detail_matches[1])){
  113. $application->description = $full_detail_matches[1];
  114. }
  115. //only add it if we have a postcode (bit useless otherwise)
  116. if(is_postcode($application->postcode)){
  117. //removed the xy for the moment. It is slowing down the scrape and will be added when the app is parsed anyway (Richard)
  118. /* $xy = postcode_to_location($application->postcode);
  119. $application->x = $xy[0];
  120. $application->y = $xy[1];
  121. $os = new OSRef($xy[0],$xy[1]);
  122. $latlon = $os->toLatLng();
  123. $application->lat = $latlon->lat;
  124. $application->lon = $latlon->lng;
  125. */
  126. array_push($applications, $application);
  127. }
  128. }else{
  129. error_log("Unable to find case number for an application at " . $search_url);
  130. }
  131. }
  132. //return
  133. return $applications;
  134. }
  135. // Council specific scrapers
  136. function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
  137. $applications = array();
  138. $application_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
  139. //grab the page
  140. $html = safe_scrape_page($search_url);
  141. preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
  142. foreach ($application_matches[0] as $application_match){
  143. $application_string = str_replace("\n","", $application_match);
  144. $reference_pattern = '/Search Results<\/a>">([^<]*)/';
  145. preg_match_all($reference_pattern, $application_string, $reference_matches, PREG_PATTERN_ORDER);
  146. $application = new Application();
  147. //match the applicaiton number
  148. $application->council_reference = str_replace('Search Results</a>">', "", $reference_matches[0][0]);
  149. //Comment and info urls
  150. $application->info_url = $info_url_base . $application->council_reference;
  151. $application->comment_url = $comment_url_base . $application->council_reference;
  152. //get full details
  153. $details_html = "";
  154. $details_html = safe_scrape_page($info_url_base . $application->council_reference);
  155. $details_html = str_replace("\r\n","",$details_html);
  156. //Details
  157. $full_detail_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
  158. preg_match($full_detail_pattern, $details_html, $full_detail_matches);
  159. if (isset($full_detail_matches[2])){
  160. $application->description = $full_detail_matches[2];
  161. }
  162. //Address
  163. $address_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
  164. $address = "";
  165. preg_match($address_pattern, $details_html, $address_matches);
  166. if(isset($address_matches[2])){
  167. $application->address = $address_matches[2];
  168. }
  169. //postcode
  170. $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
  171. preg_match($postcode_pattern, $application->address, $postcode_matches);
  172. if(isset($postcode_matches[0])){
  173. $application->postcode = $postcode_matches[0];
  174. }
  175. //only add it if we have a postcode (bit useless otherwise)
  176. if(is_postcode($application->postcode)){
  177. array_push($applications, $application);
  178. }
  179. }
  180. //return
  181. return $applications;
  182. }
  183. //validate postcode
  184. function is_postcode ($postcode){
  185. $valid = false;
  186. $postcode=str_replace(" ","",$postcode);
  187. if(ereg ('^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]{0,1}[0-9]{1}[a-zA-Z]{2}$', $postcode)){
  188. $valid = true;
  189. }
  190. return $valid;
  191. }
  192. function clean_postcode ($postcode, $upper = true) {
  193. $reg = array();
  194. $postcode = trim($postcode);
  195. preg_match('/^(.+?)([0-9][a-z]{2})$/',$postcode, $reg);
  196. $clean_postcode = trim($reg[1]) . ' ' . trim($reg[2]);
  197. if($upper){
  198. $clean_postcode = strtoupper($clean_postcode);
  199. }
  200. return $clean_postcode;
  201. }
  202. //Tiny url
  203. function tiny_url($url,$length=30){
  204. // make nasty big url all small
  205. if (strlen($url) >= $length){
  206. $tinyurl = @file ("http://tinyurl.com/api-create.php?url=$url");
  207. if (is_array($tinyurl)){
  208. $tinyurl = join ('', $tinyurl);
  209. } else {
  210. $tinyurl = $url;
  211. }
  212. } else {
  213. $tinyurl = $url;
  214. }
  215. return $tinyurl;
  216. }
  217. //Google maps url
  218. function googlemap_url_from_postcode($postcode, $zoom = 15){
  219. $postcode = strtolower(str_replace(" ", "+", $postcode));
  220. return "http://maps.google.co.uk/maps?q=$postcode&z=$zoom";
  221. }
  222. //postcode to location
  223. function postcode_to_location($postcode){
  224. // We don't actually need to fetch the page, we
  225. // can get everything we need from the url we are
  226. // redirected to.
  227. $clean_postcode = strtolower($postcode);
  228. $clean_postcode = str_replace(" ","+", $clean_postcode);
  229. $url = "http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $clean_postcode;
  230. $headers = get_headers($url, 1);
  231. $location = $headers["Location"];
  232. $location_regex = "/x=(\d*)&y=(\d*)&/";
  233. preg_match ($location_regex, $location, $matches);
  234. return array_slice ($matches, 1);
  235. }
  236. function valid_email ($string) {
  237. $valid = false;
  238. if (!ereg('^[-!#$%&\'*+\\./0-9=?A-Z^_`a-z{|}~]+'.
  239. '@'.
  240. '[-!#$%&\'*+\\/0-9=?A-Z^_`a-z{|}~]+\.'.
  241. '[-!#$%&\'*+\\./0-9=?A-Z^_`a-z{|}~]+$', $string)) {
  242. $valid = false;
  243. } else {
  244. $valid = true;
  245. }
  246. return $valid;
  247. }
  248. function alert_size_to_meters($alert_area_size){
  249. $area_size_meters = 0;
  250. if ($alert_area_size == "s"){
  251. $area_size_meters = SMALL_ZONE_SIZE;
  252. }elseif ($alert_area_size == "m"){
  253. $area_size_meters = MEDIUM_ZONE_SIZE;
  254. }elseif ($alert_area_size == "l"){
  255. $area_size_meters = LARGE_ZONE_SIZE;
  256. }
  257. return $area_size_meters;
  258. }
  259. //Send a text email
  260. function send_text_email($to, $from_name, $from_email, $subject, $body){
  261. $headers = 'MIME-Version: 1.0' . "\r\n";
  262. $headers .= 'Content-type: text/plain; charset=iso-8859-1' . "\r\n";
  263. $headers .= 'From: ' . $from_name. ' <' . $from_email . ">\r\n";
  264. mail($to, $subject, $body, $headers);
  265. }
  266. // Format a date to mysql format
  267. function mysql_date($date){
  268. return date("Y-m-d H::i:s", $date);
  269. }
  270. function safe_scrape_page($url, $method = "GET"){
  271. $page = "";
  272. for ($i=0; $i < 3; $i++){
  273. if($page == false){
  274. if (SCRAPE_METHOD == "PEAR"){
  275. $page = scrape_page_pear($url, $method);
  276. }else{
  277. $page = scrape_page_curl($url, $method);
  278. }
  279. }
  280. }
  281. return $page;
  282. }
  283. function scrape_page_pear($url, $method = "GET"){
  284. $page = "";
  285. $request = new HTTP_Request($url, array("method" => $method));
  286. $request->sendRequest();
  287. $page = $request->getResponseBody();
  288. return $page;
  289. }
  290. function scrape_page_curl($url) {
  291. $ch = curl_init($url);
  292. curl_setopt($ch,CURLOPT_RETURNTRANSFER,TRUE);
  293. curl_setopt($ch,CURLOPT_FOLLOWLOCATION,TRUE);
  294. return curl_exec($ch);
  295. }
  296. function display_applications($applications, $authority_name, $authority_short_name){
  297. //smarty
  298. $smarty = new Smarty;
  299. $smarty->force_compile = true;
  300. $smarty->compile_dir = SMARTY_COMPILE_DIRECTORY;
  301. $smarty->template_dir = "../templates";
  302. $smarty->assign("authority_name", $authority_name);
  303. $smarty->assign("authority_short_name", $authority_short_name);
  304. if (sizeof($applications) > 0){
  305. $smarty->assign("applications", $applications);
  306. }
  307. $smarty->display("xml.tpl");
  308. }
  309. function get_time_from_get(){
  310. //if any get params were passed, overwrite the default date
  311. if (isset($_GET['day'])){
  312. $day = $_GET['day'];
  313. }else{
  314. throw_error("No day set in get string");
  315. }
  316. if (isset($_GET['month'])){
  317. $month = $_GET['month'];
  318. }else{
  319. throw_error("No year set in get string");
  320. }
  321. if (isset($_GET['year'])){
  322. $year = $_GET['year'];
  323. }else{
  324. throw_error("No year set in get string");
  325. }
  326. return mktime(0,0,0,$month,$day,$year);
  327. }
  328. function throw_error($message){
  329. throw new exception($message);
  330. }
  331. function redirect ($url){
  332. header("Location: $url");
  333. }
  334. ?>