Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper_support.php 14 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
14 years ago
17 years ago
14 years ago
14 years ago
14 years ago
14 years ago
17 years ago
17 years ago
17 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  /**
   * Scrape a "PublicAccess" search results page into Application objects.
   *
   * Every result row (a <tr> starting with a numeric <th>) is expected to hold the
   * council reference, date received and address in its first three <td> cells, and
   * a link containing "caseno=<case number>". The description is read from the
   * application's detail page (info URL).
   *
   * Rows without a case number, with fewer than three cells, or without a valid
   * postcode in the address are skipped. The postcode is checked before the detail
   * page is requested so no HTTP request is spent on rows that would be dropped.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page URL.
   * @param string $comment_url_base Prefix the case number is appended to for the comment URL.
   * @return array Application objects, one per usable row.
   */
  function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
      $applications = array();

      //one match per result row: from "<tr><th>N</th>" up to (not including) the next "</tr"
      $application_pattern = "/<tr><th>([0-9]*)<\/th>.*(?=<\/tr)/U";
      $detail_pattern = "/<td>([^<])*/";
      $casenumber_pattern = "/caseno=([^&]*)/";
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
      $full_detail_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';

      //grab the page and remove line breaks so "." in the row pattern can span a whole row
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n","", (string) $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
      foreach ($application_matches[0] as $application_match){
          //match case number
          $case_number = "";
          if (preg_match($casenumber_pattern, $application_match, $casenumber_matches)){
              $case_number = str_replace("caseno=","", $casenumber_matches[0]);
          }
          if ($case_number === ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //match the basic details: the first three cells are reference, date received, address
          preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
          if (count($detail_matches[0]) < 3){
              error_log("Unable to find the basic details for application " . $case_number . " at " . $search_url);
              continue;
          }
          $address = str_replace("<td>", "", $detail_matches[0][2]);

          //only keep it if we have a postcode (bit useless otherwise)
          if (!preg_match($postcode_pattern, $address, $postcode_matches) || !is_postcode($postcode_matches[0])){
              continue;
          }

          $application = new Application();
          $application->council_reference = str_replace("<td>", "", $detail_matches[0][0]);
          $application->date_received = str_replace("<td>", "", $detail_matches[0][1]);
          $application->address = $address;

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;
          $application->postcode = $postcode_matches[0];

          //get full details; the description is the text captured after the "desc" field's opening tag
          $details_html = (string) safe_scrape_page($info_url_base . $case_number);
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[1];
          }

          $applications[] = $application;
      }

      return $applications;
  }
  /**
   * Scrape a "WAM" search results page into Application objects.
   *
   * Each result row is matched as four cells: date received, a link whose text is the
   * council reference (also used as the case number), address and status. Only rows
   * whose status is exactly "Current" are considered. The description is read from
   * the application's detail page.
   *
   * Rows without a valid postcode in the address are skipped, and this is checked
   * before the detail page is requested so no HTTP request is spent on them.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page URL.
   * @param string $comment_url_base Prefix the case number is appended to for the comment URL.
   * @param int    $detail_mode      Which detail page layout to expect (1 or 2). For any other
   *                                 value no description is extracted.
   * @return array Application objects, one per usable row.
   */
  function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
      $applications = array();
      $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //Sites vary a tiny bit in their html, so the description pattern depends on the mode.
      //It stays null for an unknown mode instead of being left undefined.
      $full_detail_pattern = null;
      if ($detail_mode == 1){
          $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
      }elseif ($detail_mode == 2){
          $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
      }else{
          error_log("Unknown detail mode for " . $search_url . ", descriptions will not be scraped");
      }

      //grab the page
      $html = safe_scrape_page($search_url);
      //clean html
      $html = str_replace("\r\n","", (string) $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);
      foreach ($application_matches as $application_match){
          if ($application_match[4] != 'Current') { continue; }

          //the reference doubles as the case number used in the urls
          $case_number = $application_match[2];
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //only keep it if we have a postcode (bit useless otherwise)
          $address = $application_match[3];
          if (!preg_match($postcode_pattern, $address, $postcode_matches) || !is_postcode($postcode_matches[0])){
              continue;
          }

          $application = new Application();
          $application->council_reference = $case_number;
          $application->date_received = $application_match[1];
          $application->address = $address;

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;
          $application->postcode = $postcode_matches[0];

          //get full details
          if ($full_detail_pattern !== null){
              $details_html = (string) safe_scrape_page($info_url_base . $case_number);
              $details_html = str_replace("\r\n","",$details_html);
              if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
                  $application->description = $full_detail_matches[1];
              }
          }

          //x/y and lat/lon are not looked up here: it slowed the scrape down and
          //they are added when the application is parsed anyway (Richard)
          $applications[] = $application;
      }

      return $applications;
  }
  135. // Council-specific scrapers
  /**
   * Scrape the Islington search results page into Application objects.
   *
   * The results page only yields the council reference; the description ("Proposal:")
   * and address ("Main location:") are read from each application's detail page, and
   * the postcode is extracted from that address.
   *
   * Rows where no reference can be found are logged and skipped, and applications
   * without a valid postcode are left out.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the reference is appended to for the detail page URL.
   * @param string $comment_url_base Prefix the reference is appended to for the comment URL.
   * @return array Application objects, one per usable row.
   */
  function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
      $applications = array();
      $application_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
      $reference_pattern = '/Search Results<\/a>">([^<]*)/';
      $full_detail_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $address_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //grab the page
      $html = (string) safe_scrape_page($search_url);

      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);
      foreach ($application_matches[0] as $application_match){
          $application_string = str_replace("\n","", $application_match);

          //match the application number: the text that follows the "Search Results" link
          $reference = "";
          if (preg_match($reference_pattern, $application_string, $reference_matches)){
              $reference = $reference_matches[1];
          }
          if ($reference === ""){
              //without a reference there is no detail page to fetch
              error_log("Unable to find reference for an application at " . $search_url);
              continue;
          }

          $application = new Application();
          $application->council_reference = $reference;

          //Comment and info urls
          $application->info_url = $info_url_base . $reference;
          $application->comment_url = $comment_url_base . $reference;

          //get full details
          $details_html = (string) safe_scrape_page($info_url_base . $reference);
          $details_html = str_replace("\r\n","",$details_html);

          //Details
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[2];
          }

          //Address
          $address = "";
          if (preg_match($address_pattern, $details_html, $address_matches)){
              $address = $address_matches[2];
              $application->address = $address;
          }

          //postcode; only add the application if we have one (bit useless otherwise)
          if (!preg_match($postcode_pattern, $address, $postcode_matches) || !is_postcode($postcode_matches[0])){
              continue;
          }
          $application->postcode = $postcode_matches[0];

          $applications[] = $application;
      }

      return $applications;
  }
  183. //validate postcode
  /**
   * Check whether a string has the letter/digit layout of a UK postcode.
   *
   * Spaces are removed before matching, so "SW1A 1AA" and "SW1A1AA" are treated
   * alike. Matching is case-insensitive. Only the layout is checked.
   *
   * @param string|null $postcode Candidate postcode; null is treated as an empty string.
   * @return bool True when the layout matches, false otherwise.
   */
  function is_postcode ($postcode){
      $postcode = str_replace(" ", "", (string) $postcode);
      //ereg() was removed in PHP 7, so the same expression is run through preg_match().
      //\z anchors at the very end of the string, so a trailing newline is not accepted.
      return preg_match('/^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]?[0-9][a-zA-Z]{2}\z/', $postcode) === 1;
  }
  /**
   * Normalise a postcode to "<outward> <inward>" form with a single space.
   *
   * The inward part is taken to be the final digit plus two letters. Letters are
   * matched case-insensitively, so upper-case input is split as well. When the
   * input does not end in that shape it is returned trimmed but otherwise
   * unsplit, rather than as a lone space.
   *
   * @param string $postcode Postcode, with or without a space.
   * @param bool   $upper    Upper-case the result when true (the default).
   * @return string The normalised postcode.
   */
  function clean_postcode ($postcode, $upper = true) {
      $postcode = trim((string) $postcode);
      $reg = array();
      if (preg_match('/^(.+?)([0-9][a-z]{2})$/i', $postcode, $reg)){
          $clean_postcode = trim($reg[1]) . ' ' . trim($reg[2]);
      }else{
          $clean_postcode = $postcode;
      }
      if($upper){
          $clean_postcode = strtoupper($clean_postcode);
      }
      return $clean_postcode;
  }
  202. //Tiny url
  /**
   * Shorten a URL through the tinyurl.com API when it is at least $length characters long.
   *
   * This is best effort: a URL shorter than $length, a failed request or an empty
   * response all yield the original URL unchanged.
   *
   * @param string $url    URL to shorten.
   * @param int    $length Minimum length at which shortening is attempted.
   * @return string The short URL, or $url itself.
   */
  function tiny_url($url,$length=30){
      if (strlen($url) < $length){
          return $url;
      }
      //The long url must be encoded, otherwise any "&" or "#" in it cuts the query parameter short.
      //@ is deliberate: a failed lookup just falls back to the long url.
      $response = @file("http://tinyurl.com/api-create.php?url=" . urlencode($url));
      if (!is_array($response)){
          return $url;
      }
      $tinyurl = trim(join('', $response));
      return $tinyurl !== '' ? $tinyurl : $url;
  }
  217. //Google maps url
  /**
   * Build a maps.google.co.uk URL that searches for a postcode.
   *
   * Spaces in the postcode become "+" and the postcode is lower-cased.
   *
   * @param string $postcode Postcode to search for.
   * @param int    $zoom     Value for the "z" (zoom) query parameter.
   * @return string The map URL.
   */
  function googlemap_url_from_postcode($postcode, $zoom = 15){
      $query = str_replace(" ", "+", $postcode);
      $query = strtolower($query);
      return "http://maps.google.co.uk/maps?q=" . $query . "&z=" . $zoom;
  }
  222. //postcode to location
  /**
   * Look up the x/y coordinates of a postcode via streetmap.co.uk.
   *
   * The page itself is not needed: the coordinates are read from the "x=..&y=..&"
   * part of the URL the search redirects to (the Location response header).
   *
   * @param string $postcode Postcode to look up.
   * @return array array(x, y) as strings, or an empty array when the request fails
   *               or no redirect carrying coordinates is found.
   */
  function postcode_to_location($postcode){
      $clean_postcode = str_replace(" ", "+", strtolower($postcode));
      $url = "http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $clean_postcode;

      $headers = get_headers($url, 1);
      if (!is_array($headers)){
          //request failed
          return array();
      }

      //header names are case-insensitive, so do not rely on the exact spelling "Location"
      $headers = array_change_key_case($headers, CASE_LOWER);
      if (!isset($headers["location"])){
          return array();
      }

      //after several redirects get_headers() returns an array of Location values
      $location_regex = "/x=(\d*)&y=(\d*)&/";
      foreach ((array) $headers["location"] as $location){
          if (preg_match($location_regex, $location, $matches)){
              return array_slice($matches, 1);
          }
      }
      return array();
  }
  /**
   * Look up the nearest postcode to an OS grid position via streetmap.co.uk.
   *
   * The grid conversion page is fetched, its tags are stripped, and the two
   * whitespace-separated tokens following "Nearest Post Code" are returned.
   *
   * @param int $easting  OS grid easting.
   * @param int $northing OS grid northing.
   * @return string|null The postcode text, or NULL when it cannot be found.
   */
  function location_to_postcode($easting, $northing) {
      $url = sprintf("http://streetmap.co.uk/streetmap.dll?GridConvert?name=%d,%d&type=OSGrid", $easting, $northing);

      //file() gives an array of lines on success; anything else is passed on as it is
      $lines = @file($url);
      $text = is_array($lines) ? join("\n", $lines) : $lines;
      $text = strip_tags($text);

      // Kinda ghetto. Would be nice to have a nicer regex for postcodes.
      if (!preg_match('/Nearest\s+Post\s+Code\s+(\S+\s+\S+)/i', $text, $found)) {
          return NULL;
      }
      return $found[1];
  }
  /**
   * Check that a string looks like an email address: local part, "@", and a domain
   * containing at least one dot.
   *
   * This is the original POSIX expression ported to PCRE because ereg() was removed
   * in PHP 7. The accepted characters are kept the same; a backslash inside a POSIX
   * bracket expression is a literal character, so it is still listed explicitly here.
   *
   * @param string $string Candidate email address.
   * @return bool True when the string matches, false otherwise.
   */
  function valid_email ($string) {
      //characters allowed in the local part and after the first dot of the domain
      $chars_with_dot = '[-!#$%&\'*+\\\\.\\/0-9=?A-Z^_`a-z{|}~]+';
      //characters allowed in the first domain label (no dot)
      $chars_without_dot = '[-!#$%&\'*+\\\\\\/0-9=?A-Z^_`a-z{|}~]+';

      //\z anchors at the very end of the string, so a trailing newline is not accepted
      $pattern = '/^' . $chars_with_dot . '@' . $chars_without_dot . '\.' . $chars_with_dot . '\z/';
      return preg_match($pattern, (string) $string) === 1;
  }
  /**
   * Translate an alert area size code into a zone size in meters.
   *
   * "s", "m" and "l" map to the SMALL_ZONE_SIZE, MEDIUM_ZONE_SIZE and
   * LARGE_ZONE_SIZE constants; any other value gives 0.
   *
   * @param string $alert_area_size Size code.
   * @return mixed The matching constant's value, or 0 for an unknown code.
   */
  function alert_size_to_meters($alert_area_size){
      //switch compares loosely, like the == it replaces
      switch ($alert_area_size) {
          case "s":
              return SMALL_ZONE_SIZE;
          case "m":
              return MEDIUM_ZONE_SIZE;
          case "l":
              return LARGE_ZONE_SIZE;
          default:
              return 0;
      }
  }
  271. //Send a text email
  /**
   * Send a plain-text (iso-8859-1) email through mail().
   *
   * @param string $to         Recipient address.
   * @param string $from_name  Display name for the From header.
   * @param string $from_email Address for the From header.
   * @param string $subject    Subject line.
   * @param string $body       Message text.
   */
  function send_text_email($to, $from_name, $from_email, $subject, $body){
      $header_lines = array(
          'MIME-Version: 1.0',
          'Content-type: text/plain; charset=iso-8859-1',
          'From: ' . $from_name . ' <' . $from_email . '>',
      );
      //every header line, including the last, ends with CRLF
      $headers = implode("\r\n", $header_lines) . "\r\n";
      mail($to, $subject, $body, $headers);
  }
  278. // Format a date to mysql format
  /**
   * Format a unix timestamp as a MySQL datetime string ("YYYY-MM-DD HH:MM:SS").
   *
   * @param int $date Unix timestamp.
   * @return string The formatted date, in the default timezone.
   */
  function mysql_date($date){
      //the format previously read "H::i:s", which put two colons after the hour
      return date("Y-m-d H:i:s", $date);
  }
  /**
   * Fetch a page, trying up to three times until a non-empty result comes back.
   *
   * The SCRAPE_METHOD constant selects the backend: "PEAR" uses scrape_page_pear(),
   * anything else uses scrape_page_curl().
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method handed to the backend.
   * @return mixed The last backend result; this may still be empty/false if every attempt failed.
   */
  function safe_scrape_page($url, $method = "GET"){
      $page = "";
      $attempts_left = 3;
      //retry only while the previous attempt produced nothing
      while ($attempts_left-- > 0 && $page == false){
          $page = (SCRAPE_METHOD == "PEAR")
              ? scrape_page_pear($url, $method)
              : scrape_page_curl($url, $method);
      }
      return $page;
  }
  /**
   * Fetch a page with PEAR's HTTP_Request and return the response body.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method passed to HTTP_Request as its "method" option.
   * @return mixed Whatever getResponseBody() returns after the request has been sent.
   */
  function scrape_page_pear($url, $method = "GET"){
      $options = array("method" => $method);
      $http = new HTTP_Request($url, $options);
      $http->sendRequest();
      return $http->getResponseBody();
  }
  /**
   * Fetch a page with cURL, following redirects, and return the response body.
   *
   * safe_scrape_page() passes a $method to both backends; it used to be silently
   * dropped here, so a "POST" request went out as GET. It is now honoured, in line
   * with scrape_page_pear(). Existing one-argument calls behave as before.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method; "POST" (any case) sends an empty-bodied POST,
   *                       anything else is sent as a normal GET.
   * @return string|false The response body, or false on failure.
   */
  function scrape_page_curl($url, $method = "GET") {
      $ch = curl_init($url);
      if ($ch === false){
          return false;
      }
      curl_setopt($ch,CURLOPT_RETURNTRANSFER,TRUE);
      curl_setopt($ch,CURLOPT_FOLLOWLOCATION,TRUE);
      if (strtoupper((string) $method) === "POST"){
          curl_setopt($ch,CURLOPT_POST,TRUE);
          //no form data is supplied by callers, so the body is empty
          curl_setopt($ch,CURLOPT_POSTFIELDS,"");
      }
      return curl_exec($ch);
  }
  /**
   * Render a list of applications through the "xml.tpl" Smarty template.
   *
   * The template receives "authority_name" and "authority_short_name", plus
   * "applications" when the list is not empty.
   *
   * @param array  $applications         Applications to output.
   * @param string $authority_name       Full authority name.
   * @param string $authority_short_name Short authority name.
   */
  function display_applications($applications, $authority_name, $authority_short_name){
      //set up smarty
      $view = new Smarty;
      $view->force_compile = true;
      $view->compile_dir = SMARTY_COMPILE_DIRECTORY;
      $view->template_dir = "../templates";

      //template variables
      $view->assign("authority_name", $authority_name);
      $view->assign("authority_short_name", $authority_short_name);
      $have_applications = count($applications) > 0;
      if ($have_applications){
          $view->assign("applications", $applications);
      }

      $view->display("xml.tpl");
  }
  /**
   * Build a unix timestamp (midnight) from the "day", "month" and "year" GET parameters.
   *
   * Each parameter must be present and consist of one to four digits; otherwise
   * throw_error() is called with a message naming the offending parameter. A missing
   * month used to be reported as a missing year.
   *
   * @return int|false Result of mktime(0, 0, 0, month, day, year).
   */
  function get_time_from_get(){
      $parts = array();
      foreach (array('day', 'month', 'year') as $field){
          if (!isset($_GET[$field])){
              throw_error("No " . $field . " set in get string");
          }
          //the values come straight from the query string, so make sure they are plain numbers
          $value = is_string($_GET[$field]) ? trim($_GET[$field]) : "";
          if (!preg_match('/^[0-9]{1,4}\z/', $value)){
              throw_error("Invalid " . $field . " set in get string");
          }
          $parts[$field] = (int) $value;
      }
      return mktime(0,0,0,$parts['month'],$parts['day'],$parts['year']);
  }
  /**
   * Raise an Exception carrying the given message.
   *
   * @param string $message Text for the exception.
   * @throws Exception Always.
   */
  function throw_error($message){
      $error = new Exception($message);
      throw $error;
  }
  /**
   * Send a Location header pointing at the given URL.
   *
   * Only the header is sent; the script keeps running afterwards.
   *
   * @param string $url Redirect target.
   */
  function redirect ($url){
      $location_header = "Location: " . $url;
      header($location_header);
  }
  346. ?>