Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper_support.php 14 KiB

17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
17 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  /**
   * Scrape a search results page whose rows look like
   * "<tr><th>N</th>...<td>ref<td>date<td>address..." and build Application
   * objects from them.
   *
   * For every row the first three <td> cells are read as council reference,
   * date received and address. The case number is taken from the first
   * "caseno=" parameter in the row, and is appended to $info_url_base and
   * $comment_url_base to form the application's URLs. The description is read
   * from the "desc" textarea on the info page.
   *
   * Only applications whose address contains a valid postcode are returned.
   * The info page is fetched only for those applications, since the
   * description of a discarded application is never used.
   *
   * @param string $search_url       URL of the search results page
   * @param string $info_url_base    prefix the case number is appended to for the info page
   * @param string $comment_url_base prefix the case number is appended to for the comment page
   * @return array list of Application objects
   */
  function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
      $applications = array();

      $application_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
      $detail_pattern = "/<td>([^<])*/";
      $casenumber_pattern = "/caseno=([^&]*)/";
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
      $full_detail_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';

      //grab the page and remove line breaks so a result row is matched as one string
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n", "", $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);

      foreach ($application_matches[0] as $application_match){

          //collect the text of each <td> cell in this row
          preg_match_all($detail_pattern, $application_match, $detail_matches, PREG_PATTERN_ORDER);
          $cells = array();
          foreach ($detail_matches[0] as $cell_match){
              $cells[] = str_replace("<td>", "", $cell_match);
          }

          //match the basic details; a row with fewer than three cells yields empty strings
          $application = new Application();
          $application->council_reference = isset($cells[0]) ? $cells[0] : "";
          $application->date_received = isset($cells[1]) ? $cells[1] : "";
          $application->address = isset($cells[2]) ? $cells[2] : "";

          //match case number
          $case_number = "";
          if (preg_match($casenumber_pattern, $application_match, $casenumber_matches)){
              $case_number = str_replace("caseno=", "", $casenumber_matches[0]);
          }
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          //Get the postcode
          if (preg_match($postcode_pattern, $application->address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //only keep it if we have a postcode (bit useless otherwise); checking
          //before the detail request avoids fetching pages for discarded rows
          if (!isset($application->postcode) || !is_postcode($application->postcode)){
              continue;
          }

          //get full details
          $details_html = safe_scrape_page($info_url_base . $case_number);
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[1];
          }

          $applications[] = $application;
      }

      return $applications;
  }
  /**
   * Scrape a search results table made of four-cell rows
   * (date, linked reference, address, status) and build Application objects
   * from the rows whose status cell is exactly "Current".
   *
   * The linked reference is used both as the council reference and as the
   * case number appended to $info_url_base / $comment_url_base. The
   * description is read from the info page using a pattern selected by
   * $detail_mode.
   *
   * Only applications whose address contains a valid postcode are returned.
   * The info page is fetched only for those applications.
   *
   * @param string $search_url       URL of the search results page
   * @param string $info_url_base    prefix the case number is appended to for the info page
   * @param string $comment_url_base prefix the case number is appended to for the comment page
   * @param int    $detail_mode      1 or 2, selecting which "Development:" markup to match;
   *                                 any other value leaves the description unset
   * @return array list of Application objects
   */
  function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
      $applications = array();

      $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //Sites vary a tiny bit in their html, so the description pattern depends on the mode.
      //Mode 2 is tested first so that a value loosely equal to both 1 and 2 still ends up
      //with the mode 2 pattern, as it did when the two checks were independent.
      $full_detail_pattern = null;
      if ($detail_mode == 2){
          $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
      }elseif ($detail_mode == 1){
          $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
      }

      //grab the page and remove line breaks
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n", "", $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);

      foreach ($application_matches as $application_match){
          if ($application_match[4] != 'Current') { continue; }

          $case_number = $application_match[2];

          //match the basic details
          $application = new Application();
          $application->council_reference = $case_number;
          $application->date_received = $application_match[1];
          $application->address = $application_match[3];

          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          //Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          //Get the postcode
          if (preg_match($postcode_pattern, $application->address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //only keep it if we have a postcode (bit useless otherwise); checking
          //before the detail request avoids fetching pages for discarded rows
          if (!isset($application->postcode) || !is_postcode($application->postcode)){
              continue;
          }

          //get full details; skipped when $detail_mode did not select a pattern,
          //because preg_match cannot run without one
          if ($full_detail_pattern !== null){
              $details_html = safe_scrape_page($info_url_base . $case_number);
              $details_html = str_replace("\r\n", "", $details_html);
              if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
                  $application->description = $full_detail_matches[1];
              }
          }

          //x/y and lat/lon are not looked up here: the lookup slowed the scrape down
          //and is done later when the application is parsed (Richard)
          $applications[] = $application;
      }

      return $applications;
  }
  126. // Council-specific scrapers
  /**
   * Scrape the Islington search results page and build Application objects.
   *
   * Each result block is located with $application_pattern. The council
   * reference is the text following 'Search Results</a>">' in that block and
   * is appended to $info_url_base / $comment_url_base. Description
   * ("Proposal:") and address ("Main location:") are read from the info page,
   * and the postcode is extracted from the address.
   *
   * Result blocks without a reference are logged and skipped, so no request
   * is made for an empty reference. Only applications with a valid postcode
   * are returned.
   *
   * @param string $search_url       URL of the search results page
   * @param string $info_url_base    prefix the reference is appended to for the info page
   * @param string $comment_url_base prefix the reference is appended to for the comment page
   * @return array list of Application objects
   */
  function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
      $applications = array();

      $application_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
      $reference_pattern = '/Search Results<\/a>">([^<]*)/';
      $full_detail_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $address_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      //grab the page
      $html = safe_scrape_page($search_url);
      preg_match_all($application_pattern, $html, $application_matches, PREG_PATTERN_ORDER);

      foreach ($application_matches[0] as $application_match){
          $application_string = str_replace("\n", "", $application_match);

          //match the application number; the capture cannot contain "<", so it is
          //exactly the text that follows the 'Search Results</a>">' marker
          if (!preg_match($reference_pattern, $application_string, $reference_matches)){
              error_log("Unable to find reference for an application at " . $search_url);
              continue;
          }

          $application = new Application();
          $application->council_reference = $reference_matches[1];

          //Comment and info urls
          $application->info_url = $info_url_base . $application->council_reference;
          $application->comment_url = $comment_url_base . $application->council_reference;

          //get full details
          $details_html = safe_scrape_page($info_url_base . $application->council_reference);
          $details_html = str_replace("\r\n", "", $details_html);

          //Details
          if (preg_match($full_detail_pattern, $details_html, $full_detail_matches)){
              $application->description = $full_detail_matches[2];
          }

          //Address
          if (preg_match($address_pattern, $details_html, $address_matches)){
              $application->address = $address_matches[2];
          }

          //postcode; the address may never have been set if the info page did not match
          $address = isset($application->address) ? $application->address : "";
          if (preg_match($postcode_pattern, $address, $postcode_matches)){
              $application->postcode = $postcode_matches[0];
          }

          //only add it if we have a postcode (bit useless otherwise)
          if (isset($application->postcode) && is_postcode($application->postcode)){
              $applications[] = $application;
          }
      }

      return $applications;
  }
  174. //validate postcode
  /**
   * Check whether a string has the shape of a UK postcode.
   *
   * Spaces are removed first; the remainder must be one or two letters, one or
   * two digits, an optional letter, one digit and two letters. Matching is
   * case-sensitive only in that both upper and lower case letters are accepted.
   *
   * Uses preg_match because ereg() is no longer available from PHP 7 onwards.
   *
   * @param string $postcode candidate postcode, with or without spaces
   * @return bool true when the shape matches
   */
  function is_postcode ($postcode){
      $postcode = str_replace(" ", "", (string) $postcode);
      //D modifier: "$" must match at the very end, not before a trailing newline
      return preg_match('/^[a-zA-Z]{1,2}[0-9]{1,2}[a-zA-Z]{0,1}[0-9]{1}[a-zA-Z]{2}$/D', $postcode) === 1;
  }
  183. //Tiny url
  /**
   * Shorten a URL through the tinyurl.com API when it is at least $length
   * characters long.
   *
   * The URL is URL-encoded before being placed in the API query string, so a
   * target that has its own query parameters is passed through intact. If the
   * request fails or returns nothing, the original URL is returned.
   *
   * @param string $url    URL to shorten
   * @param int    $length minimum length at which shortening is attempted
   * @return string the shortened URL, or $url unchanged
   */
  function tiny_url($url,$length=30){
      if (strlen($url) < $length){
          return $url;
      }

      //best effort: warnings are suppressed on purpose and failure falls back to the long url
      $response = @file_get_contents("http://tinyurl.com/api-create.php?url=" . urlencode($url));
      if ($response === false){
          return $url;
      }

      $tinyurl = trim($response);
      return $tinyurl === "" ? $url : $tinyurl;
  }
  198. //Google maps url
  /**
   * Build a maps.google.co.uk search URL for a postcode.
   *
   * Spaces in the postcode become "+" and the result is lower-cased.
   *
   * @param string $postcode postcode to search for
   * @param int    $zoom     value of the "z" query parameter
   * @return string the map URL
   */
  function googlemap_url_from_postcode($postcode, $zoom = 15){
      $query = str_replace(" ", "+", $postcode);
      $query = strtolower($query);

      return "http://maps.google.co.uk/maps?q=" . $query . "&z=" . $zoom;
  }
  203. //postcode to location
  /**
   * Look up a postcode on streetmap.co.uk and read the _LocationX / _LocationY
   * JavaScript variables out of the returned page.
   *
   * @param string $postcode postcode to look up
   * @return array two elements: index 0 is x, index 1 is y; each stays 0 when
   *               its variable is not found in the page
   */
  function postcode_to_location($postcode){
      $query = str_replace(" ", "+", strtolower($postcode));
      $html = file_get_contents("http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $query);

      //for each result index: the pattern to find and the prefix to strip from the match
      $lookups = array(
          0 => array("/var _LocationX=\d*;/", 'var _LocationX='),
          1 => array("/var _LocationY=\d*;/", "var _LocationY="),
      );

      $location = array(0, 0);
      foreach ($lookups as $index => $lookup){
          if (preg_match($lookup[0], $html, $found)){
              //drop the variable prefix and the trailing semicolon, leaving the digits
              $location[$index] = str_replace(array($lookup[1], ";"), "", $found[0]);
          }
      }

      return $location;
  }
  /**
   * Check whether a string has the shape local@label.rest.
   *
   * The local part and the part after the first domain dot may contain
   * letters, digits, dots and the characters -!#$%&'*+\/=?^_`{|}~ ; the first
   * domain label accepts the same set without the dot.
   *
   * Uses preg_match because ereg() is no longer available from PHP 7 onwards;
   * the character classes are the ones the ereg expression used.
   *
   * @param string $string candidate e-mail address
   * @return bool true when the shape matches
   */
  function valid_email ($string) {
      //class contents without the dot; "\\\\" is a literal backslash, "\/" a literal slash
      $atom = '-!#$%&\'*+\\\\\/0-9=?A-Z^_`a-z{|}~';

      //D modifier: "$" must match at the very end, not before a trailing newline
      $pattern = '/^[' . $atom . '.]+@[' . $atom . ']+\.[' . $atom . '.]+$/D';

      return preg_match($pattern, (string) $string) === 1;
  }
  /**
   * Translate an alert area size code into a zone size.
   *
   * "s", "m" and "l" map to the SMALL_ZONE_SIZE, MEDIUM_ZONE_SIZE and
   * LARGE_ZONE_SIZE constants respectively; anything else gives 0.
   *
   * @param string $alert_area_size size code
   * @return mixed the value of the matching constant, or 0
   */
  function alert_size_to_meters($alert_area_size){
      //switch compares loosely, like the == checks it stands in for
      switch ($alert_area_size){
          case "s":
              return SMALL_ZONE_SIZE;
          case "m":
              return MEDIUM_ZONE_SIZE;
          case "l":
              return LARGE_ZONE_SIZE;
          default:
              return 0;
      }
  }
  254. //Send a text email
  /**
   * Send a plain-text (iso-8859-1) e-mail through mail().
   *
   * @param string $to         recipient address
   * @param string $from_name  display name used in the From header
   * @param string $from_email address used in the From header
   * @param string $subject    message subject
   * @param string $body       message body
   */
  function send_text_email($to, $from_name, $from_email, $subject, $body){
      $header_lines = array(
          'MIME-Version: 1.0',
          'Content-type: text/plain; charset=iso-8859-1',
          'From: ' . $from_name . ' <' . $from_email . '>',
      );

      //every header line, including the last, is terminated by CRLF
      $headers = implode("\r\n", $header_lines) . "\r\n";

      mail($to, $subject, $body, $headers);
  }
  261. // Format a date to mysql format
  /**
   * Format a Unix timestamp as "YYYY-MM-DD HH:MM:SS".
   *
   * The time is rendered in PHP's current default timezone, as date() does.
   *
   * @param int $date Unix timestamp
   * @return string formatted date and time
   */
  function mysql_date($date){
      //single colons only: a doubled colon after the hour is not a valid datetime
      return date("Y-m-d H:i:s", $date);
  }
  /**
   * Fetch a page, trying up to three times while the result is empty or false.
   *
   * The URL is written to the error log first. The fetch goes through
   * scrape_page_pear() when SCRAPE_METHOD is "PEAR", otherwise through
   * scrape_page_curl().
   *
   * @param string $url    URL to fetch
   * @param string $method HTTP method passed on to the fetch function
   * @return mixed the page body, or the last empty/false result if every attempt failed
   */
  function safe_scrape_page($url, $method = "GET"){
      error_log(print_r($url, true));

      $page = "";
      $attempts_left = 3;

      //loose comparison on purpose: "", "0", null and false all count as "nothing fetched"
      while ($attempts_left > 0 && $page == false){
          $page = (SCRAPE_METHOD == "PEAR")
              ? scrape_page_pear($url, $method)
              : scrape_page_curl($url, $method);
          $attempts_left--;
      }

      return $page;
  }
  /**
   * Fetch a page with the PEAR HTTP_Request class.
   *
   * @param string $url    URL to fetch
   * @param string $method HTTP method given to HTTP_Request
   * @return mixed whatever getResponseBody() returns after the request is sent
   */
  function scrape_page_pear($url, $method = "GET"){
      $options = array("method" => $method);

      $http = new HTTP_Request($url, $options);
      $http->sendRequest();

      return $http->getResponseBody();
  }
  /**
   * Fetch a page with cURL, following redirects.
   *
   * @param string $url URL to fetch
   * @return mixed the result of curl_exec() with CURLOPT_RETURNTRANSFER enabled
   */
  function scrape_page_curl($url) {
      $handle = curl_init($url);

      curl_setopt_array($handle, array(
          CURLOPT_RETURNTRANSFER => TRUE,
          CURLOPT_FOLLOWLOCATION => TRUE,
      ));

      return curl_exec($handle);
  }
  /**
   * Render the applications through the "xml.tpl" Smarty template.
   *
   * The template always receives authority_name and authority_short_name;
   * "applications" is assigned only when the list is not empty.
   *
   * @param array  $applications         applications to output
   * @param string $authority_name       assigned to the template as authority_name
   * @param string $authority_short_name assigned to the template as authority_short_name
   */
  function display_applications($applications, $authority_name, $authority_short_name){
      //smarty setup
      $view = new Smarty;
      $view->force_compile = true;
      $view->compile_dir = SMARTY_COMPILE_DIRECTORY;
      $view->template_dir = "../templates";

      $template_vars = array(
          "authority_name" => $authority_name,
          "authority_short_name" => $authority_short_name,
      );
      foreach ($template_vars as $name => $value){
          $view->assign($name, $value);
      }

      if (count($applications) > 0){
          $view->assign("applications", $applications);
      }

      $view->display("xml.tpl");
  }
  /**
   * Build a midnight timestamp from the day, month and year GET parameters.
   *
   * The parameters are checked in the order day, month, year; the first one
   * that is missing is reported through throw_error() with a message naming
   * that parameter.
   *
   * @return int result of mktime(0, 0, 0, month, day, year)
   */
  function get_time_from_get(){
      $parts = array();

      foreach (array('day', 'month', 'year') as $field){
          if (!isset($_GET[$field])){
              //the message names the field that is actually missing
              throw_error("No " . $field . " set in get string");
          }
          $parts[$field] = $_GET[$field];
      }

      return mktime(0, 0, 0, $parts['month'], $parts['day'], $parts['year']);
  }
  /**
   * Raise an Exception carrying the given message.
   *
   * @param string $message text of the exception
   * @throws Exception always
   */
  function throw_error($message){
      $error = new Exception($message);
      throw $error;
  }
  /**
   * Send a Location header pointing at $url.
   *
   * Only the header is sent; execution continues in the caller afterwards.
   *
   * @param string $url target of the redirect
   */
  function redirect ($url){
      $location_header = "Location: " . $url;
      header($location_header);
  }
  330. ?>