Automatically exported from code.google.com/p/planningalerts
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

scraper_support.php 13 KiB

18 years ago
18 years ago
18 years ago
18 years ago
18 years ago
18 years ago
18 years ago
18 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. <?php
  2. //Includes
  3. require_once('config.php');
  4. require_once('application.php');
  5. require_once ("PEAR/HTTP/Request.php");
  6. require_once('phpcoord.php');
  7. //Generic scrapers
  /**
   * Scrape planning applications from a "PublicAccess"-style search results page.
   *
   * Every result row found by the row pattern is turned into an Application:
   * the first three <td> cells become the council reference, the date received
   * and the address. The case number is read from a "caseno=" parameter in the
   * row and is appended to the two URL bases; the detail page is then fetched
   * to read the description. Rows without a case number are logged and skipped,
   * and only applications whose postcode passes is_postcode() are returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page.
   * @param string $comment_url_base Prefix the case number is appended to for the comment page.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_publicaccess ($search_url, $info_url_base, $comment_url_base){
      $row_pattern = "/<tr><th>([0-9]*)<\/th>([^;]*)([^<]*)/";
      $cell_pattern = "/<td>([^<])*/";
      $caseno_pattern = "/caseno=([^&]*)/";
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";
      $description_pattern = '/id="desc" rows="[1-9]" cols="80" class="cDetailInput">([^<]*)/';

      $found = array();

      // Fetch the results page and drop Windows line breaks so rows match on one line
      $page = str_replace("\r\n", "", safe_scrape_page($search_url));
      preg_match_all($row_pattern, $page, $rows, PREG_PATTERN_ORDER);

      foreach ($rows[0] as $row){
          preg_match_all($cell_pattern, $row, $cells, PREG_PATTERN_ORDER);

          // Basic details: the first three cells, with the opening tag stripped
          $application = new Application();
          $application->council_reference = str_replace("<td>", "", $cells[0][0]);
          $application->date_received = str_replace("<td>", "", $cells[0][1]);
          $application->address = str_replace("<td>", "", $cells[0][2]);

          // Case number, taken from the "caseno=" link parameter
          preg_match($caseno_pattern, $row, $caseno_hit);
          $case_number = "";
          if (sizeof($caseno_hit) > 0){
              $case_number = str_replace("caseno=", "", $caseno_hit[0]);
          }

          // Nothing more can be looked up without a case number
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          // Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          // Postcode, if the address contains one
          preg_match($postcode_pattern, $application->address, $postcode_hit);
          if (isset($postcode_hit[0])){
              $application->postcode = $postcode_hit[0];
          }

          // Description: everything after the first ">" of the matched textarea tag
          $detail_page = safe_scrape_page($info_url_base . $case_number);
          preg_match($description_pattern, $detail_page, $description_hit);
          if (isset($description_hit[0])){
              $matched = $description_hit[0];
              $application->description = substr($matched, strpos($matched, ">") + 1);
          }

          // Only keep it if we have a postcode (bit useless otherwise)
          if (is_postcode($application->postcode)){
              array_push($found, $application);
          }
      }

      return $found;
  }
  /**
   * Scrape planning applications from a "WAM"-style search results table.
   *
   * Each table row yields, in order: date received, reference (inside a link),
   * address and status. Rows whose status is not 'Current' are skipped. The
   * reference doubles as the case number that is appended to the two URL bases.
   * The detail page is fetched to read the description, and only applications
   * whose postcode passes is_postcode() are returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the case number is appended to for the detail page.
   * @param string $comment_url_base Prefix the case number is appended to for the comment page.
   * @param int    $detail_mode      Which detail-page layout to expect (1 or 2). Any other
   *                                 value is logged and the description is left unset.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_wam ($search_url, $info_url_base, $comment_url_base, $detail_mode = 1){
      $applications = array();
      $application_pattern = '/<tr><td class=[^>]*>([^<]*)<\/td><td class=[^>]*><a href="[^"]*">([^<]*)<\/a><\/td><td class=[^>]*>([^<]*)<\/td><td class=[^>]*>([^<]*)<\/td>/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      // Sites vary a tiny bit in their detail-page HTML, so the description
      // pattern depends on the mode. It is chosen once, before the loop.
      // Mode 2 is tested first so that a value loosely equal to both 1 and 2
      // keeps resolving to the mode 2 pattern, as it did previously.
      // Previously an unknown mode left the pattern undefined and preg_match()
      // was called without a usable pattern for every row.
      $full_detail_pattern = null;
      if ($detail_mode == 2){
          $full_detail_pattern = '/Development:<\/td><td>([^<]*)/';
      }elseif ($detail_mode == 1){
          $full_detail_pattern = '/Development:<.*<td colspan="3">([^<]*)<\/td>/';
      }else{
          error_log("Unsupported detail mode for applications at " . $search_url);
      }

      // Grab the page and strip Windows line breaks
      $html = safe_scrape_page($search_url);
      $html = str_replace("\r\n", "", $html);

      preg_match_all($application_pattern, $html, $application_matches, PREG_SET_ORDER);

      foreach ($application_matches as $application_match){
          // Only current applications are of interest
          if ($application_match[4] != 'Current') { continue; }

          $application = new Application();

          // Match the basic details
          $application->council_reference = $application_match[2];
          $case_number = $application_match[2];
          $application->date_received = $application_match[1];
          $application->address = $application_match[3];

          // Without a case number there is nothing to look up
          if ($case_number == ""){
              error_log("Unable to find case number for an application at " . $search_url);
              continue;
          }

          // Comment and info urls
          $application->info_url = $info_url_base . $case_number;
          $application->comment_url = $comment_url_base . $case_number;

          // Get the postcode
          preg_match($postcode_pattern, $application->address, $postcode_matches);
          if (isset($postcode_matches[0])){
              $application->postcode = $postcode_matches[0];
          }

          // Get the description from the detail page. The page is only fetched
          // when there is a pattern to apply to it.
          if ($full_detail_pattern !== null){
              $details_html = safe_scrape_page($info_url_base . $case_number);
              $details_html = str_replace("\r\n", "", $details_html);
              preg_match($full_detail_pattern, $details_html, $full_detail_matches);
              if (isset($full_detail_matches[1])){
                  $application->description = $full_detail_matches[1];
              }
          }

          // Only add it if we have a postcode (bit useless otherwise).
          // Coordinates are deliberately not resolved here: the lookup was
          // slowing down the scrape and is done when the application is parsed.
          if (is_postcode($application->postcode)){
              array_push($applications, $application);
          }
      }

      return $applications;
  }
  126. // Council specific scrapers
  /**
   * Scrape planning applications from Islington's search results page.
   *
   * Each block matched by the results pattern supplies the council reference
   * (the text following the 'Search Results</a>">' marker). The reference is
   * appended to the two URL bases, and the detail page is fetched to read the
   * proposal text and the main location. Only applications whose address
   * contains a postcode accepted by is_postcode() are returned.
   *
   * @param string $search_url       URL of the search results page.
   * @param string $info_url_base    Prefix the reference is appended to for the detail page.
   * @param string $comment_url_base Prefix the reference is appended to for the comment page.
   * @return array Application objects that have a valid postcode.
   */
  function scrape_applications_islington ($search_url, $info_url_base, $comment_url_base){
      $result_pattern = '/<TR>([^<]*)<TD class="lg" valign="top" >([^<]*)<a href([^<]*)<a href=wphappcriteria.display>Search Criteria(.*)([^<]*)<(.*)>([^<]*)<TD class="lg" >([^<]*)<\/TD>([^<]*)<TD class="lg" >([^<]*)<INPUT TYPE=HIDDEN NAME([^>]*)([^<]*)/';
      $reference_pattern = '/Search Results<\/a>">([^<]*)/';
      $proposal_pattern = '/Proposal:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $location_pattern = '/Main location:<\/label><\/td>([^<]*)<td colspan="3">([^<]*)/';
      $postcode_pattern = "/[A-Z][A-Z]?[0-9][A-Z0-9]? ?[0-9][ABDEFGHJLNPQRSTUWXYZ]{2}/";

      $found = array();

      // Grab the page
      $page = safe_scrape_page($search_url);
      preg_match_all($result_pattern, $page, $results, PREG_PATTERN_ORDER);

      foreach ($results[0] as $result){
          // Flatten the block onto one line before looking for the reference
          $flattened = str_replace("\n", "", $result);
          preg_match($reference_pattern, $flattened, $reference_hit);

          // The application number is the matched text minus its leading marker
          $application = new Application();
          $application->council_reference = str_replace('Search Results</a>">', "", $reference_hit[0]);

          // Comment and info urls
          $application->info_url = $info_url_base . $application->council_reference;
          $application->comment_url = $comment_url_base . $application->council_reference;

          // Detail page, with Windows line breaks removed
          $detail_page = str_replace(
              "\r\n",
              "",
              safe_scrape_page($info_url_base . $application->council_reference)
          );

          // Description
          preg_match($proposal_pattern, $detail_page, $proposal_hit);
          if (isset($proposal_hit[2])){
              $application->description = $proposal_hit[2];
          }

          // Address
          preg_match($location_pattern, $detail_page, $location_hit);
          if (isset($location_hit[2])){
              $application->address = $location_hit[2];
          }

          // Postcode
          preg_match($postcode_pattern, $application->address, $postcode_hit);
          if (isset($postcode_hit[0])){
              $application->postcode = $postcode_hit[0];
          }

          // Only keep it if we have a postcode (bit useless otherwise)
          if (is_postcode($application->postcode)){
              array_push($found, $application);
          }
      }

      return $found;
  }
  174. //validate postcode
  /**
   * Check whether a string has the shape of a UK postcode.
   *
   * Spaces are removed first. What remains must be, case-insensitively:
   * 1-2 letters, 1-2 digits, an optional letter, one digit, then 2 letters.
   *
   * Uses preg_match() because ereg() no longer exists as of PHP 7. The D
   * modifier keeps "$" from matching before a trailing newline, matching the
   * behaviour of the previous POSIX expression.
   *
   * @param string $postcode Candidate postcode; null is treated as an empty string.
   * @return bool True when the value matches the postcode shape.
   */
  function is_postcode ($postcode){
      $compact = str_replace(" ", "", (string) $postcode);
      return preg_match('/^[a-z]{1,2}[0-9]{1,2}[a-z]?[0-9][a-z]{2}$/iD', $compact) === 1;
  }
  183. //Tiny url
  /**
   * Shorten a URL through the tinyurl.com API when it is at least $length
   * characters long.
   *
   * Shorter URLs are returned untouched without any network access. If the
   * API cannot be reached or returns nothing, the original URL is returned.
   *
   * @param string $url    URL to shorten.
   * @param int    $length Minimum length at which shortening is attempted.
   * @return string The shortened URL, or $url itself as a fallback.
   */
  function tiny_url($url,$length=30){
      if (strlen($url) < $length){
          return $url;
      }

      // The long URL must be encoded, otherwise any "&" or "#" it contains is
      // read as part of the API request rather than of the URL being shortened.
      // "@" is deliberate: a failed lookup is handled by the fallback below.
      $response = @file("http://tinyurl.com/api-create.php?url=" . urlencode($url));
      if (!is_array($response)){
          return $url;
      }

      $tinyurl = join('', $response);

      // An empty reply is no use to the caller, so keep the original URL
      if ($tinyurl === ''){
          return $url;
      }
      return $tinyurl;
  }
  198. //Google maps url
  /**
   * Build a Google Maps search URL for a postcode.
   *
   * Spaces in the postcode become "+" and the result is lower-cased before
   * being placed in the "q" parameter; the zoom level goes in "z".
   *
   * @param string $postcode Postcode to search for.
   * @param int    $zoom     Zoom level to request.
   * @return string The map URL.
   */
  function googlemap_url_from_postcode($postcode, $zoom = 15){
      $query = str_replace(" ", "+", $postcode);
      $query = strtolower($query);

      return "http://maps.google.com/maps?q=" . $query . "&z=" . $zoom;
  }
  203. //postcode to location
  /**
   * Look up the X/Y location of a postcode on streetmap.co.uk.
   *
   * The postcode is lower-cased, its spaces are turned into "+", and the
   * search page is fetched. The values are read from the page's
   * "var _LocationX=...;" and "var _LocationY=...;" assignments.
   *
   * @param string $postcode Postcode to look up.
   * @return array Two elements: index 0 is X, index 1 is Y. A value that is
   *               not found on the page stays at 0; a found value is the
   *               digit string taken from the page.
   */
  function postcode_to_location($postcode){
      $query = str_replace(" ", "+", strtolower($postcode));
      $page = file_get_contents("http://www.streetmap.co.uk/newsearch.srf?type=Postcode&name=" . $query);

      $location = array(0, 0);

      // X: strip the assignment text around the digits
      preg_match("/var _LocationX=\d*;/", $page, $x_hit);
      if (sizeof($x_hit) > 0){
          $location[0] = str_replace(array('var _LocationX=', ";"), "", $x_hit[0]);
      }

      // Y: same treatment
      preg_match("/var _LocationY=\d*;/", $page, $y_hit);
      if (sizeof($y_hit) > 0){
          $location[1] = str_replace(array("var _LocationY=", ";"), "", $y_hit[0]);
      }

      return $location;
  }
  /**
   * Check whether a string has the shape of an email address.
   *
   * The accepted shape is: one or more allowed characters, "@", one or more
   * allowed characters excluding ".", a literal ".", then one or more allowed
   * characters. The allowed set is letters, digits and
   * - ! # $ % & ' * + \ / = ? ^ _ ` { | } ~ (plus "." where noted).
   *
   * Uses preg_match() because ereg() no longer exists as of PHP 7. The
   * character classes are carried over unchanged from the previous POSIX
   * expression, in which a backslash inside brackets is an ordinary
   * character; that is why a backslash is still accepted here. The D modifier
   * keeps "$" from matching before a trailing newline.
   *
   * @param string $string Candidate email address.
   * @return bool True when the value matches the expected shape.
   */
  function valid_email ($string) {
      // Allowed characters including "." (local part and the tail of the domain)
      $with_dot = '[-!#$%&\'*+\\\\.\\/0-9=?A-Z^_`a-z{|}~]+';
      // Allowed characters excluding "." (the domain section right after "@")
      $without_dot = '[-!#$%&\'*+\\\\\\/0-9=?A-Z^_`a-z{|}~]+';

      $pattern = '/^' . $with_dot . '@' . $without_dot . '\\.' . $with_dot . '$/D';

      return preg_match($pattern, (string) $string) === 1;
  }
  /**
   * Translate an alert area size code into the matching zone size constant.
   *
   * "s", "m" and "l" map to SMALL_ZONE_SIZE, MEDIUM_ZONE_SIZE and
   * LARGE_ZONE_SIZE respectively (constants defined outside this function).
   * Comparison is loose, as with "==".
   *
   * @param string $alert_area_size Size code.
   * @return mixed The value of the matching constant, or 0 for any other code.
   */
  function alert_size_to_meters($alert_area_size){
      switch ($alert_area_size){
          case "s":
              return SMALL_ZONE_SIZE;
          case "m":
              return MEDIUM_ZONE_SIZE;
          case "l":
              return LARGE_ZONE_SIZE;
      }

      // Unrecognised code
      return 0;
  }
  254. //Send a text email
  /**
   * Send a plain-text email through mail().
   *
   * The message is sent with MIME-Version, Content-type (text/plain,
   * iso-8859-1) and From headers. The result of mail() is not returned.
   *
   * @param string $to         Recipient address.
   * @param string $from_name  Display name for the From header.
   * @param string $from_email Address for the From header.
   * @param string $subject    Subject line.
   * @param string $body       Message text.
   * @return void
   */
  function send_text_email($to, $from_name, $from_email, $subject, $body){
      $mime_header = 'MIME-Version: 1.0' . "\r\n";
      $type_header = 'Content-type: text/plain; charset=iso-8859-1' . "\r\n";
      $from_header = 'From: ' . $from_name . ' <' . $from_email . ">\r\n";

      mail($to, $subject, $body, $mime_header . $type_header . $from_header);
  }
  261. // Format a date to mysql format
  /**
   * Format a Unix timestamp as "YYYY-MM-DD HH:MM:SS".
   *
   * The previous format string had a doubled colon after the hour, which
   * produced values such as "2007-01-01 12::30:00".
   *
   * @param int $date Unix timestamp.
   * @return string The formatted date, as produced by date().
   */
  function mysql_date($date){
      return date("Y-m-d H:i:s", $date);
  }
  /**
   * Fetch a page, trying up to three times.
   *
   * The URL is written to the error log first. A fetch is attempted while the
   * result so far is loosely equal to false (for example an empty string or
   * false itself). The fetcher is scrape_page_pear() when SCRAPE_METHOD is
   * "PEAR", otherwise scrape_page_curl().
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method handed on to the fetcher.
   * @return mixed Whatever the last attempt returned; "" if nothing was fetched.
   */
  function safe_scrape_page($url, $method = "GET"){
      error_log(print_r($url, true));

      $page = "";
      $attempts_left = 3;

      // Stop as soon as an attempt yields something truthy
      while ($attempts_left > 0 && $page == false){
          $page = (SCRAPE_METHOD == "PEAR")
              ? scrape_page_pear($url, $method)
              : scrape_page_curl($url, $method);
          $attempts_left--;
      }

      return $page;
  }
  /**
   * Fetch a page with PEAR's HTTP_Request.
   *
   * The outcome of sendRequest() is not inspected; the response body is
   * returned as-is.
   *
   * @param string $url    URL to fetch.
   * @param string $method HTTP method passed to HTTP_Request.
   * @return mixed The value of getResponseBody().
   */
  function scrape_page_pear($url, $method = "GET"){
      $options = array("method" => $method);

      $request = new HTTP_Request($url, $options);
      $request->sendRequest();

      return $request->getResponseBody();
  }
  /**
   * Fetch a page with cURL.
   *
   * The transfer is returned as a string rather than printed, and redirects
   * are followed. Only $url is declared, so any further argument a caller
   * passes is ignored.
   *
   * @param string $url URL to fetch.
   * @return mixed The result of curl_exec().
   */
  function scrape_page_curl($url) {
      $handle = curl_init($url);

      curl_setopt_array($handle, array(
          CURLOPT_RETURNTRANSFER => TRUE,
          CURLOPT_FOLLOWLOCATION => TRUE,
      ));

      $response = curl_exec($handle);
      return $response;
  }
  /**
   * Render a list of applications through the "xml.tpl" Smarty template.
   *
   * The authority names are always assigned to the template; the
   * applications are only assigned when the list is not empty.
   *
   * @param array  $applications         Applications to output.
   * @param string $authority_name       Assigned to the template as "authority_name".
   * @param string $authority_short_name Assigned to the template as "authority_short_name".
   * @return void
   */
  function display_applications($applications, $authority_name, $authority_short_name){
      // Smarty setup
      $view = new Smarty();
      $view->force_compile = true;
      $view->compile_dir = SMARTY_COMPILE_DIRECTORY;
      $view->template_dir = "../templates";

      // Template variables
      $view->assign("authority_name", $authority_name);
      $view->assign("authority_short_name", $authority_short_name);

      $has_applications = sizeof($applications) > 0;
      if ($has_applications){
          $view->assign("applications", $applications);
      }

      $view->display("xml.tpl");
  }
  /**
   * Build a timestamp for midnight on the date given by the "day", "month"
   * and "year" GET parameters.
   *
   * Each parameter is required; a missing one is reported through
   * throw_error(). The missing-month message previously said "year".
   *
   * @return int|false The result of mktime(0, 0, 0, month, day, year).
   */
  function get_time_from_get(){
      if (!isset($_GET['day'])){
          throw_error("No day set in get string");
      }
      if (!isset($_GET['month'])){
          throw_error("No month set in get string");
      }
      if (!isset($_GET['year'])){
          throw_error("No year set in get string");
      }

      return mktime(0, 0, 0, $_GET['month'], $_GET['day'], $_GET['year']);
  }
  /**
   * Raise an Exception carrying the given message.
   *
   * @param string $message Text for the exception.
   * @return void This function always throws.
   */
  function throw_error($message){
      $error = new Exception($message);
      throw $error;
  }
  327. ?>