最近要用到采集程序,需要取出整个页面中的所有链接。网上找到的代码一遇到中文链接就取不出完整的网址,于是自己改了一个,在此留个档。
<?php
/**
 * Extract the distinct, URL-shaped link targets from the anchors of an HTML page.
 *
 * Every `<a ... href=...>` tag is inspected (the href may be double-quoted,
 * single-quoted or unquoted, and may appear anywhere among the attributes).
 * A href value is kept only when it looks like a URL or a relative file path:
 * an optional http/https/ftp/telnet/news scheme, then at least one "name.rest"
 * pair. As a consequence `mailto:` and `javascript:` targets, bare fragments
 * such as `#top`, and values without a dot are not returned.
 *
 * Matching is byte-wise (no `u` modifier) and bytes 0x80-0xFF are accepted
 * everywhere in the URL, so links containing raw non-ASCII text (for example
 * Chinese path segments in UTF-8 or GB2312) are returned in full instead of
 * being truncated. HTML entities inside the href are left untouched.
 *
 * @param string $string Raw HTML markup.
 * @return array Zero-indexed list of unique href strings in document order;
 *               an empty array when no usable link is found.
 */
function GetAllLink($string)
{
    $html = (string) $string;

    // Remove HTML comments first so commented-out anchors are not reported.
    // preg_replace() yields null on a PCRE failure; keep the markup in that case.
    $withoutComments = preg_replace('/<!--.*?-->/s', '', $html);
    if ($withoutComments !== null) {
        $html = $withoutComments;
    }

    // Capture the href value of each anchor. The branch-reset group (?|...)
    // puts the value in group 1 whichever quoting style was used. The
    // look-behind stops attributes such as data-href from being mistaken
    // for href.
    $anchorPattern = '/<a\s[^>]*?(?<![\w\-])href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';
    if (!preg_match_all($anchorPattern, $html, $matches)) {
        return array();
    }

    // Optional scheme, a leading segment ending in a dot, then the remainder
    // built from the RFC 3986 character set plus raw high bytes.
    $urlPattern = '/^(?:(?:https?|ftp|telnet|news):\/\/)?'
        . '[a-z0-9_\-\/.\x80-\xff]+\.'
        . '[a-z0-9\-._~:\/?#\[\]@!$&\'()*+,;=%\x80-\xff]+$/iD';

    $links = array();
    $seen = array();
    foreach ($matches[1] as $href) {
        // Line breaks and tabs inside an attribute value are not part of the URL.
        $href = trim(str_replace(array("\r", "\n", "\t"), '', $href));
        if ($href === '' || isset($seen[$href]) || !preg_match($urlPattern, $href)) {
            continue;
        }
        // Keyed lookup gives exact string de-duplication, avoiding the loose
        // numeric-string comparison that in_array() performs by default.
        $seen[$href] = true;
        $links[] = $href;
    }

    return $links;
}
?>