在采集程序或者蜘蛛程序中经常会遇到一类问题,就是将网页中相对路径形式的URL转换为绝对路径形式的URL。例如在 http://www.msphome.cn/blog/1/ 这个页面中,有一个URL链接为 ../index.php,那么我们要将它转换为 http://www.msphome.cn/blog/index.php。下面给出了解决这类问题的代码。该程序能够成功处理各种URL,将其变成绝对形式。
方法一:
/**
 * Resolve a (possibly relative) URL against a base URL and return an absolute URL.
 *
 * Resolution follows the reference-resolution rules of RFC 3986 section 5.2:
 *  - a source URL that already has a scheme is returned unchanged;
 *  - a protocol-relative URL ("//host/path") only inherits the base scheme;
 *  - an absolute path ("/x") replaces the base path;
 *  - a relative path is appended to the base path up to its last "/";
 *  - an empty path ("?q=1", "#frag", "") keeps the base path (and the base
 *    query when the source has none);
 *  - "." and ".." segments are removed, and ".." never climbs above the root.
 *
 * Empty segments (duplicate slashes) are collapsed and backslashes in paths
 * are treated as forward slashes, as the previous implementation did.
 *
 * @param string $srcurl  URL found in the page; may be absolute or relative.
 * @param string $baseurl Absolute URL of the page that contains $srcurl.
 * @return string The absolute URL, or $srcurl unchanged when either URL cannot
 *                be parsed or the base URL has no scheme/host to resolve against.
 */
function format_url($srcurl, $baseurl) {
    $srcinfo = parse_url($srcurl);
    if ($srcinfo === false || isset($srcinfo['scheme'])) {
        return $srcurl;
    }

    $baseinfo = parse_url($baseurl);
    if ($baseinfo === false || !isset($baseinfo['scheme'], $baseinfo['host'])) {
        // Nothing usable to resolve against; hand the input back untouched.
        return $srcurl;
    }

    // Protocol-relative reference: it names its own host, only the scheme is inherited.
    if (isset($srcinfo['host'])) {
        return $baseinfo['scheme'] . ':' . $srcurl;
    }

    // Rebuild the full authority so credentials and a non-default port are kept.
    $url = $baseinfo['scheme'] . '://';
    if (isset($baseinfo['user'])) {
        $url .= $baseinfo['user'];
        if (isset($baseinfo['pass'])) {
            $url .= ':' . $baseinfo['pass'];
        }
        $url .= '@';
    }
    $url .= $baseinfo['host'];
    if (isset($baseinfo['port'])) {
        $url .= ':' . $baseinfo['port'];
    }

    $basepath = (isset($baseinfo['path']) && $baseinfo['path'] !== '')
        ? str_replace('\\', '/', $baseinfo['path'])
        : '/';
    $srcpath = isset($srcinfo['path']) ? str_replace('\\', '/', $srcinfo['path']) : '';
    $query = isset($srcinfo['query']) ? $srcinfo['query'] : null;

    if ($srcpath === '') {
        // Query-only / fragment-only / empty reference: stay on the base document.
        $path = $basepath;
        if ($query === null && isset($baseinfo['query'])) {
            $query = $baseinfo['query'];
        }
    } elseif ($srcpath[0] === '/') {
        $path = $srcpath;
    } else {
        // Keep everything up to and including the last "/" of the base path.
        // dirname() is wrong here: it turns "/blog/1/" into "/blog".
        $cut = strrpos($basepath, '/');
        $path = ($cut === false ? '/' : substr($basepath, 0, $cut + 1)) . $srcpath;
    }

    // Remove "." and ".." segments; array_pop() on an empty list is a no-op,
    // which clamps ".." at the root.
    $parts = explode('/', $path);
    $segments = array();
    foreach ($parts as $part) {
        if ($part === '..') {
            array_pop($segments);
        } elseif ($part !== '' && $part !== '.') {
            // Strict comparison so that a segment named "0" is kept.
            $segments[] = $part;
        }
    }

    $normalized = '/' . implode('/', $segments);
    $last = end($parts);
    if (count($segments) > 0 && ($last === '' || $last === '.' || $last === '..')) {
        // The reference pointed at a directory; keep the trailing slash.
        $normalized .= '/';
    }

    $url .= $normalized;
    if ($query !== null) {
        $url .= '?' . $query;
    }
    if (isset($srcinfo['fragment'])) {
        $url .= '#' . $srcinfo['fragment'];
    }
    return $url;
}
// Usage example: resolve a root-relative link against the URL of the page it was found on.
$srcurl = '/guestbook.php';
$baseurl = 'http://www.msphome.cn/index.php/ddd.html';
echo format_url($srcurl, $baseurl); // prints http://www.msphome.cn/guestbook.php
?>
方法二:
/**
 * Rewrite root-relative href/src attributes in an HTML fragment to absolute URLs.
 *
 * The scheme and host are taken from $feed_url; every attribute value that
 * starts with a single "/" (e.g. href="/a.html") is prefixed with
 * "scheme://host". Protocol-relative values ("//cdn.example.com/x"),
 * document-relative values ("a.html") and already absolute values are left
 * untouched. Attribute names are matched case-insensitively and both quote
 * styles are accepted.
 *
 * @param string $content  HTML fragment whose links should be rewritten.
 * @param string $feed_url URL the content was fetched from; must start with
 *                         http://, https:// or ftp:// for any rewriting to happen.
 * @return string The rewritten content, or $content unchanged when no scheme
 *                and host can be extracted from $feed_url.
 */
function relative_to_absolute($content, $feed_url) {
    // Capture "scheme" and "host[:port]"; the host ends at the first "/", "?" or "#".
    if (!preg_match('~^(https?|ftp)://([^/?#]+)~i', $feed_url, $parts)) {
        return $content;
    }
    $origin = $parts[1] . '://' . $parts[2];

    // "\" and "$" are special inside a preg_replace() replacement string.
    $replacement = '$1=$2' . addcslashes($origin, '\\$') . '/';

    // (?!\/) keeps protocol-relative URLs such as src="//cdn.example.com/x.js" intact.
    $new_content = preg_replace('/\b(href|src)=(["\'])\/(?!\/)/i', $replacement, $content);

    // preg_replace() returns null on a PCRE error; fall back to the original content.
    return $new_content === null ? $content : $new_content;
}
?>
赞过:
赞 正在加载……
相关