<?php
/**************
从网页或网站中抓取RSS地址的类
用法:
$RSS = new GetRssUrl;
配置抓取超时时间:
$RSS->timelimit = 60; //一分钟
抓取RSS:
$Link = $RSS->Get("blog.zoneker.com");
**************/
/**
 * Discovers RSS feed URLs for a web page or site.
 *
 * Usage:
 *   $RSS = new GetRssUrl;
 *   $RSS->timelimit = 60;                  // script time limit in seconds
 *   $links = $RSS->Get("blog.zoneker.com"); // array of feed URLs, or false
 *
 * The page is downloaded; if it is itself an RSS document its own URL is
 * returned. Otherwise every href on the page is collected, filtered by a
 * loose "looks like a feed" heuristic, and each surviving candidate is
 * fetched and kept only if its first bytes contain an "<rss" tag.
 */
class GetRssUrl
{
    /** @var string URL being inspected (normalised by ProcessUrl()). */
    public $url;
    /** @var string Scheme + host of $url without a trailing slash (property name kept as-is for compatibility). */
    public $domin;
    /** @var string|false Body of the fetched page, or false when the fetch failed. */
    public $file;
    /** @var array Unique href values found in $file. */
    public $link = array();
    /** @var array Links pre-selected as feed candidates. */
    public $rsslink = array();
    /** @var array Candidate URLs after resolution and de-duplication. */
    public $rssurl = array();
    /** @var array Candidates whose content was confirmed to contain "<rss". */
    public $validrssurl = array();
    /** @var int|null Value handed to set_time_limit(); null/0 means no limit. */
    public $timelimit;

    /**
     * Applies the configured script time limit.
     */
    public function Init()
    {
        // Cast so an unset limit is passed as 0 (no limit) instead of null.
        set_time_limit((int) $this->timelimit);
    }

    /**
     * Normalises $this->url and derives $this->domin from it.
     *
     * A missing scheme defaults to "http://"; an existing http/https scheme
     * is lower-cased. $this->domin becomes "scheme://host" (empty string
     * when no host can be extracted).
     */
    public function ProcessUrl()
    {
        $url = trim((string) $this->url);

        if (preg_match('~^(https?)://~i', $url, $m)) {
            // Only the scheme is lower-cased; the rest of the URL is kept verbatim.
            $url = strtolower($m[1]) . substr($url, strlen($m[1]));
        } else {
            $url = 'http://' . $url;
        }
        $this->url = $url;

        // Everything up to the first "/", "?" or "#" after the scheme is the host part.
        $this->domin = preg_match('~^https?://[^/?#]+~', $url, $m) ? $m[0] : '';
    }

    /**
     * Downloads $this->url into $this->file (false when the download fails).
     */
    public function GetHtmlFile()
    {
        $this->file = file_get_contents($this->url);
    }

    /**
     * Checks whether the fetched document is itself an RSS feed.
     *
     * @return array|int $this->rsslink (with $this->url appended) when the
     *                   document contains "<rss", otherwise 0.
     */
    public function RssOrHtml()
    {
        if (preg_match('/<rss/i', (string) $this->file)) {
            $this->rsslink[] = $this->url;
            return $this->rsslink;
        }

        return 0;
    }

    /**
     * Collects the unique, non-empty href values of $this->file into $this->link.
     */
    public function GetValidLink()
    {
        $found = array();
        // Accepts quoted and unquoted attribute values; the value ends at
        // whitespace, a quote or ">".
        if (preg_match_all('/href\s*=\s*["\']?([^\s"\'>]+)/i', (string) $this->file, $m)) {
            $found = $m[1];
        }

        // array_unique() keeps the original keys, so re-index to a plain list.
        $this->link = array_values(array_unique($found));
    }

    /**
     * Filters $this->link down to confirmed feed URLs.
     *
     * Fills $this->rsslink (keyword pre-selection), $this->rssurl (resolved,
     * lower-cased, de-duplicated candidates) and $this->validrssurl
     * (candidates whose content contains "<rss").
     */
    public function PregMatchUrl()
    {
        $links = is_array($this->link) ? $this->link : array();

        // Prefer links that mention a feed-related keyword.
        foreach ($links as $link) {
            if (preg_match('/feed|rss|xml/i', $link)) {
                $this->rsslink[] = $link;
            }
        }
        // No keyword hits: fall back to examining every link.
        if (empty($this->rsslink)) {
            $this->rsslink = $links;
        }

        // Loose shape heuristic: URL characters followed either by 3-5 letters
        // drawn from "xmlrssfeed" or by a dot and 3-4 letters drawn from "phpaspx".
        $chars = '[a-z0-9_:.\/~@#$%^&*()?+=!-]';
        $shape = '/' . $chars . '+\.*[xmlrssfeed]{3,5}|' . $chars . '+\.+[phpaspx]{3,4}/i';

        foreach ($this->rsslink as $candidate) {
            if (!preg_match($shape, $candidate)) {
                continue;
            }
            $resolved = $this->ResolveLink($candidate);
            if ($resolved === null) {
                continue;
            }
            // NOTE(review): lower-casing the whole URL is inherited behaviour;
            // it can break feeds whose path is case-sensitive - confirm
            // whether callers rely on it before changing.
            $this->rssurl[] = strtolower($resolved);
        }
        $this->rssurl = array_values(array_unique($this->rssurl));

        foreach ($this->rssurl as $feedUrl) {
            if ($this->LooksLikeRss($feedUrl)) {
                $this->validrssurl[] = $feedUrl;
            }
        }
    }

    /**
     * Runs the whole discovery pipeline for one URL.
     *
     * @param string $url Page or site address, with or without a scheme.
     * @return array|false List of feed URLs, or false when none was found
     *                     or the page could not be fetched.
     */
    public function Get($url)
    {
        // Start from a clean slate so repeated calls do not mix results.
        $this->url = $url;
        $this->link = array();
        $this->rsslink = array();
        $this->rssurl = array();
        $this->validrssurl = array();

        $this->Init();
        $this->ProcessUrl();
        $this->GetHtmlFile();
        if ($this->file === false) {
            return false;
        }

        $rss = $this->RssOrHtml();
        if ($rss) {
            return $rss;
        }

        $this->GetValidLink();
        $this->PregMatchUrl();

        return empty($this->validrssurl) ? false : $this->validrssurl;
    }

    /**
     * Turns an href into an absolute http(s) URL based on $this->domin.
     *
     * Relative links are resolved against the site root, not the page path.
     *
     * @param string $link Raw href value.
     * @return string|null Absolute URL, or null when the link uses another
     *                     scheme (mailto:, javascript:, file:, ...) or cannot
     *                     be resolved to an http(s) address.
     */
    private function ResolveLink($link)
    {
        if (strncmp($link, '//', 2) === 0) {
            // Protocol-relative link: reuse the scheme of the inspected page.
            $scheme = (strncmp((string) $this->domin, 'https://', 8) === 0) ? 'https:' : 'http:';
            $link = $scheme . $link;
        } elseif (!preg_match('~^[a-z][a-z0-9+.-]*:~i', $link)) {
            // No scheme at all: treat as a path on the inspected host.
            $separator = (strncmp($link, '/', 1) === 0) ? '' : '/';
            $link = $this->domin . $separator . $link;
        }

        // Only http(s) is ever fetched, so page-supplied links cannot reach
        // local files or other stream wrappers.
        return preg_match('~^https?://~i', $link) ? $link : null;
    }

    /**
     * Fetches the first bytes of a candidate URL and looks for an "<rss" tag.
     *
     * @param string $url Absolute http(s) URL.
     * @return bool True when the beginning of the response contains "<rss".
     */
    private function LooksLikeRss($url)
    {
        // Probing is best-effort: dead candidates are expected, so the
        // warning from a failed open is deliberately silenced.
        $fp = @fopen($url, 'r');
        if ($fp === false) {
            return false;
        }

        $head = fread($fp, 2048);
        fclose($fp);

        return $head !== false && preg_match('/<rss/i', $head) === 1;
    }
}
?>