<?php
/**
 * Small image-oriented web spider.
 *
 * For a given page it collects the page content, the <img> tags found on it
 * and some response metadata, and stores one row per image in the
 * `contentgather` and `suoyin` tables.
 *
 * Notes for maintainers:
 *  - Database access goes through the legacy ext/mysql functions (mysql_*),
 *    which were removed in PHP 7.0. They are kept because connect_to_db()
 *    returns a link that callers may pass to mysql_query(); migrating needs
 *    to be coordinated with those callers.
 *  - All pattern matching uses PCRE (preg_*) instead of the removed ereg_*
 *    family.
 *  - Methods prefixed with "_" are internal helpers.
 */
class Spider {
    var $mysql_host; // MySQL host; "localhost" is used when unset
    var $mysql_name; // MySQL user; "root" is used when unset
    var $mysql_pwd;  // MySQL password; "" is used when unset
    var $mysql_db;   // Database name; "SearchEngine" is used when unset
    var $parentUrl;  // URL the search starts from (not read by this class)
    var $searchNum;  // Number of crawl levels for mainSpider(); 1 when unset
    var $url;        // Not read by this class
    var $db;         // Not read by this class

    /**
     * Open a MySQL connection.
     *
     * @param string $mysql_host
     * @param string $mysql_name user name
     * @param string $mysql_pwd  password
     * @return resource|false link identifier as returned by mysql_connect()
     */
    function connect_to_db($mysql_host,$mysql_name,$mysql_pwd){
        return mysql_connect($mysql_host,$mysql_name,$mysql_pwd);
    }

    /**
     * Normalise a URL: make sure it carries an http(s) scheme and drop one
     * trailing slash.
     *
     * @param string $url
     * @return string
     */
    function dealUrl($url){
        $url = (string)$url;
        // Only a leading scheme counts; "http://" inside a query string must
        // not suppress the prefix, and https URLs must be left alone.
        if (!$this->_isHttpUrl($url)) {
            $url = "http://".$url;
        }
        if (substr($url, -1) === '/') {
            $url = substr($url, 0, -1);
        }
        return $url;
    }

    /**
     * Collect the links found on a page.
     *
     * @param string $url page to read (anything file() can open)
     * @return string comma separated list whose first element is the literal
     *                "succeed", followed by the unique crawlable links
     */
    function getUrl($url){
        $nextUrl = "succeed";
        $lines = @file($url);
        if (!is_array($lines)) {
            return $nextUrl;
        }
        $seen = array();
        foreach ($lines as $line) {
            if (!preg_match_all($this->_hrefPattern(), $line, $matches)) {
                continue;
            }
            foreach ($matches[2] as $target) {
                // The result is comma separated, so a raw comma inside a link
                // would split it in two on the caller's side.
                $link = str_replace(',', '%2C', $this->_resolveLink($url, $target));
                if (isset($seen[$link])) {
                    continue; // exact-match de-duplication
                }
                if (!$this->_isHttpUrl($link) || !$this->_looksCrawlable($link)) {
                    continue;
                }
                if (strpos($link, "_bak") !== false) {
                    continue;
                }
                $seen[$link] = true;
                $nextUrl .= ",".$link;
            }
        }
        return $nextUrl;
    }

    /**
     * Tell whether a URL still needs to be searched.
     *
     * @param string         $url
     * @param string         $contentDesc
     * @param resource|false $db link from connect_to_db()
     * @return bool false when a matching row exists in `visited`, true
     *              otherwise (including when the lookup itself fails)
     */
    function queryUrl($url,$contentDesc,$db){
        if (!$db) {
            return true;
        }
        mysql_select_db($this->_setting($this->mysql_db, "SearchEngine"), $db);
        $sql = "select * from visited where visitedUrl=".$this->_quote($url, $db)
             ." and contentDesc=".$this->_quote($contentDesc, $db);
        $rs = mysql_query($sql, $db);
        if ($rs && mysql_fetch_row($rs)) {
            return false;
        }
        return true;
    }

    /**
     * Size of the resource behind a URL.
     *
     * @param string $url
     * @return string|int "<bytes> 字节", or 0 when nothing could be read
     */
    function getImgLength($url){
        $data = @file_get_contents($url);
        if ($data === false || $data === '') {
            return 0;
        }
        return strlen($data)." 字节";
    }

    /**
     * Gather a page's content, images and response metadata and store one
     * row per image in `contentgather` and `suoyin`.
     *
     * Pages that cannot be fetched with a 200 status are skipped silently.
     *
     * @param string $url
     * @return void
     */
    function gatherInfo($url){
        $pageUrl = $this->dealUrl($url);
        // getUrlContent() already verifies the 200 status, so no second
        // status request is needed here.
        $content = $this->getUrlContent($pageUrl);
        if ($content === null || $content === "") {
            return;
        }

        // Resolve every image source first so the content can be rewritten
        // in a single pass.
        $images = array();
        $replacements = array();
        foreach ($this->tags($pageUrl, "img") as $tag) {
            $attrs = isset($tag['Attrs']) ? $tag['Attrs'] : array();
            $rawSrc = isset($attrs['SRC']) ? (string)$attrs['SRC'] : '';
            if ($rawSrc === '') {
                continue; // an <img> without a source has nothing to index
            }
            $height = isset($attrs['HEIGHT']) ? $attrs['HEIGHT'] : '';
            $width = isset($attrs['WIDTH']) ? $attrs['WIDTH'] : '';
            $absoluteSrc = $this->_resolveLink($pageUrl, $rawSrc);
            $replacements[$rawSrc] = $absoluteSrc;
            $images[] = array('src' => $absoluteSrc, 'size' => $height."*".$width);
        }
        if (count($replacements) > 0) {
            // strtr() never rescans text it has already replaced, so an image
            // that occurs twice is not prefixed twice.
            $content = strtr($content, $replacements);
        }

        if (count($images) > 0) {
            $db = $this->connect_to_db(
                $this->_setting($this->mysql_host, "localhost"),
                $this->_setting($this->mysql_name, "root"),
                $this->_setting($this->mysql_pwd, "")
            );
            if (!$db) {
                return; // nothing can be stored without a connection
            }
            mysql_select_db($this->_setting($this->mysql_db, "SearchEngine"), $db);

            // Page-level values are identical for every image: compute once.
            $ip = $this->getUrlIP($pageUrl);
            $date = $this->getUrlDate($pageUrl);
            $title = $this->getUrlTitle($pageUrl);
            $contentDesc = strip_tags($content);

            foreach ($images as $image) {
                $length = $this->getImgLength($image['src']);
                $sql = "insert into contentgather set url=".$this->_quote($pageUrl, $db)
                     .", date=".$this->_quote($date, $db)
                     .", ip=".$this->_quote($ip, $db)
                     .", length=".$this->_quote($length, $db)
                     .", imgurl=".$this->_quote($image['src'], $db)
                     .", size=".$this->_quote($image['size'], $db)
                     .", content=".$this->_quote($content, $db);
                if (!mysql_query($sql, $db)) {
                    continue; // no parent row, so no index row either
                }
                $contentGatherId = (int)mysql_insert_id($db);
                $sql = "insert into suoyin set url=".$this->_quote($pageUrl, $db)
                     .", imgName=".$this->_quote($image['src'], $db)
                     .", title=".$this->_quote($title, $db)
                     .", contentDesc=".$this->_quote($contentDesc, $db)
                     .", contentGather_Id=".$contentGatherId;
                mysql_query($sql, $db);
            }
        }
        echo "完成搜集:".$pageUrl."<br>";
    }

    /**
     * First response header line (the status line) of a URL.
     *
     * @param string $url
     * @return string e.g. "HTTP/1.1 200 OK"; "" when the request failed
     */
    function getUrlResponse($url){
        $headers = $this->_fetchHeaders($url);
        return isset($headers[0]) ? $headers[0] : '';
    }

    /**
     * The "Date" response header of a URL.
     *
     * @param string $url
     * @return string the complete header line including the "Date:" prefix,
     *                or "" when the server sent none
     */
    function getUrlDate($url){
        // Header order is not guaranteed, so look the header up by name.
        foreach ($this->_fetchHeaders($url) as $header) {
            if (is_string($header) && strncasecmp($header, 'Date:', 5) === 0) {
                return $header;
            }
        }
        return '';
    }

    /**
     * IP address of the host named in a URL.
     *
     * @param string $url
     * @return string result of gethostbyname(); "" when the URL has no host
     */
    function getUrlIP($url){
        $parts = @parse_url($url);
        if (!is_array($parts) || empty($parts['host'])) {
            return '';
        }
        return gethostbyname($parts['host']);
    }

    /**
     * Extract every occurrence of a tag from a document.
     *
     * Only opening tags followed by whitespace (i.e. carrying attributes)
     * are recognised.
     *
     * @param string $filename file or URL to read
     * @param string $tag      tag name, matched case-insensitively
     * @return array list of arrays with the keys
     *               'tagName' (upper-cased tag),
     *               'Text'    (optional: markup up to the last closing tag
     *                          that precedes the next opening tag),
     *               'Attrs'   (optional: upper-cased name => value; value is
     *                          false for attributes written without "=").
     *               Empty array when nothing matches or nothing can be read.
     */
    function tags($filename,$tag) {
        $found = array();
        $buffer = @file_get_contents($filename);
        if ($buffer === false || $buffer === '') {
            return $found;
        }
        $quotedTag = preg_quote($tag, '~');
        // Cut the document in front of every opening tag so each chunk holds
        // exactly one candidate.
        $chunks = preg_split('~(?=<'.$quotedTag.'\s)~i', $buffer, -1, PREG_SPLIT_NO_EMPTY);
        if (!is_array($chunks)) {
            return $found;
        }
        foreach ($chunks as $chunk) {
            if (!preg_match('~^<'.$quotedTag.'\s([^>]*)>?~i', $chunk, $open)) {
                continue;
            }
            // A fresh entry per tag: nothing may leak in from the previous one.
            $entry = array('tagName' => strtoupper($tag));

            $rest = (string)substr($chunk, strlen($open[0]));
            if ($rest !== '') {
                $closeAt = strripos($rest, '</'.$tag);
                if ($closeAt !== false && $closeAt > 0) {
                    $entry['Text'] = substr($rest, 0, $closeAt);
                }
            }

            // name, optionally followed by = and a quoted or bare value
            $attrPattern = '~([^\s=/"\'>]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s"\'>]*))?~';
            if (preg_match_all($attrPattern, $open[1], $attrs, PREG_SET_ORDER)) {
                foreach ($attrs as $attr) {
                    $value = false;
                    if (isset($attr[2])) {
                        $value = $attr[2];
                        if ($value !== '' && ($value[0] === '"' || $value[0] === "'")) {
                            // Cast: substr() returns false for '""' before PHP 8.
                            $value = (string)substr($value, 1, -1);
                        }
                    }
                    $entry['Attrs'][strtoupper($attr[1])] = $value;
                }
            }
            $found[] = $entry;
        }
        return $found;
    }

    /**
     * Fetch a page and prepare its markup for storage.
     *
     * CRLF sequences are removed, relative href targets are made absolute
     * and single quotes are turned into double quotes (a legacy
     * normalisation kept for consumers of the stored content).
     *
     * @param string $url
     * @return string|null the prepared markup; null when the status is not
     *                     200 or the body is empty
     */
    function getUrlContent($url){
        if (!$this->_isOkStatus($this->getUrlResponse($url))) {
            return null;
        }
        $buffer = @file_get_contents($url);
        if ($buffer === false || $buffer === '') {
            return null;
        }
        $buffer = str_replace("\r\n", "", $buffer);
        $buffer = $this->_absolutizeHrefs($buffer, $url);
        $buffer = str_replace("'", "\"", $buffer);
        return $buffer;
    }

    /**
     * Pick the page to continue crawling from.
     *
     * @param string $url
     * @return string|null the first crawlable entry at index 2 or later of
     *                     getUrl()'s list; null when there is none
     */
    function nextUrl($url){
        $links = explode(",", $this->getUrl($url));
        $total = count($links);
        // Index 0 is the "succeed" marker and index 1 the first link.
        for ($i = 2; $i < $total; $i++) {
            if ($this->_looksCrawlable($links[$i])) {
                return $links[$i];
            }
        }
        return null;
    }

    /**
     * Crawl starting at $url: gather the page itself and every page it links
     * to, repeated for $this->searchNum levels (1 when unset).
     *
     * @param string $url
     * @return void
     */
    function mainSpider($url){
        $levels = ((int)$this->searchNum > 0) ? (int)$this->searchNum : 1;
        for ($level = 0; $level < $levels; $level++) {
            $this->gatherInfo($url);
            // Use the normalised URL: getUrl() needs a scheme to fetch it.
            $pageUrl = $this->dealUrl($url);
            $links = explode(",", $this->getUrl($pageUrl));
            $total = count($links);
            for ($j = 1; $j < $total; $j++) { // index 0 is the "succeed" marker
                echo $links[$j]."<br>";
                $this->gatherInfo($links[$j]);
            }
            if ($level + 1 < $levels) {
                $url = $this->nextUrl($pageUrl);
                if ($url === null) {
                    break; // nowhere left to go
                }
            }
        }
    }

    /**
     * Extract the contents of a page's <title> element.
     *
     * @param string $url
     * @return string the title as found in the markup (CRLF removed); ""
     *                when the page cannot be read or has no title
     */
    function getUrlTitle($url){
        $buffer = @file_get_contents($url);
        if ($buffer === false) {
            return '';
        }
        $buffer = str_replace("\r\n", "", $buffer);
        if (preg_match('~<title\b[^>]*>(.*?)</title>~is', $buffer, $match)) {
            return $match[1];
        }
        return '';
    }

    /**
     * substr() for GB2312 text that never cuts a double-byte character.
     *
     * Bytes above 160 are treated as halves of a double-byte character.
     * When $start falls inside a character that character is included;
     * when the end falls inside a character that character is left out.
     *
     * @param string   $str
     * @param int      $start byte offset; negative counts from the end
     * @param int|null $len   byte length; null means "to the end",
     *                        negative means "stop that many bytes early"
     * @return string
     */
    function substr_for_gb2312($str,$start,$len=null)
    {
        $totlelength = strlen($str);
        // Special cases. The null test must be strict: an explicit length of
        // 0 has to yield "", not the whole string.
        if ($len === null) $len = $totlelength;
        if ($len == 0) return "";
        if ($len >= $totlelength && $start == 0) return $str;
        if ($start > $totlelength) return "";
        // Translate a negative start into the equivalent positive offset.
        if ($start < 0)
        {
            if (abs($start) >= $totlelength) $start = 0;
            else $start = $totlelength - abs($start);
        }
        // Start position: count the run of high bytes right before $start.
        // An odd run means $start sits in the middle of a character.
        if ($start > 0)
        {
            $i = $start - 1;
            $flag = -1;
            while ($i >= 0)
            {
                if (ord(substr($str, $i, 1)) > 160)
                {
                    $flag = -1 * $flag;
                }
                else break;
                $i--;
            }
            if ($flag == 1)
            {
                $start = $start - 1;
                $len++; // keep the end position where it was
            }
        }
        $str = substr($str, $start); // drop everything before $start
        $totlelength = strlen($str);
        // End position: same parity test on the run of high bytes that ends
        // at the last requested byte.
        if ($len < 0) $len = $totlelength - abs($len);
        if ($len <= 0) return "";
        $i = min($len, $totlelength);
        $i--;
        $flag = -1;
        while ($i >= 0)
        {
            if (ord(substr($str, $i, 1)) > 160)
            {
                $flag = -1 * $flag;
            }
            else break;
            $i--;
        }
        if ($flag == 1)
            $len = $len - 1;
        return substr($str, 0, $len);
    }

    /** Return $value, or $default when $value is null or "". */
    function _setting($value, $default){
        return ($value === null || $value === '') ? $default : $value;
    }

    /** Escape $value for the given link and wrap it in single quotes. */
    function _quote($value, $db){
        return "'".mysql_real_escape_string((string)$value, $db)."'";
    }

    /** True when $url starts with http:// or https:// (any letter case). */
    function _isHttpUrl($url){
        return preg_match('~^https?://~i', (string)$url) === 1;
    }

    /** True when $statusLine is an HTTP status line with code 200. */
    function _isOkStatus($statusLine){
        return preg_match('~^HTTP/\S+\s+200\b~i', (string)$statusLine) === 1;
    }

    /** True when $url contains one of the page or domain suffixes worth crawling. */
    function _looksCrawlable($url){
        $markers = array(".php", ".asp", ".jsp", ".htm", ".com", ".cn", ".net", ".org");
        foreach ($markers as $marker) {
            if (strpos($url, $marker) !== false) {
                return true;
            }
        }
        return false;
    }

    /** Response header lines of $url, or an empty array when the request fails. */
    function _fetchHeaders($url){
        // Best effort: an unreachable page is simply treated as "no headers".
        $headers = @get_headers($url);
        return is_array($headers) ? $headers : array();
    }

    /** PCRE for an href attribute: group 1 is the prefix up to the value, group 2 the target. */
    function _hrefPattern(){
        return '~(href[[:space:]]*=[[:space:]]*"?)([[:alnum:]:@/._-]+[^([:space:]|/>")]*)~i';
    }

    /**
     * Split a page URL into the pieces needed to resolve links against it:
     * 'scheme', 'root' (scheme://host[:port]) and 'dir' (root plus the
     * directory part of the path). A last path segment containing a dot is
     * taken to be a file name and dropped.
     */
    function _baseParts($pageUrl){
        $pageUrl = (string)$pageUrl;
        $parts = @parse_url($pageUrl);
        if (!is_array($parts) || empty($parts['host'])) {
            $fallback = rtrim($pageUrl, '/');
            return array('scheme' => 'http', 'root' => $fallback, 'dir' => $fallback);
        }
        $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
        $root = $scheme.'://'.$parts['host'];
        if (isset($parts['port'])) {
            $root .= ':'.$parts['port'];
        }
        $path = isset($parts['path']) ? $parts['path'] : '';
        $lastSlash = strrpos($path, '/');
        if ($lastSlash !== false && strpos($path, '.', $lastSlash) !== false) {
            $path = substr($path, 0, $lastSlash);
        }
        return array('scheme' => $scheme, 'root' => $root, 'dir' => $root.rtrim($path, '/'));
    }

    /** Turn a link found on $pageUrl into an absolute URL. */
    function _resolveLink($pageUrl, $link){
        $link = (string)$link;
        if (preg_match('~^[a-z][a-z0-9+.-]*:~i', $link)) {
            return $link; // already carries a scheme (http:, mailto:, ...)
        }
        $base = $this->_baseParts($pageUrl);
        if (substr($link, 0, 2) === '//') {
            return $base['scheme'].':'.$link; // scheme-relative
        }
        if (substr($link, 0, 1) === '/') {
            return $base['root'].$link; // relative to the site root
        }
        return $base['dir'].'/'.$link; // relative to the page's directory
    }

    /** Rewrite every relative href target in $html into an absolute URL. */
    function _absolutizeHrefs($html, $pageUrl){
        $flags = PREG_SET_ORDER | PREG_OFFSET_CAPTURE;
        if (!preg_match_all($this->_hrefPattern(), $html, $matches, $flags)) {
            return $html;
        }
        $result = '';
        $cursor = 0;
        foreach ($matches as $match) {
            $target = $match[2][0];
            $targetAt = $match[2][1];
            // Copy everything up to the target untouched, including href=".
            $result .= substr($html, $cursor, $targetAt - $cursor);
            $result .= $this->_resolveLink($pageUrl, $target);
            $cursor = $targetAt + strlen($target);
        }
        return $result.substr($html, $cursor);
    }
}
?>