php snoopy 编码,基于Snoopy的PHP近似完美获取网站编码

基于Snoopy的PHP近似完美获取网站编码

用于 PHP 爬虫。获取网页编码的准确率约为 99.9%,仍有少数网站的编码无法正确识别,欢迎高手帮忙完善。

代码来源:

站云网

www.siteyun.com

使用前需要先从网上下载 Snoopy.class.php,并在代码中引入(require)该文件。

调用方法:url=$url;

}

//打开网站

/**
 * Fetch $url through Snoopy and report whether the response status is 200.
 *
 * If a request object already exists, nothing is fetched again; the status
 * of the existing request decides the result. On a fresh, successful fetch
 * the body is lower-cased and, unless getCharset() reports "utf-8",
 * converted to UTF-8 before being stored back on the request object.
 *
 * @param string $url Address to fetch.
 * @return bool True when the (existing or new) response status is 200.
 */
private function open($url)
{
    // A request has already been made: only report its outcome.
    if ($this->request !== null) {
        return $this->request->status == 200;
    }

    $this->request = new Snoopy();
    $this->request->fetch($url);

    if ($this->request->status != 200) {
        return false;
    }

    // The body is lower-cased before charset detection runs on it.
    $this->request->results = strtolower($this->request->results);

    $encoding = $this->getCharset();
    if ($encoding == "utf-8") {
        return true;
    }

    // windows-1252 goes through the class's own decoder; every other
    // charset is handed to mbstring.
    $body = $this->request->results;
    $this->request->results = ($encoding == "windows-1252")
        ? $this->uni_decode($body)
        : mb_convert_encoding($body, "UTF-8", $encoding);

    return true;
}

//获取网站title,keywords,description

/**
 * Fetch the page at $this->url and extract its title, meta keywords,
 * meta description and the IPv4 address of its host.
 *
 * @return array Map with the keys 'title', 'keywords', 'desc' and 'ip'.
 *               Every value is a string and stays '' when it could not
 *               be determined (including when the page cannot be opened).
 */
public function getWebinfo()
{
    $info = array(
        'title'    => '',
        'keywords' => '',
        'desc'     => '',
        'ip'       => '',
    );

    if (!$this->open($this->url)) {
        return $info;
    }

    $html = $this->request->results;

    // NOTE(review): the three patterns below are reconstructed. In the
    // published listing everything between '<' and '>' had been stripped,
    // which left the meta patterns with an unmatched ')' (invalid regex)
    // and the title pattern without its <title> delimiters. Confirm against
    // the original source if it is available.
    if (preg_match('/<title[^>]*>(.*?)<\/title>/si', $html, $titleMatch) === 1) {
        $info['title'] = strip_tags($titleMatch[1]);
    }

    // Each entry: pattern, capture group holding the name, capture group
    // holding the content. The content-first form is processed first so
    // that the name-first form takes precedence when both are present.
    $metaPatterns = array(
        array(
            '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            2,
            1,
        ),
        array(
            '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            1,
            2,
        ),
    );

    // Always defined, so the lookups below never touch an unset variable.
    $metaTags = array();
    foreach ($metaPatterns as $spec) {
        list($pattern, $nameGroup, $valueGroup) = $spec;
        if (!preg_match_all($pattern, $html, $matches, PREG_SET_ORDER)) {
            continue;
        }
        foreach ($matches as $set) {
            $metaTags[strtolower(trim($set[$nameGroup]))] = $set[$valueGroup];
        }
    }

    if (isset($metaTags['keywords'])) {
        $info['keywords'] = $metaTags['keywords'];
    }
    if (isset($metaTags['description'])) {
        $info['desc'] = $metaTags['description'];
    }

    // Resolve the host only, not the whole URL: gethostbyname() cannot
    // handle a scheme, port, path or query string.
    $host = parse_url($this->url, PHP_URL_HOST);
    if (!is_string($host) || $host === '') {
        // URL given without a scheme (e.g. "www.example.com/page").
        $stripped = preg_replace('/^[a-z][a-z0-9+.\-]*:\/\//i', '', (string) $this->url);
        $parts = preg_split('/[\/?#:]/', $stripped, 2);
        $host = $parts[0];
    }

    if ($host !== '') {
        // gethostbyname() returns its argument unchanged on failure, so the
        // result must be validated as an address; counting dots would accept
        // an unresolved four-label hostname.
        $ip = gethostbyname($host);
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false) {
            $info['ip'] = $ip;
        }
    }

    return $info;
}

public function t($string,$o)

{

for($i=0;$iopen($this->url)){return false;exit;}

//首先从html获取编码

preg_match("/request->results,$temp) ? strtolower($temp[1]):"";

if($temp[1]!="")

{

if(in_array($temp[1], $this->charset_arr))

{

if($temp[1]=="gb2312")

{

$tmp_charset=$this->t($this->request->results,$temp[1]);

if($tmp_charset==$temp[1])

{

return $temp[1];

}

}

else

{

return $temp[1];

}

}

}

if(!empty($this->request->headers))

{

//从header中获取编码

$hstr=strtolower(implode("

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值