php snoopy 编码,基于Snoopy的PHP近似完美获取网站编码

最新推荐文章于 2021-03-16 15:34:32 发布

weixin_39755824

最新推荐文章于 2021-03-16 15:34:32 发布

阅读量137

点赞数

文章标签： php snoopy 编码

基于Snoopy的PHP近似完美获取网站编码

用于 PHP 爬虫：获取网页编码的准确率约为 99.9%，仍有少数网站的编码无法识别，欢迎高手帮忙完善。

代码来源：

站云网

www.siteyun.com

使用前需要先从网上下载 Snoopy.class.php，并在脚本中引入该文件。

调用方法：url=$url;

}

/**
 * Open the website: fetch $url with Snoopy once and convert the body to UTF-8.
 *
 * The Snoopy instance is cached on $this->request. Once it exists, later calls
 * do not touch the network and do not look at $url; they only re-report whether
 * the cached response status is 200.
 *
 * On a successful first fetch the response body ($this->request->results) is
 * lower-cased and, when the detected charset is not "utf-8", converted to UTF-8.
 *
 * @param string $url Address of the page to fetch.
 * @return bool True when the response status is 200, false otherwise.
 */
private function open($url)
{
    // Already fetched: report the cached outcome instead of fetching again.
    if($this->request!==null)
    {
        // Loose comparison kept from the original.
        // NOTE(review): Snoopy appears to store the status as a numeric string - confirm.
        return $this->request->status==200;
    }

    $this->request=new Snoopy();
    $this->request->fetch($url);

    if($this->request->status!=200)
    {
        return false;
    }

    // The whole body is lower-cased before charset detection and tag matching.
    $this->request->results=strtolower($this->request->results);

    $charset=$this->getCharset();

    // Only convert when a charset was actually detected; an empty or non-string
    // value would make mb_convert_encoding() fail on an invalid encoding name.
    if(is_string($charset) && $charset!=='' && $charset!="utf-8")
    {
        if($charset=="windows-1252")
        {
            // windows-1252 pages go through the class's own decoder.
            $this->request->results=$this->uni_decode($this->request->results);
        }
        else
        {
            $this->request->results=mb_convert_encoding($this->request->results,"UTF-8",$charset);
        }
    }

    return true;
}

/**
 * Get the website's title, keywords, description and IPv4 address.
 *
 * The page is fetched through open(); when that fails, the array is returned
 * with every field left as an empty string.
 *
 * Meta tags are first matched in `name="…" content="…"` attribute order. If that
 * pass finds neither "keywords" nor "description", the page is re-scanned in
 * `content="…" name="…"` order. When a meta name occurs more than once, the last
 * occurrence wins.
 *
 * @return array Associative array with the string keys 'title', 'keywords',
 *               'desc' and 'ip'; a field that could not be determined is ''.
 */
public function getWebinfo()
{
    $info=array(
        'title'=>'',
        'keywords'=>'',
        'desc'=>'',
        'ip'=>''
    );

    if(!$this->open($this->url))
    {
        return $info;
    }

    $html=$this->request->results;

    // <title>…</title>
    if(preg_match('/<title>([^>]*)<\/title>/si', $html, $titlematch) && isset($titlematch[1]))
    {
        $info['title']=strip_tags($titlematch[1]);
    }

    // First pass: <meta name="…" content="…">
    $names=array();
    $values=array();
    preg_match_all('/<[\s]*meta[\s]*name="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $match);

    $nameFirst=isset($match[1]) && is_array($match[1])
        && (in_array("keywords", $match[1], true) || in_array("description", $match[1], true));

    if($nameFirst)
    {
        $names=$match[1];
        $values=isset($match[2]) ? $match[2] : array();
    }
    else
    {
        // Second pass: <meta content="…" name="…"> (attributes in reverse order).
        preg_match_all('/<[\s]*meta[\s]*content="?' . '([^>"]*)"?[\s]*name="?' . '([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $match);

        if(isset($match) && is_array($match) && count($match)==3)
        {
            $names=$match[2];
            $values=$match[1];
        }
    }

    // Map meta name => content. Always initialised, so the lookups below are
    // safe even when nothing matched.
    $metaTags=array();
    if(count($names)==count($values))
    {
        foreach($names as $i=>$name)
        {
            $metaTags[$name]=$values[$i];
        }
    }

    if(isset($metaTags['keywords']))
    {
        $info['keywords']=$metaTags['keywords'];
    }

    if(isset($metaTags['description']))
    {
        $info['desc']=$metaTags['description'];
    }

    // Resolve the host part only, so https:// URLs and URLs carrying a path,
    // port or query string still resolve.
    $url=(string)$this->url;
    $host=parse_url($url, PHP_URL_HOST);
    if(!is_string($host) || $host==='')
    {
        // No scheme given (e.g. "example.com/page"): parse again with one added.
        $host=parse_url('http://'.$url, PHP_URL_HOST);
    }

    if(is_string($host) && $host!=='')
    {
        // gethostbyname() hands back its argument unchanged when the lookup
        // fails, so the result must be validated as a real IPv4 address.
        $ip=@gethostbyname($host);
        if(filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)!==false)
        {
            $info['ip']=$ip;
        }
    }

    return $info;
}

public function t($string,$o)

{

for($i=0;$iopen($this->url)){return false;exit;}

//首先从html获取编码

preg_match("/request->results,$temp) ? strtolower($temp[1]):"";

if($temp[1]!="")

{

if(in_array($temp[1], $this->charset_arr))

{

if($temp[1]=="gb2312")

{

$tmp_charset=$this->t($this->request->results,$temp[1]);

if($tmp_charset==$temp[1])

{

return $temp[1];

}

else

{

return $temp[1];

}

if(!empty($this->request->headers))

{

//从header中获取编码

$hstr=strtolower(implode("

weixin_39755824

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫