snoopy.class.php 模拟登陆,基于Snoopy的PHP近似完美获取网站编码的代码

基于 Snoopy 的 PHP 代码,可近似完美地获取网站编码,适用于 PHP 爬虫。获取编码的准确率约为 99.9%,仍有少数网站无法识别,欢迎高手完善。

先要到网上下载Snoopy.class.php

调用方法:

<?php

require 'lib/Snoopy.class.php';

require 'lib/WebCrawl.class.php';//包含下面代码

$go=new WebCrawl('http://www.baidu.com');

echo $go->getCharset();

?>

<?php

/**
 * Fetches a web page through Snoopy and reports its character set and basic
 * metadata (title, keywords, description, IP address).
 *
 * The Snoopy class must already be loaded by the caller. The page is fetched
 * at most once per instance; the body is converted to UTF-8 and lower-cased
 * before it is searched for the title and meta tags.
 */
class WebCrawl
{
    /** URL of the page to fetch, as passed to the constructor. */
    private $url;

    /** Snoopy instance holding the fetched response; null until the first fetch. */
    private $request;

    /**
     * Charset detected on the raw (unconverted) response: a lower-case name,
     * false when detection failed, or null when nothing has been fetched yet.
     */
    private $charset;

    /**
     * Charset names accepted from a <meta ... charset=...> declaration.
     *
     * Order and duplicates are kept as published so numeric indexes stay stable.
     * NOTE(review): 'ibm01040'..'ibm01049' look like typos of the IBM01140..IBM01149
     * code page names, and 'ibm01047' / 'iso-2022-jp' are listed more than once - confirm.
     */
    public $charset_arr = array(
        'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936',
        'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737',
        'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858',
        'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865',
        'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis',
        'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01047', 'ibm01040', 'ibm01041',
        'ibm01042', 'ibm01043', 'ibm01044', 'ibm01045', 'ibm01046', 'ibm01047',
        'ibm01048', 'ibm01049', 'utf-16', 'unicodefffe',
        'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254',
        'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258',
        'johab', 'macintosh',
        'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic',
        'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp',
        'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce',
        'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian',
        'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005',
        'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian',
        'us-ascii', 'x-cp20261', 'x-cp20269',
        'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285',
        'ibm290', 'ibm420', 'ibm423', 'ibm424',
        'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r',
        'ibm871', 'ibm880', 'ibm905', 'ibm00924',
        'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u',
        'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5',
        'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9',
        'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i',
        'iso-2022-jp', 'csiso2022jp', 'iso-2022-jp', 'iso-2022-kr', 'x-cp50227',
        'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030',
        'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as',
        'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa',
        'utf-7', 'utf-32', 'utf-32be'
    );

    /**
     * @param string $url Address of the page to inspect. Nothing is fetched
     *                    until getCharset() or getWebinfo() is called.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Fetch the page once and normalise its body.
     *
     * The charset is detected on the raw bytes first, the body is then
     * converted to UTF-8 and only afterwards lower-cased. Lower-casing before
     * conversion would alter trail bytes of GBK/Big5 characters that happen to
     * fall in the A-Z range.
     *
     * @param string $url Address to fetch (only used on the first call).
     * @return bool True when the response status is 200.
     */
    private function open($url)
    {
        if ($this->request !== null) {
            // Already fetched: just report whether that fetch succeeded.
            return $this->request->status == 200;
        }

        $this->request = new Snoopy();
        $this->request->fetch($url);
        if ($this->request->status != 200) {
            return false;
        }

        $raw = is_string($this->request->results) ? $this->request->results : '';
        $this->charset = $this->detectCharset($raw, $this->request->headers);
        $this->request->results = strtolower($this->toUtf8($raw, $this->charset));

        return true;
    }

    /**
     * Convert $text to UTF-8, leaving it untouched when the charset is
     * unknown, already UTF-8, or not supported by mbstring.
     *
     * @param string       $text    Raw response body.
     * @param string|false $charset Lower-case charset name, or false.
     * @return string
     */
    private function toUtf8($text, $charset)
    {
        if (!is_string($charset) || $charset === '' || $charset === 'utf-8') {
            return $text;
        }

        try {
            // PHP 8 throws ValueError for an unknown encoding; older versions return false.
            $converted = mb_convert_encoding($text, 'UTF-8', $charset);
        } catch (ValueError $e) {
            $converted = false;
        }
        if (!is_string($converted)) {
            return $text;
        }

        if ($charset === 'windows-1252') {
            // Decode 5-digit numeric character references found on such pages.
            $converted = $this->uni_decode($converted);
        }

        return $converted;
    }

    /**
     * Collect the site's title, keywords, description and IP address.
     *
     * @return array Keys 'title', 'keywords', 'desc', 'ip'; every value is an
     *               empty string when it could not be determined.
     */
    public function getWebinfo()
    {
        $info = array(
            'title'    => '',
            'keywords' => '',
            'desc'     => '',
            'ip'       => ''
        );

        if (!$this->open($this->url)) {
            return $info;
        }

        $html = $this->request->results;

        if (preg_match('/<title(?:\s[^>]*)?>([^>]*)<\/title>/si', $html, $titleMatch)) {
            $info['title'] = strip_tags($titleMatch[1]);
        }

        // First try <meta name="..." content="...">.
        $meta = $this->collectMetaTags(
            $html,
            '/<[\s]*meta[\s]*name="?([^>"]*)"?[\s]*content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
            1,
            2
        );
        if (!isset($meta['keywords']) && !isset($meta['description'])) {
            // Fall back to the reversed attribute order: <meta content="..." name="...">.
            $meta = $this->collectMetaTags(
                $html,
                '/<[\s]*meta[\s]*content="?([^>"]*)"?[\s]*name="?([^>"]*)"?[\s]*[\/]?[\s]*>/si',
                2,
                1
            );
        }

        if (isset($meta['keywords'])) {
            $info['keywords'] = $meta['keywords'];
        }
        if (isset($meta['description'])) {
            $info['desc'] = $meta['description'];
        }

        $info['ip'] = $this->resolveIp($this->url);

        return $info;
    }

    /**
     * Build a name => content map from every meta tag matched by $pattern.
     * When a name occurs more than once the last occurrence wins.
     *
     * @param string $html       Page body to search.
     * @param string $pattern    PCRE pattern with one name group and one content group.
     * @param int    $nameGroup  Index of the capture group holding the meta name.
     * @param int    $valueGroup Index of the capture group holding the content.
     * @return array
     */
    private function collectMetaTags($html, $pattern, $nameGroup, $valueGroup)
    {
        $tags = array();
        if (preg_match_all($pattern, $html, $found, PREG_SET_ORDER)) {
            foreach ($found as $hit) {
                $tags[strtolower(trim($hit[$nameGroup]))] = $hit[$valueGroup];
            }
        }

        return $tags;
    }

    /**
     * Resolve the host part of $url to an IPv4 address.
     *
     * @param string $url Full URL, or a bare host with an optional path.
     * @return string Dotted IPv4 address, or '' when the host cannot be resolved.
     */
    private function resolveIp($url)
    {
        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host) || $host === '') {
            // No scheme given ("www.example.com/path"): add one so the host is recognised.
            $host = parse_url('http://' . $url, PHP_URL_HOST);
        }
        if (!is_string($host) || $host === '' || strlen($host) > 255) {
            return '';
        }

        // gethostbyname() hands back the unmodified host name on failure, so the
        // result has to be validated as an address rather than counted by dots.
        $ip = gethostbyname($host);

        return filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) !== false ? $ip : '';
    }

    /**
     * Guess from the first recognisable multi-byte sequence whether $string
     * is UTF-8 or GB2312.
     *
     * A lead byte >= 0xE0 followed by two continuation bytes (10xxxxxx) is
     * reported as UTF-8; a lead byte >= 0xC0 followed by any high byte is
     * reported as GB2312. Bytes that satisfy neither test are skipped.
     *
     * @param string $string Raw bytes to inspect.
     * @param mixed  $o      Unused; kept for backward compatibility.
     * @return string 'utf-8', 'gb2312', or '' when no such sequence is found.
     */
    public function t($string, $o)
    {
        $encoding = '';
        $length = strlen($string);

        for ($i = 0; $i < $length; $i++) {
            $byte = ord($string[$i]);
            if ($byte < 128) {
                continue;
            }

            if (($byte & 224) == 224
                && $i + 2 < $length
                && (ord($string[$i + 1]) & 192) == 128
                && (ord($string[$i + 2]) & 192) == 128
            ) {
                $encoding = 'utf-8';
                break;
            }

            if (($byte & 192) == 192
                && $i + 1 < $length
                && (ord($string[$i + 1]) & 128) == 128
            ) {
                $encoding = 'gb2312';
                break;
            }
        }

        return $encoding;
    }

    /**
     * Replace 5-digit decimal character references (&#NNNNN;) with the
     * characters they denote; all other text is left untouched.
     *
     * @param string $str  UTF-8 text possibly containing such references.
     * @param string $code Target encoding of the result; 'utf-8' by default.
     * @return string Decoded text; if conversion to $code fails, the UTF-8
     *                text is returned instead.
     */
    public function uni_decode($str, $code = 'utf-8')
    {
        $decoded = preg_replace_callback(
            '/&#(\d{5});/',
            function ($dec) {
                return html_entity_decode($dec[0], ENT_QUOTES, 'UTF-8');
            },
            $str
        );
        if (is_string($decoded)) {
            $str = $decoded;
        }

        if ($code != 'utf-8') {
            $converted = iconv('utf-8', $code, $str);
            if ($converted !== false) {
                $str = $converted;
            }
        }

        return $str;
    }

    /**
     * Get the character set of the site.
     *
     * The value is the one detected on the raw response at fetch time, so it
     * does not change after the body has been converted to UTF-8.
     *
     * @return string|false Lower-case charset name, or false when the page
     *                      could not be fetched or no charset was detected.
     */
    public function getCharset()
    {
        if (!$this->open($this->url)) {
            return false;
        }

        return $this->charset;
    }

    /**
     * Detect the charset of a raw response: <meta> declaration first, then
     * the HTTP headers, then mbstring's byte-level detection.
     *
     * @param string $html    Raw, unconverted response body.
     * @param mixed  $headers Response header lines (expected to be an array).
     * @return string|false Lower-case charset name, or false when unknown.
     */
    private function detectCharset($html, $headers)
    {
        // 1. Charset declared in the HTML; the match is confined to a single <meta> tag.
        if (preg_match('/<meta[^>]+?charset=[^\w]?([-\w]+)/i', $html, $temp)) {
            $declared = strtolower($temp[1]);
            if (in_array($declared, $this->charset_arr, true)) {
                // A gb2312 declaration is only trusted when the bytes agree with it.
                if ($declared != 'gb2312' || $this->t($html, $declared) == $declared) {
                    return $declared;
                }
            }
        }

        // 2. Charset announced in the response headers.
        if (!empty($headers) && is_array($headers)) {
            $hstr = strtolower(implode("|||", $headers));
            if (preg_match("/charset=[^\w]?([-\w]+)/is", $hstr, $lang) && $lang[1] != "") {
                return $lang[1];
            }
        }

        // 3. Let mbstring guess from the bytes.
        $encode_arr = array("UTF-8","GB2312","GBK","BIG5","ASCII","EUC-JP","Shift_JIS","CP936","ISO-8859-1","JIS","eucjp-win","sjis-win");
        $encoded = mb_detect_encoding($html, $encode_arr);

        return $encoded ? strtolower($encoded) : false;
    }
}

?>

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值