网页自动更换标题代码php,抓取网页标题关键字描述和网页内容并自动转码

# $url:网址 #$encode:编码

/**
 * Download a page with cURL and extract its title, meta description,
 * meta keywords and body text, converted to the requested encoding.
 *
 * NOTE(review): the tag-matching patterns in the published listing had their
 * markup stripped (they read as "//iU", "@]*>@i", ...). The patterns used
 * here are reconstructions of the evident intent - confirm on real pages.
 *
 * @param string $url    Address of the page to fetch.
 * @param string $encode Encoding the extracted text is converted to.
 *
 * @return array Map with the keys content_type, charset, title, description,
 *               keywords, body and httpcode. httpcode is 505 when the
 *               transfer itself failed; the text fields stay '' unless the
 *               response is a 200 with a text/html content type.
 */
function htmload($url,$encode='UTF-8'){
    $pageinfo = array(
        'content_type' => '',
        'charset'      => '',
        'title'        => '',
        'description'  => '',
        'keywords'     => '',
        'body'         => '',
        'httpcode'     => 200,
    );

    $ch = curl_init();
    if ($ch === false) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): certificate checks are switched off, as in the original,
    // so HTTPS responses are not authenticated. Enable both options if the
    // fetched content is ever trusted.
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    curl_setopt($ch, CURLOPT_TIMEOUT, 8);
    curl_setopt($ch, CURLOPT_FILETIME, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // Cap redirect chains so a redirect loop cannot use up the whole timeout.
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    curl_setopt($ch, CURLOPT_URL, $url);

    $store = curl_exec($ch);

    // Read everything needed from the handle first, then release it once, so
    // no return path below can leak it.
    $failed      = ($store === false || curl_errno($ch) !== 0);
    $httpcode    = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $contentType = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($failed) {
        $pageinfo['httpcode'] = 505;
        return $pageinfo;
    }

    $pageinfo['httpcode']     = $httpcode;
    $pageinfo['content_type'] = $contentType;

    if (intval($httpcode) !== 200 || stripos($contentType, 'text/html') === false) {
        return $pageinfo;
    }

    // Prefer the charset announced in the Content-Type response header.
    if (preg_match('#charset\s*=\s*["\']?([^\s;"\']+)#i', $contentType, $matches) === 1) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // Drop HTML comments and inline scripts before looking for tags.
    $cleaned = preg_replace(
        array('#<!--.*?-->#s', '#<script\b[^>]*>.*?</script>#si'),
        '',
        $store
    );
    if ($cleaned !== null) {
        // preg_replace yields null on a PCRE failure (e.g. backtrack limit);
        // in that case keep working on the unfiltered markup.
        $store = $cleaned;
    }
    $store = str_replace('&nbsp;', '', $store);

    // Fall back to a <meta ... charset=...> declaration inside the document.
    if ($pageinfo['charset'] === ''
        && preg_match('@<meta\b[^>]*\bcharset\s*=\s*["\']?([\w\-]+)@i', $store, $matches) === 1) {
        $pageinfo['charset'] = trim($matches[1]);
    }

    // The attribute value is captured from inside its quotes, so no slash
    // clean-up of a trailing "/>" is needed afterwards.
    $pageinfo['description'] = get_encoding(htmload_meta_content($store, 'description'), $encode);
    $pageinfo['keywords']    = get_encoding(htmload_meta_content($store, 'keywords'), $encode);

    if (preg_match('#<title\b[^>]*>(.*?)</title>#si', $store, $matches) === 1) {
        $pageinfo['title'] = get_encoding(trim($matches[1]), $encode);
    }

    if (preg_match('#<body\b[^>]*>(.*?)</body>#si', $store, $matches) === 1) {
        // Convert first and escape last: addslashes() works on bytes and would
        // split GBK/Big5 characters whose second byte is 0x5C (a backslash).
        $pageinfo['body'] = addslashes(clhtml(get_encoding($matches[1], $encode)));
    }

    return $pageinfo;
}

/**
 * Return the content attribute of the first <meta name="..."> tag with the
 * given name, accepting either attribute order. Internal helper of htmload().
 *
 * @param string $html Markup to search.
 * @param string $name Value of the name attribute to look for.
 *
 * @return string Trimmed content value, or '' when no such tag is found.
 */
function htmload_meta_content($html, $name){
    $quoted  = preg_quote($name, '#');
    $nameAttr = '\bname\s*=\s*["\']?' . $quoted . '(?=["\'\s/>])';
    // Branch reset (?|...) keeps the captured value in group 1 for both
    // quote styles.
    $contentAttr = '\bcontent\s*=\s*(?|"([^"]*)"|\'([^\']*)\')';

    $patterns = array(
        '#<meta\b[^>]*' . $nameAttr . '[^>]*' . $contentAttr . '#si',
        '#<meta\b[^>]*' . $contentAttr . '[^>]*' . $nameAttr . '#si',
    );

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $html, $found) === 1) {
            return trim($found[1]);
        }
    }

    return '';
}

#清理内容空格转义字符及js代码

/**
 * Reduce an HTML fragment to bare text.
 *
 * All whitespace and "&nbsp;" entities are removed first, then <script>
 * elements together with their contents, then any remaining tags, and finally
 * named or numeric character entities such as "&amp;" or "&#39;".
 *
 * The matching is byte-oriented (no "u" modifier), so it can be applied to
 * text that has not been converted to UTF-8 yet.
 *
 * @param string $document HTML fragment to clean.
 *
 * @return string The cleaned text; '' when nothing but whitespace was given.
 */
function clhtml($document){
    $document = trim((string) preg_replace('/\s|&nbsp;/', '', (string) $document));

    if ($document === '') {
        return $document;
    }

    $search = array(
        // <script ...> ... </script>, including the script source itself
        '~<script\b[^>]*>.*?</script>~si',
        // every remaining tag, closing tag or <!...> declaration
        '~<[/!]*[^<>]*>~s',
        // named and numeric character entities
        '~&#?\w+;~',
    );

    $cleaned = preg_replace($search, '', $document);

    // preg_replace returns null when PCRE gives up (e.g. backtrack limit);
    // hand back the whitespace-stripped input rather than losing the text.
    return $cleaned === null ? $document : $cleaned;
}

#转码

/**
 * Convert text to the requested encoding after detecting its current one.
 *
 * The source encoding is detected strictly among a fixed candidate list
 * (UTF-8, ASCII, GBK, GB2312, BIG5 and several Japanese encodings). When no
 * candidate fits, or the text is already in the target encoding, the input
 * is returned unchanged.
 *
 * @param string $data Text to convert.
 * @param string $to   Name of the target encoding, e.g. 'UTF-8'.
 *
 * @return string The converted text, or the original text when it could not
 *                be identified or needs no conversion.
 */
function get_encoding($data,$to){
    $data = (string) $data;

    if ($data === '') {
        return $data;
    }

    $encode_arr = array('UTF-8','ASCII','GBK','GB2312','BIG5','JIS','eucjp-win','sjis-win','EUC-JP');

    // Strict mode rejects candidates for which the bytes are not valid,
    // instead of settling for the closest match.
    $encoded = mb_detect_encoding($data, $encode_arr, true);

    if ($encoded === false) {
        // Unknown encoding: passing false on to mb_convert_encoding would
        // raise an error, so leave the bytes as they are.
        return $data;
    }

    if (strcasecmp($encoded, (string) $to) === 0) {
        return $data;
    }

    return mb_convert_encoding($data, $to, $encoded);
}

代码参考了网络上的一部分实现,但网上的版本基本上都有 BUG,我做了一些修改和优化。先留着,说不定将来哪天我自己做搜索引擎的时候能用上,哈哈。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值