抓取网页标题、关键字、描述和网页内容，并自动转码

最新推荐文章于 2022-10-10 16:25:33 发布

weixin_34219944

最新推荐文章于 2022-10-10 16:25:33 发布

阅读量441

点赞数

原文链接：https://my.oschina.net/biejun/blog/366066

版权

为什么80%的码农都做不了架构师？>>>

# Fetch a web page and extract its title, keywords, description and body text.
#
# $url:    address of the page to fetch
# $encode: target encoding for the extracted text (default UTF-8)
#
# Returns an array with the keys:
#   content_type - Content-Type reported by the server ('' if none)
#   charset      - charset from the Content-Type header, else from a <meta> tag ('' if unknown)
#   title        - contents of <title>, converted to $encode
#   description  - meta description, converted to $encode
#   keywords     - meta keywords, converted to $encode
#   body         - <body> text cleaned by clhtml(), converted to $encode, then addslashes()-escaped
#   httpcode     - HTTP status code; 505 is used as the marker for a cURL-level failure
# When the request fails, the status is not 200, or the response is not text/html,
# the text fields are left as empty strings.
function htmload($url,$encode='UTF-8'){
        $pageinfo = array();
        $pageinfo['content_type'] = '';
        $pageinfo['charset'] = '';
        $pageinfo['title'] = '';
        $pageinfo['description'] = '';
        $pageinfo['keywords'] = '';
        $pageinfo['body'] = '';
        $pageinfo['httpcode'] = 200;

        $ch = curl_init();
        if ($ch === false) {
            $pageinfo['httpcode'] = 505;
            return $pageinfo;
        }
        curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)");
        curl_setopt($ch, CURLOPT_RETURNTRANSFER,1);
        // NOTE(review): TLS host/peer verification is switched off, so HTTPS
        // responses are not authenticated. Kept as-is so that pages with broken
        // certificates remain fetchable; confirm this is acceptable for the caller.
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER,0);
        curl_setopt($ch, CURLOPT_TIMEOUT, 8);
        curl_setopt($ch, CURLOPT_FILETIME, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_URL,$url);

        $store = curl_exec($ch);
        if ($store === false || curl_error($ch)) {
            // Release the handle on the failure path as well.
            curl_close($ch);
            $pageinfo['httpcode'] = 505;
            return $pageinfo;
        }

        // Read everything needed from the handle, then close it before any early return.
        $pageinfo['httpcode'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $content_type = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        $pageinfo['content_type'] = $content_type;
        curl_close($ch);

        if (intval($pageinfo['httpcode']) !== 200 || !preg_match('@text/html@', $content_type)) {
            return $pageinfo;
        }

        // Charset from the Content-Type header, e.g. "text/html; charset=gb2312".
        if (preg_match('#charset=([^\s;]+)#i', $content_type, $matches)) {
            $pageinfo['charset'] = trim($matches[1], " \t\n\r\"'");
        }

        // Drop markup that would pollute the extraction: scripts, <link> tags,
        // comments, style blocks and full-width spaces. A pattern that fails
        // (preg_replace() returning null) is skipped instead of wiping the page.
        $strip_patterns = array(
            '/<script.*>.*<\/script>/smUi',
            '/<link\s+[^>]+>/smUi',
            '/<!--.*-->/smUi',
            '/<style.*>(.*)<\/style>/smUi',
            '/　/',
        );
        foreach ($strip_patterns as $pattern) {
            $cleaned = preg_replace($pattern, '', $store);
            if ($cleaned !== null) {
                $store = $cleaned;
            }
        }

        // Fall back to a <meta ... charset=...> declaration when the header had none.
        if ($pageinfo['charset'] == ''
            && preg_match('@<meta[^>]+charset=["\']?([\w\-]+)@i', $store, $matches)) {
            $pageinfo['charset'] = trim($matches[1]);
        }

        // NOTE(review): forward slashes are removed from description and keywords,
        // as in the original code; the intent is unclear (possibly a mangled
        // escape sequence) -- confirm before relying on it.
        if (preg_match('/<meta\s+name=\"description\"\s+content=\"(.*)\"\s?\/?>/iU', $store, $matches)) {
            $desc = trim($matches[1]);
            $pageinfo['description'] = get_encoding(str_replace("/", '', $desc), $encode);
        }

        if (preg_match('/<meta\s+name=\"keywords\"\s+content=\"(.*)\"\s?\/?>/iU', $store, $matches)) {
            $keywords = trim($matches[1]);
            // Transcoded like title and description so all text fields share one encoding.
            $pageinfo['keywords'] = get_encoding(str_replace("/", '', $keywords), $encode);
        }

        if (preg_match('/<title>(.*)<\/title>/smUi', $store, $matches)) {
            $pageinfo['title'] = get_encoding(trim($matches[1]), $encode);
        }

        if (preg_match('/<body.*>(.*)<\/body>/smUi', $store, $matches)) {
            // Transcode first, escape second: addslashes() works byte-wise and would
            // otherwise split GBK/BIG5 characters whose trail byte is 0x5C ("\").
            $pageinfo['body'] = addslashes(get_encoding(clhtml($matches[1]), $encode));
        }

        return $pageinfo;
}
# Reduce an HTML fragment to bare text: removes all whitespace (including the
# full-width space), <script> blocks, every remaining tag, and HTML entities
# such as &nbsp; / &amp; / &#160;.
#
# $document: HTML fragment (null is treated as an empty string)
# Returns the cleaned text, or '' when the input is empty or a pattern fails.
function clhtml($document){
        // Whitespace is removed first, so the later patterns never need to match it.
        $compact = preg_replace("/\s|　/", "", (string) $document);
        if ($compact === null) {
            return '';
        }
        $document = trim($compact);
        if ($document === '') {
            return $document;
        }

        $search = array(
            '~<script[^>]*?>.*?</script>~si',   // script blocks including their code
            '~<[/!]*?[^<>]*?>~si',              // any remaining tag, closing tag or declaration
            '~&#?\w+;~',                        // named and numeric HTML entities
        );

        $result = preg_replace($search, '', $document);

        // preg_replace() yields null on a PCRE error; report "no text" explicitly
        // instead of suppressing the failure with @.
        return $result === null ? '' : $result;
}
# Convert text to the requested encoding, detecting the source encoding from a
# fixed candidate list.
#
# $data: text to convert (null is treated as an empty string)
# $to:   target encoding name, e.g. 'UTF-8'
# Returns the converted text; the input is returned unchanged when it is empty
# or when none of the candidate encodings matches.
function get_encoding($data,$to){
    $data = (string) $data;
    if ($data === '') {
        return $data;
    }

    $encode_arr = array('UTF-8','ASCII','GBK','GB2312','BIG5','JIS','eucjp-win','sjis-win','EUC-JP');

    // Strict mode: only accept an encoding in which the bytes are fully valid,
    // so GBK input is not mistaken for UTF-8.
    $encoded = mb_detect_encoding($data, $encode_arr, true);
    if ($encoded === false) {
        // Unknown source encoding: converting would fail, so leave the bytes untouched.
        return $data;
    }

    return mb_convert_encoding($data, $to, $encoded);
}

代码参考了网络上的部分实现，但网上流传的版本基本都有 BUG，我做了修改和优化。留着，说不定将来某一天我做搜索引擎的时候能用上，哈哈。

转载于:https://my.oschina.net/biejun/blog/366066