PHP 过滤百度百科页面

这只是一个很简单的页面过滤示例:百度百科的页面结构若有变动,该方法就会失效。这里仅作为一份简单的笔记,有需要的人可以借鉴参考。

<?php
    /**
     * Fetches a Baidu Baike entry page and extracts its summary, info-box
     * fields and body text using regular expressions.
     *
     * The extraction is tied to specific CSS class names in the Baike markup
     * ("main-content", "lemma-summary", "basic-info cmn-clearfix", "para", ...).
     * When the remote markup changes, the affected fields come back as null or
     * the lookup reports "not found".
     */
    class BaiKe
    {
        /**
         * Base URL that the percent-encoded search term is appended to.
         */
        private $BaiKeRequestUrl;

        /**
         * Sets the Baike entry base URL.
         */
        public function __construct()
        {
            $this->BaiKeRequestUrl = 'https://baike.baidu.com/item/';
        }

        /**
         * Performs a GET request and returns the response body.
         *
         * @param string $url Absolute URL to fetch.
         * @return string|false Response body on HTTP 200 with a non-empty body,
         *                      false on any other outcome (including errors).
         */
        private function SendRequest($url)
        {
            $ch = null;
            try {
                $ch = curl_init();
                if ($ch === false) {
                    return false;
                }
                curl_setopt_array($ch, array(
                    CURLOPT_URL            => $url,
                    CURLOPT_HEADER         => false,
                    // Return the body instead of echoing it.
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_FOLLOWLOCATION => true,
                    CURLOPT_MAXREDIRS      => 5,
                    // Certificate checks stay enabled: turning them off lets any
                    // on-path party substitute the page that gets parsed below.
                    CURLOPT_SSL_VERIFYPEER => true,
                    CURLOPT_SSL_VERIFYHOST => 2,
                    // Bound the call so a stalled remote cannot block forever.
                    CURLOPT_CONNECTTIMEOUT => 5,
                    CURLOPT_TIMEOUT        => 15,
                ));
                $body = curl_exec($ch);
                $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);

                if ($status === 200 && is_string($body) && $body !== '') {
                    return $body;
                }
                return false;
            } catch (Throwable $e) {
                // The caller treats false as "entry not available", so report
                // the failure here and keep the documented return contract.
                $this->logFailure($e);
                return false;
            } finally {
                if ($ch) {
                    curl_close($ch);
                }
            }
        }

        /**
         * Looks up a term on Baidu Baike and returns the parsed entry.
         *
         * @param mixed $search Search term; only non-blank strings and numbers
         *                      are accepted.
         * @return array Always contains 'code' and 'message':
         *               - '4000': the search term was empty or not searchable;
         *               - '4001': the page could not be fetched or has no main
         *                         content block;
         *               - '4002': success; additionally contains 'abstract'
         *                         (string|null), 'name' (list of key/value
         *                         pairs|null) and 'content' (string|null).
         */
        public function getSearchReturnBkCont($search)
        {
            // Booleans, null, arrays and objects are not search terms. Strings
            // such as "1", "on" or "0" are legitimate terms and are accepted.
            $term = (is_scalar($search) && !is_bool($search)) ? trim((string) $search) : '';
            if ($term === '') {
                return $this->buildStatus('4000', '搜索参数不能为空');
            }

            // Apply business-specific rewrites; fall back to the raw term if the
            // hook yields nothing so the request never targets the bare base URL.
            $searchCont = trim((string) $this->BaiKeSpecialWord($term));
            if ($searchCont === '') {
                $searchCont = $term;
            }

            $page = $this->SendRequest($this->BaiKeRequestUrl . $this->encodePath($searchCont));
            if (!is_string($page) || $page === '') {
                return $this->buildStatus('4001', '搜索内容百科暂未收录');
            }

            // Isolate the main content block; everything else is parsed from it.
            $found = preg_match("/<div class=\"main-content\".*?>(.*)<\/div>/ism", $page, $main);
            if (!$found || empty($main[0])) {
                return $this->buildStatus('4001', '搜索内容百科暂未收录');
            }

            $result = $this->buildStatus('4002', '查询成功');
            $result['abstract'] = $this->extractAbstract($main[0]);
            $result['name'] = $this->extractBasicInfo($main[0]);
            $result['content'] = $this->extractContent($main[0], $searchCont);

            return $result;
        }

        /**
         * Hook for mapping special search words to the title that should be
         * requested. Adapt to your own business rules.
         *
         * @param string $text Trimmed search term.
         * @return string Raw (not percent-encoded) entry title; by default the
         *                term is returned unchanged.
         */
        private function BaiKeSpecialWord($text)
        {
            return $text;
        }

        /**
         * Builds the base result array shared by every response.
         */
        private function buildStatus($code, $message)
        {
            return array('code' => $code, 'message' => $message);
        }

        /**
         * Percent-encodes each path segment of the term while keeping "/" so
         * titles that include a sub-path remain addressable.
         */
        private function encodePath($term)
        {
            return implode('/', array_map('rawurlencode', explode('/', $term)));
        }

        /**
         * Removes all tags, dropping <sup> elements together with their text.
         */
        private function stripTagsAndSup($html)
        {
            $text = strip_tags($html, '<sup>');
            return (string) preg_replace('/<sup[^>]*>(.*?)<\/sup>/is', '', $text);
        }

        /**
         * Extracts the plain-text summary from the main content block.
         *
         * @return string|null Null when no summary block is matched.
         */
        private function extractAbstract($mainHtml)
        {
            $found = preg_match("/<div class=\"lemma-summary\".*?>.{1}<\/div>/ism", $mainHtml, $m);
            if (!$found || empty($m[0])) {
                return null;
            }
            return str_replace('&nbsp;', '', $this->stripTagsAndSup($m[0]));
        }

        /**
         * Extracts the info-box as a list of ['key' => ..., 'value' => ...].
         *
         * After tag removal the block is read line by line; non-empty lines are
         * taken alternately as key and value. A trailing unpaired line is
         * ignored.
         *
         * @return array|null Null when no info-box block is matched.
         */
        private function extractBasicInfo($mainHtml)
        {
            $found = preg_match("/<div class=\"basic-info cmn-clearfix\".*?>(.*?)<\/div>/ism", $mainHtml, $m);
            if (!$found || empty($m[1])) {
                return null;
            }

            $cells = array();
            foreach (explode("\n", $this->stripTagsAndSup($m[1])) as $line) {
                // Compare against '' rather than empty() so a literal "0" cell
                // is kept and does not shift the key/value pairing.
                if ($line !== '') {
                    $cells[] = str_replace('&nbsp;', '', $line);
                }
            }

            $pairs = array();
            $total = count($cells);
            for ($i = 0; $i + 1 < $total; $i += 2) {
                $pairs[] = array('key' => $cells[$i], 'value' => $cells[$i + 1]);
            }
            return $pairs;
        }

        /**
         * Extracts the plain-text body of the entry.
         *
         * @param string $mainHtml   Main content block.
         * @param string $searchCont Term used for the request; "<term>图册" is
         *                           one of the trailer markers.
         * @return string|null Null when no body block is matched.
         */
        private function extractContent($mainHtml, $searchCont)
        {
            $found = preg_match("/<div class=\"para-title level-2\".*?>(.*)<\/div>/ism", $mainHtml, $m);
            if (!$found || empty($m[0])) {
                // No section headings: take paragraphs starting from the
                // info-box position, or from the start when there is none.
                $offset = strpos($mainHtml, "basic-info cmn-clearfix");
                $tail = ($offset === false) ? $mainHtml : substr($mainHtml, $offset);
                $found = preg_match("/<div class=\"para\".*?>(.*)<\/div>/ism", $tail, $m);
            }
            if (!$found || empty($m[0])) {
                return null;
            }

            // Drop inline pictures, album links and edit buttons.
            $noise = array(
                '/<div[^>]*?class="lemma-picture text-pic layout-right"[^>]*>(.*?)<\/div>/is',
                '/<div[^>]*?class="lemma-picture text-pic layout-center"[^>]*>(.*?)<\/div>/is',
                '/<div[^>]*?class="lemma-picture text-pic layout-left"[^>]*>(.*?)<\/div>/is',
                '/<a[^>]*?class="lemma-album layout-right nslog:10000206"[^>]*>(.*?)<\/a>/is',
                '/<a[^>]*?class="edit-icon j-edit-link"[^>]*>(.*?)<\/a>/is',
            );
            $body = preg_replace($noise, '', $m[0]);
            if ($body === null) {
                $body = $m[0];
            }

            // Cut at the first trailer marker found after the start of the
            // text. When none is present the whole body is kept; the previous
            // str_split() call failed on a false/zero length in that case.
            $markers = array('词条图册', '参考资料', '词条标签', trim($searchCont . '图册'));
            foreach ($markers as $marker) {
                $pos = strpos($body, $marker);
                if ($pos !== false && $pos > 0) {
                    $body = substr($body, 0, $pos);
                    break;
                }
            }

            return str_replace('&nbsp;', '', $this->stripTagsAndSup($body));
        }

        /**
         * Reports a request failure.
         *
         * Uses the project's LIB_Logger when that class is loadable (assumed to
         * expose a static error() method, as the original code called it) and
         * PHP's error log otherwise.
         */
        private function logFailure($e)
        {
            if (class_exists('LIB_Logger')) {
                LIB_Logger::error($e);
                return;
            }
            error_log('BaiKe request failed: ' . $e->getMessage());
        }
    }

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值