这只是一个很简单的页面过滤示例:百度百科的页面结构若有变动,本方法就会失效。这里仅作为一份简单的笔记,有需要的人可以借鉴参考。
<?php
/**
 * Fetches a Baidu Baike entry page and extracts its summary, basic-info
 * table and body text with regular expressions.
 *
 * The extraction relies on class names found in Baidu's markup (for example
 * "main-content", "lemma-summary", "basic-info cmn-clearfix"); when that
 * markup changes, the patterns below stop matching and the corresponding
 * fields come back as null.
 */
class BaiKe
{
    /**
     * Base URL that the (encoded) search term is appended to.
     */
    private $BaiKeRequestUrl;

    /**
     * Initialises the base request URL.
     */
    public function __construct()
    {
        $this->BaiKeRequestUrl = 'https://baike.baidu.com/item/';
    }

    /**
     * Performs a GET request and returns the response body.
     *
     * @param string $url Fully built request URL.
     * @return string|false The body when the final HTTP status is 200 and the
     *                      body is non-empty; false in every other case.
     */
    private function SendRequest($url)
    {
        $ch = null;
        try {
            $ch = curl_init();
            if ($ch === false) {
                return false;
            }
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            // Return the body instead of echoing it.
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            // NOTE(review): certificate and host verification are disabled, as
            // in the original code. This exposes the request to
            // man-in-the-middle tampering; enable both if a CA bundle is
            // available in the deployment environment.
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
            curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);

            $body = curl_exec($ch);
            $info = curl_getinfo($ch);

            $statusOk = is_array($info)
                && isset($info['http_code'])
                && (int) $info['http_code'] === 200;

            if ($statusOk && is_string($body) && $body !== '') {
                return $body;
            }
            return false;
        } catch (Throwable $e) {
            // LIB_Logger is declared outside this file. The failure is
            // reported to the caller as false, matching the other error paths
            // of this method.
            LIB_Logger::error($e);
            return false;
        } finally {
            // Release the handle on every exit path.
            if ($ch) {
                curl_close($ch);
            }
        }
    }

    /**
     * Looks up a term on Baidu Baike and returns the parsed entry.
     *
     * Result codes:
     *  - '4000': the search parameter is missing or unusable
     *  - '4001': the request failed or the entry page could not be recognised
     *  - '4002': success; 'abstract', 'name' and 'content' are present
     *            (each may be null when its section was not found)
     *
     * @param mixed $search Term to look up; must be a non-boolean scalar that
     *                      is not blank after trimming.
     * @return array Always contains 'code' and 'message'.
     */
    public function getSearchReturnBkCont($search)
    {
        // Only non-boolean scalars can be a search term. Terms such as "1",
        // "yes" or "on" are valid and must not be mistaken for booleans.
        if (is_bool($search) || !is_scalar($search) || trim((string) $search) === '') {
            return $this->buildStatus('4000', '搜索参数不能为空');
        }

        $term = trim((string) $this->BaiKeSpecialWord((string) $search));
        if ($term === '') {
            return $this->buildStatus('4000', '搜索参数不能为空');
        }

        $requestUrl = $this->BaiKeRequestUrl . $this->encodePathSegments($term);
        $response = $this->SendRequest($requestUrl);
        if (!is_string($response) || $response === '') {
            return $this->buildStatus('4001', '搜索内容百科暂未收录');
        }

        return $this->parseEntryPage($response, $term);
    }

    /**
     * Hook for mapping special search words before the request is made.
     *
     * Add project-specific replacements here. By default the text is returned
     * unchanged so that the search term reaches the request URL.
     *
     * @param string $text
     * @return mixed|string
     */
    private function BaiKeSpecialWord($text)
    {
        return $text;
    }

    /**
     * Builds the minimal result array shared by all outcomes.
     *
     * @param string $code
     * @param string $message
     * @return array
     */
    private function buildStatus($code, $message)
    {
        return array('code' => $code, 'message' => $message);
    }

    /**
     * Percent-encodes a term for use in a URL path, keeping "/" separators.
     *
     * @param string $term
     * @return string
     */
    private function encodePathSegments($term)
    {
        return implode('/', array_map('rawurlencode', explode('/', $term)));
    }

    /**
     * Extracts the result array from a fetched entry page.
     *
     * @param string $html Full page HTML.
     * @param string $term Processed (unencoded) search term.
     * @return array
     */
    private function parseEntryPage($html, $term)
    {
        // Greedy on purpose: captures from the main-content opening tag up to
        // the last closing div in the document.
        $mainPattern = "/<div class=\"main-content\".*?>(.*)<\/div>/ism";
        if (preg_match($mainPattern, $html, $mainMatch) !== 1 || empty($mainMatch[0])) {
            return $this->buildStatus('4001', '搜索内容百科暂未收录');
        }

        $main = $mainMatch[0];
        $result = $this->buildStatus('4002', '查询成功');
        $result['abstract'] = $this->extractAbstract($main);
        $result['name'] = $this->extractBasicInfo($main);
        $result['content'] = $this->extractContent($main, $term);

        return $result;
    }

    /**
     * Returns the plain-text summary, or null when no summary block matches.
     *
     * @param string $main Main-content HTML.
     * @return string|null
     */
    private function extractAbstract($main)
    {
        $pattern = "/<div class=\"lemma-summary\".*?>.{1}<\/div>/ism";
        if (preg_match($pattern, $main, $match) !== 1 || empty($match[0])) {
            return null;
        }
        return $this->toPlainText($match[0]);
    }

    /**
     * Returns the basic-info table as a list of key/value pairs, or null when
     * the block is not found.
     *
     * @param string $main Main-content HTML.
     * @return array|null List of array('key' => ..., 'value' => ...).
     */
    private function extractBasicInfo($main)
    {
        $pattern = "/<div class=\"basic-info cmn-clearfix\".*?>(.*?)<\/div>/ism";
        if (preg_match($pattern, $main, $match) !== 1 || empty($match[1])) {
            return null;
        }

        $text = preg_replace('/<sup[^>]*>(.*?)<\/sup>/is', '', strip_tags($match[1], '<sup>'));

        // Clean each line before testing for emptiness, so whitespace-only
        // lines do not become empty cells that shift the key/value pairing.
        // Comparing with '' (rather than empty()) keeps a literal "0" cell.
        $cells = array();
        foreach (explode("\n", (string) $text) as $line) {
            $cell = trim(str_replace(array(" "), array(""), $line));
            if ($cell !== '') {
                $cells[] = $cell;
            }
        }

        // Consecutive cells are key, value, key, value, ...; a trailing
        // unpaired cell is dropped.
        $pairs = array();
        foreach (array_chunk($cells, 2) as $chunk) {
            if (count($chunk) === 2) {
                $pairs[] = array('key' => $chunk[0], 'value' => $chunk[1]);
            }
        }
        return $pairs;
    }

    /**
     * Returns the plain-text body of the entry, or null when no body matches.
     *
     * @param string $main Main-content HTML.
     * @param string $term Processed (unencoded) search term.
     * @return string|null
     */
    private function extractContent($main, $term)
    {
        $matched = preg_match("/<div class=\"para-title level-2\".*?>(.*)<\/div>/ism", $main, $match) === 1
            && !empty($match[0]);

        if (!$matched) {
            // No level-2 headings: fall back to the first paragraph found at
            // or after the basic-info block (or anywhere, if that block is
            // absent).
            $anchor = strpos($main, "basic-info cmn-clearfix");
            $tail = ($anchor === false) ? $main : substr($main, $anchor);
            $matched = preg_match("/<div class=\"para\".*?>(.*)<\/div>/ism", $tail, $match) === 1
                && !empty($match[0]);
        }

        if (!$matched) {
            return null;
        }

        $html = $match[0];

        // Strip picture blocks, album links and edit buttons.
        $removals = array(
            '/<div[^>]*?class="lemma-picture text-pic layout-right"[^>]*>(.*?)<\/div>/is',
            '/<div[^>]*?class="lemma-picture text-pic layout-center"[^>]*>(.*?)<\/div>/is',
            '/<div[^>]*?class="lemma-picture text-pic layout-left"[^>]*>(.*?)<\/div>/is',
            '/<a[^>]*?class="lemma-album layout-right nslog:10000206"[^>]*>(.*?)<\/a>/is',
            '/<a[^>]*?class="edit-icon j-edit-link"[^>]*>(.*?)<\/a>/is',
        );
        foreach ($removals as $removal) {
            $cleaned = preg_replace($removal, '', $html);
            // preg_replace yields null on a PCRE error; keep the previous text
            // in that case.
            if (is_string($cleaned)) {
                $html = $cleaned;
            }
        }

        // Cut at the first trailing-section marker that is present, tried in
        // priority order. If none is present the whole text is kept.
        $markers = array('词条图册', '参考资料', '词条标签', trim($term . '图册'));
        $cut = $this->findCutOffset($html, $markers);
        if ($cut !== null) {
            $html = substr($html, 0, $cut);
        }

        return $this->toPlainText($html);
    }

    /**
     * Finds the byte offset of the first marker (in list order) that occurs
     * after the start of the text.
     *
     * @param string   $text
     * @param string[] $markers Markers in priority order.
     * @return int|null Offset, or null when no marker occurs past position 0.
     */
    private function findCutOffset($text, array $markers)
    {
        foreach ($markers as $marker) {
            if ($marker === '') {
                continue;
            }
            $pos = strpos($text, $marker);
            // A marker at offset 0 is treated like "not found", so the next
            // marker is tried instead.
            if ($pos !== false && $pos > 0) {
                return $pos;
            }
        }
        return null;
    }

    /**
     * Converts an HTML fragment to text: removes all tags, drops <sup>
     * elements together with their content, and strips the space character.
     *
     * @param string $html
     * @return string
     */
    private function toPlainText($html)
    {
        $text = preg_replace('/<sup[^>]*>(.*?)<\/sup>/is', '', strip_tags($html, '<sup>'));
        // NOTE(review): the stripped character is reproduced as it appears in
        // this file; confirm whether a non-breaking space or "&nbsp;" was
        // intended.
        return str_replace(array(" "), array(""), (string) $text);
    }
}