/**
* Crawler logic.
* Fetches the HTML content of the source URL and extracts the article data.
*/
public function crawler($centent_url)
{
// Overview: loads the page HTML through $this->getUrlContent(), pulls the
// article body, the images referenced in the body, the title and the
// thumbnails out of it with regular expressions, hands image URLs to
// $this->crabcontentImg() / $this->crabImage(), and returns an array with
// the keys content (base64-encoded), title and img.
// The title falls back to a second match (the meta description, per the
// original comment) when the first title match is empty.
// NOTE(review): several pattern literals in this copy of the method look
// truncated or emptied (the article-body pattern, the first thumbnail
// pattern, the title pattern and the two //iUs patterns) - restore them
// from the original file before relying on this code.
// Example source URL:
//https://www.xiaohongshu.com/discovery/item/5a4ca319a7c9b8481ea24c7e?_at=36df0d880cae739ee71e7e94174a6d7c70351
// Take the content source URL.
$request = $centent_url;
$html = $this->getUrlContent($request);
// print_r($html);exit;
$relwvantart = array();
// Extract the random data-v- suffix attached to the content div.
preg_match_all("/class=\"content\" data-v-(.*)>/iUs",$html,$temp_variable,PREG_PATTERN_ORDER);
// NOTE(review): index [1][0] is read without checking that the pattern matched.
$variable = $temp_variable[1][0];
// $variableimg = $temp_variable[1][1];
// print_r($variableimg);exit;
// Extract the main article content.
// NOTE(review): the pattern literal below appears cut off in this copy - confirm against the original.
preg_match_all("/class=\"content\" data-v-".$variable.">(.*)
// $temp_content = $content[1][0];
$temp_content = "
// print_r($temp_content);exit;
// Extract the data-v- code of the image cells inside the article content.
preg_match_all("/class=\"cell image-cell\" data-v-(.*) data-v-".$variable.">/iUs",$html,$temp_variable_img,PREG_PATTERN_ORDER);
$temp_contentimg = $temp_variable_img[1][0];
// print_r($temp_contentimg);exit;
// Collect the img src values found in the article content.
preg_match_all("/img src=\"(.*)\" data-v-".$temp_contentimg.">/iUs",$temp_content,$temp_img,PREG_PATTERN_ORDER);
$temp_img = $temp_img[1];
// print_r($temp_img);exit;
if(!empty($temp_img)){
// Normalize the image paths.
// Per the original comment: download each image locally and keep the returned image path.
$temp_content_img = array();
foreach ($temp_img as $k => $v){
// Adds the https: scheme in front of //; note that str_replace rewrites
// every // occurrence in the value, not only a leading one.
$temp_v = str_replace('//','https://',$v);
$res = $this->crabcontentImg($temp_v);
$temp_content_img[$k] = $res['save_path'];
}
// Replace the image paths in the content with the stored (OSS, per the original comment) image paths.
foreach ($temp_content_img as $key => $value){
$temp_content = str_replace($temp_img[$key],$value,$temp_content);
}
// Get the thumbnail section content.
// NOTE(review): the pattern literal below appears cut off in this copy - confirm against the original.
preg_match_all("/
$img_url = $temp[1][0];
$img_url = str_replace('//','https://',$img_url);
$res = $this->crabImage($img_url);
$imgurl = $res['save_path'];
// print_r($imgurl);exit;
$relwvantart['img'] = $imgurl;
}
// print_r($content);
// print_r(base64_encode($content));exit;
$relwvantart['content'] = base64_encode($temp_content);
//获取文章标题
preg_match_all("/
(.*)/iUs",$html,$title,PREG_PATTERN_ORDER);
$title = $title[1][0];
$relwvantart['title'] = $title;
if (empty($title)){
//获取meta里的description当标题
preg_match_all("//iUs",$html,$desc,PREG_PATTERN_ORDER);
$desc = $desc[1][0];
$relwvantart['title'] = $desc;
}
// Get the thumbnail section content.
preg_match_all("//iUs",$html,$temp,PREG_PATTERN_ORDER);
$tempicon = $temp[1];
// print_r($tempicon);exit;
$img_url = array();
// Image path handling: replace each ( with https: and then strip each ).
foreach ($tempicon as $k => $v){
$img_url[$k] = str_replace('(','https:',$v);
}
foreach($img_url as $key => $value){
$img_url[$key] = str_replace(')','',$value);
}
// Per the original comment: save each image locally and upload it to OSS.
// NOTE(review): $imgurl is not initialised as an array before this loop and may
// already hold the value assigned inside the earlier if-branch - confirm intent.
foreach ($img_url as $i => $j){
$res = $this->crabImage($j);
// print_r($res);exit;
$imgurl[$i] = $res['save_path'];
}
// print_r($res);
// Overwrites any img value stored by the earlier if-branch.
$relwvantart['img'] = $imgurl;
return $relwvantart;
}