需要说明的是搜狗通过关键词搜索出来的结果有100页那么多,采集的话也是可以实现的。
但是我的项目需求是首先通过搜狗的关键词推荐API获取某个关键词的所有长尾词入库,
以此来生成文章的一个标题,文章内容就是通过这个标题搜索公众号文章,
从100页文章中随机抽一篇文章获取到文章内容然后入库。
项目需求相当于通过长尾关键词自动生成文章
/**
 * Fetch the body of one randomly chosen WeChat official-account article
 * found through a Sogou WeChat search for the given keyword.
 *
 * Despite its name, this function issues plain GET requests and does not
 * send or receive JSON. The name is kept for existing callers.
 *
 * Steps:
 *   1. Request the Sogou search result page for the keyword.
 *   2. Pick one article link at random from the matched results.
 *   3. Append the "k" / "h" query parameters derived from the link itself.
 *   4. Request a static Sogou image (headers only) to collect Set-Cookie values.
 *   5. Request the link with those cookies and rebuild the real article URL
 *      from the string fragments in the first inline <script> of the response.
 *   6. Download the article and cut out the content container.
 *
 * Side effect: libcurl writes the cookies of the search request to
 * "cookie.txt" in the current working directory.
 *
 * @param string $str Search keyword; it is URL-encoded before use.
 * @return string|null The extracted article HTML, or null when any step
 *                     fails (request error, no results, no real URL found).
 */
function http_post_json($str)
{
    $search_url = 'https://weixin.sogou.com/weixin?type=2&query='.urlencode($str).'&ie=utf8&s_from=input&_sug_=y&_sug_type_=';

    // No manual "accept-encoding" header here: CURLOPT_ENCODING '' (set below)
    // makes libcurl advertise exactly the encodings it is able to decode.
    $browser_headers = array(
        "content-type: text/html; charset=utf-8",
        "accept-language: zh-CN,zh;q=0.9,en;q=0.8",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    );

    /* 1. search result page */
    $search_html = _sogou_http_get($search_url, array(
        CURLOPT_COOKIEJAR  => 'cookie.txt',
        CURLOPT_ENCODING   => '',
        CURLOPT_HTTPHEADER => $browser_headers,
    ));
    if ($search_html === null) {
        return null;
    }
    // NOTE(review): the source-encoding list is tried in this order; confirm
    // that GBK-first is really wanted for a page requested with ie=utf8.
    $search_html = mb_convert_encoding($search_html, 'utf-8', 'GBK,UTF-8,ASCII');

    /* 2. pick a random article link */
    $listing = (string) get_between($search_html, 'news-list', 'pagebar_container');
    preg_match_all('#href="([\s\S]*?)" uigs="article_image[\s\S]*?href="[\s\S]*?" id="[\s\S]*?href="[\s\S]*?" data-headimage#is', $listing, $found);
    // $found always holds one sub-array per capture group, so the number of
    // results is the size of group 1, not the size of $found itself.
    $links = isset($found[1]) ? $found[1] : array();
    if (count($links) === 0) {
        return null;
    }
    $href = $links[mt_rand(0, count($links) - 1)];

    // The href comes out of raw HTML, so entities such as &amp; are decoded
    // first; a leading slash is dropped to avoid "//" after the host.
    $url = 'https://weixin.sogou.com/'.ltrim(html_entity_decode($href, ENT_QUOTES, 'UTF-8'), '/');

    /* 3. append the k / h parameters */
    $url_param_pos = strpos($url, 'url=');
    if ($url_param_pos !== false) {
        $k = mt_rand(0, 100);
        // "h" is the single character located 4 (length of "url=") + 21 + k
        // bytes after the start of "url=".
        // NOTE(review): the constant 21 is taken over unchanged from the
        // original code; its origin is not visible here.
        $h = (string) substr($url, $url_param_pos + 4 + 21 + $k, 1);
        $url .= "&k=".$k."&h=".$h;
    }

    /* 4. collect cookies from a header-only request */
    $probe_headers = _sogou_http_get('https://weixin.sogou.com/new/wap/images/app_spread.png?v=f23bdb0e', array(
        CURLOPT_HEADER => true,
        CURLOPT_NOBODY => true,
    ));
    $cookie_parts = array();
    if ($probe_headers !== null
        && preg_match_all("/set\-cookie:([^\r\n]*); expires/i", $probe_headers, $cookie_matches)) {
        $cookie_parts = $cookie_matches[1];
    }
    // Fixed cookie names with empty placeholder values; fill in real values
    // obtained from a browser session if needed. Appended exactly once.
    $static_cookies = 'SUV=; weixinIndexVisited=1; Hm_lvt_cdce8cda34e84469b1c8015204129522=1629900161; SMYUV=; UM_distinctid=; SNUID=;';
    $cookie = '';
    if (count($cookie_parts) > 0) {
        $cookie = implode(';', $cookie_parts).';';
    }
    $cookie .= $static_cookies;

    /* 5. request the link and rebuild the real article URL */
    $jump_html = _sogou_http_get($url, array(
        CURLOPT_COOKIE         => $cookie,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_ENCODING       => '',
        CURLOPT_HTTPHEADER     => array_merge(array("referer: https://weixin.sogou.com"), $browser_headers),
    ));
    if ($jump_html === null) {
        return null;
    }
    $jump_html = mb_convert_encoding($jump_html, 'utf-8', 'GBK,UTF-8,ASCII');

    // The response script assembles the target URL with a series of
    // "+= '...';" statements; concatenating the quoted pieces restores it.
    $script = (string) get_between($jump_html, '<script>', '</script>');
    preg_match_all("#\+\= '(.*?)';#is", $script, $pieces);
    $real_url = isset($pieces[1]) ? implode('', $pieces[1]) : '';

    // The URL was extracted from remote content: only fetch it when it is a
    // plain http(s) address, never a local path or another stream wrapper.
    if ($real_url === '' || !preg_match('#^https?://#i', $real_url)) {
        return null;
    }

    /* 6. download the article and cut out the content container */
    $article_html = file_get_contents($real_url);
    if ($article_html === false) {
        return null;
    }
    $result = (string) get_between($article_html, '<div class="rich_media_content " id="js_content" style="visibility: hidden;">', 'var first_sceen__time = (+new Date());');
    // Remove up to three <script> blocks, then the attribute that hides the content.
    $newstr = preg_replace("/<script[\s\S]*?<\/script>/i", "", $result, 3);
    $newstr = preg_replace('/style="visibility: hidden;"/i', "", (string) $newstr);

    return $newstr;
}

/**
 * Perform one GET request with cURL and return the raw response.
 *
 * Internal helper of http_post_json().
 *
 * @param string $url     Address to request.
 * @param array  $options Extra CURLOPT_* => value pairs; they take precedence
 *                        over the defaults set here.
 * @return string|null Response text (headers and/or body, depending on the
 *                     options), or null when the handle cannot be created or
 *                     the transfer fails.
 */
function _sogou_http_get($url, array $options)
{
    $ch = curl_init();
    if ($ch === false) {
        return null;
    }
    curl_setopt_array($ch, $options + array(
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
    ));
    $response = curl_exec($ch);
    curl_close($ch);

    return is_string($response) ? $response : null;
}
/**
 * Return the text located between two delimiters.
 *
 * The first occurrence of $start is located, then the first occurrence of
 * $end that follows it; the bytes in between are returned without the
 * delimiters themselves. Offsets are byte-based (strpos/substr).
 *
 * @param string $input Text to search.
 * @param string $start Opening delimiter.
 * @param string $end   Closing delimiter.
 * @return string The enclosed text, or '' when either delimiter is missing.
 */
function get_between($input, $start, $end) {
    $input = (string) $input;

    $start_pos = strpos($input, $start);
    if ($start_pos === false) {
        // Without this check false would be used as offset 0 and unrelated
        // text would be returned.
        return '';
    }
    $from = $start_pos + strlen($start);

    // Search for the closing delimiter only after the opening one, so an
    // earlier occurrence of $end cannot produce an empty or negative span.
    $end_pos = strpos($input, $end, $from);
    if ($end_pos === false) {
        return '';
    }

    return substr($input, $from, $end_pos - $from);
}
最后小程序效果: