最近公司有个需求：查询 URL 是否被百度收录。第一版是单线程逐条请求，速度太慢，100 条数据要 20 多秒；后来改用 curl_multi 并发请求（单线程内同时发起多个请求，严格来说并不是多线程），速度快了很多，1000 条数据大概 1 秒左右。有需要的朋友可以测试一下。
/**
 * Fetch several URLs concurrently through a single cURL multi handle.
 *
 * Each request sends a desktop Chrome User-Agent header, follows up to
 * 7 redirects and returns the body only (no response headers).
 *
 * @param array $array URLs to fetch; the array keys are reused as the keys of the result.
 * @return array Response bodies keyed like $array, exactly as returned by
 *               curl_multi_getcontent() for each handle.
 */
function curl_multi($array = [])
{
    $mh = curl_multi_init();
    $headers = [
        "User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
    ];

    // One easy handle per URL, all attached to the same multi handle.
    $conn = [];
    foreach ($array as $k => $url) {
        $conn[$k] = curl_init($url);
        // NOTE(review): CURLOPT_TIMEOUT is expressed in seconds, so this allows
        // ~33 minutes per request; CURLOPT_TIMEOUT_MS may have been intended.
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, 2000);
        curl_setopt($conn[$k], CURLOPT_HEADER, 0); // body only, no response headers
        curl_setopt($conn[$k], CURLOPT_HTTPHEADER, $headers);
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1); // return the content after redirects
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7); // maximum number of redirects to follow
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): TLS certificate and host verification are disabled here,
        // which makes the requests vulnerable to man-in-the-middle tampering.
        curl_setopt($conn[$k], CURLOPT_SSL_VERIFYPEER, 0);
        curl_setopt($conn[$k], CURLOPT_SSL_VERIFYHOST, 0);
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Drive all transfers until none is active or the multi stack reports an error.
    // The status returned by curl_multi_exec() only covers the multi stack as a
    // whole; an individual transfer may still have failed when it is CURLM_OK.
    $active = null;
    do {
        $mrc = curl_multi_exec($mh, $active);
        if ($mrc === CURLM_OK && $active > 0 && curl_multi_select($mh) === -1) {
            // select() could not wait on anything; pause briefly and call
            // curl_multi_exec() again instead of spinning without progress.
            usleep(1000);
        }
    } while ($mrc === CURLM_CALL_MULTI_PERFORM || ($mrc === CURLM_OK && $active > 0));

    // Collect the bodies, then detach and close every easy handle.
    $rs = [];
    foreach ($conn as $k => $handle) {
        $rs[$k] = curl_multi_getcontent($handle);
        // Per-URL business logic can be added here.
        curl_multi_remove_handle($mh, $handle);
        curl_close($handle);
    }
    curl_multi_close($mh);

    return $rs;
}
// Batch driver: reads one URL per line from ./11.txt, sends the URLs to
// curl_multi() in batches of $size, and prints the elapsed wall-clock seconds.
ini_set("memory_limit", "2048M");
ini_set("max_execution_time", "3600");
$start = microtime(true);
$str = file_get_contents('./11.txt');
// file_get_contents() already emits a warning when the file cannot be read;
// treat that case as an empty URL list instead of requesting an empty URL.
$lines = $str === false ? [] : preg_split('/\r\n|\r|\n/', $str);
if ($lines === false) {
    $lines = [];
}
// Accept CRLF, LF and CR line endings and drop blank lines (e.g. the one
// produced by a trailing newline) so no empty URL is ever requested.
$arr = array_values(array_filter(array_map('trim', $lines), function ($line) {
    return $line !== '';
}));
$size = 1000;
// array_chunk() yields exactly ceil(count / $size) batches, so no extra
// empty batch is dispatched at the end.
foreach (array_chunk($arr, $size) as $urlArr) {
    curl_multi($urlArr);
}
$end = microtime(true);
echo '<br/>';
echo $end - $start;