cURL 的 multi 接口(curl_multi_*)支持并发抓取多个网页(单线程内的非阻塞并发,并非真正的多线程),示例代码如下:
/**
 * Fetches several URLs concurrently through PHP's curl_multi_* interface.
 *
 * All transfers are driven from the calling thread; the concurrency comes
 * from libcurl's non-blocking multi handle, not from real threads.
 */
class HttpMulti
{
    /**
     * cURL options applied to every easy handle created by multiRun().
     *
     * @var array
     */
    private static $options = array(
        // NOTE(review): this disables TLS peer-certificate verification, so
        // HTTPS responses are NOT authenticated. Value kept unchanged for
        // backward compatibility; consider enabling it with a CA bundle.
        CURLOPT_SSL_VERIFYPEER => 0,
        // Return the response body as a string instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        // Connection timeout, in seconds.
        CURLOPT_CONNECTTIMEOUT => 10,
        // Maximum total time a single transfer may take, in seconds.
        CURLOPT_TIMEOUT => 20,
        // Value of the "Accept-Encoding" request header; an empty string
        // would advertise every encoding libcurl supports.
        CURLOPT_ENCODING => "gzip",
        // When enabled, the response headers are included in the returned body.
        CURLOPT_HEADER => 0,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        // Requests are sent with GET by default.
        CURLOPT_POST => false,
    );

    /**
     * Fetch every URL in $urlData concurrently and return the bodies.
     *
     * @param array $urlData URLs to fetch; the keys are preserved in the result.
     *
     * @return array|null Map of key => value of curl_multi_getcontent() for
     *                    that transfer, or false for an entry whose easy
     *                    handle could not be created. Null when $urlData is
     *                    empty.
     */
    public static function multiRun($urlData = array())
    {
        if (empty($urlData)) {
            return null;
        }

        $data = $curls = array();
        $mh = curl_multi_init();

        foreach ($urlData as $k => $val) {
            $ch = curl_init($val);
            if ($ch !== false) {
                curl_setopt_array($ch, self::$options);
                curl_multi_add_handle($mh, $ch);
            }
            // Failed handles are remembered too so the result keeps the
            // caller's key order.
            $curls[$k] = $ch;
        }

        // Drive all transfers until none is active any more.
        self::execMultiHandle($mh);

        foreach ($curls as $k => $ch) {
            if ($ch === false) {
                $data[$k] = false;
                continue;
            }

            $data[$k] = curl_multi_getcontent($ch);
            // Detach from the multi handle first, then release the easy handle.
            curl_multi_remove_handle($mh, $ch);
            curl_close($ch);
        }

        // Close the multi handle exactly once, after every easy handle has
        // been collected and detached.
        curl_multi_close($mh);

        return $data;
    }

    /**
     * Run the transfers attached to a multi handle until they are finished
     * or curl_multi_exec() reports a status other than CURLM_OK.
     *
     * @param mixed $mh Handle returned by curl_multi_init().
     *
     * @return bool False when $mh is empty or the last status was not
     *              CURLM_OK, true otherwise.
     */
    private static function execMultiHandle($mh)
    {
        if (empty($mh)) {
            return false;
        }

        $active = 0;
        while (true) {
            do {
                $mrc = curl_multi_exec($mh, $active);
            } while ($mrc === CURLM_CALL_MULTI_PERFORM);

            if ($mrc !== CURLM_OK || !$active) {
                break;
            }

            // curl_multi_select() returning -1 means it could not wait on the
            // transfers. Sleep briefly and call curl_multi_exec() again
            // anyway; skipping the exec call would spin forever because
            // $active would never change.
            if (curl_multi_select($mh) === -1) {
                usleep(1000);
            }
        }

        return $mrc === CURLM_OK;
    }
}
// Demo: fetch a handful of sites concurrently.
$urlData = array(
    'https://www.baidu.com/',
    'https://www.taobao.com/',
    'http://weibo.com/',
    'http://www.qq.com/',
);

// The result is keyed by the same indexes as $urlData.
$res = HttpMulti::multiRun($urlData);