最终版:PHP 使用 curl_multi 并发(俗称“多线程”)请求数据的 function
/**
 * Fetch the NEEQ trade-info page for many stock codes concurrently with curl_multi.
 *
 * One POST request (fields "page" => "0", "zqzm" => code) is issued per entry of
 * $code_list; all requests run in parallel on a single multi handle.
 *
 * @param array      $code_list Stock codes to query; the array keys are reused as result keys.
 * @param int|string $timeout   Per-request timeout in seconds.
 * @return array Response bodies as returned by curl_multi_getcontent(), keyed like $code_list.
 */
public function curl_http($code_list,$timeout){
    $res = array();
    $conn = array();
    $mh = curl_multi_init();

    foreach ($code_list as $k => $v) {
        $post_data = [
            "page" => "0",
            "zqzm" => $v,
        ];
        $conn[$k] = curl_init();
        curl_setopt($conn[$k], CURLOPT_URL, 'http://www.neeq.com.cn/nqxyxxController/nqxygkxxPage.do');
        curl_setopt($conn[$k], CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($conn[$k], CURLOPT_POST, true);
        curl_setopt($conn[$k], CURLOPT_POSTFIELDS, $post_data);
        // Callers pass the timeout as a string ('10'); cURL expects an integer.
        curl_setopt($conn[$k], CURLOPT_TIMEOUT, (int)$timeout);
        curl_setopt($conn[$k], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($conn[$k], CURLOPT_MAXREDIRS, 7);
        // Response headers are not needed in the body.
        curl_setopt($conn[$k], CURLOPT_HEADER, 0);
        curl_setopt($conn[$k], CURLOPT_FOLLOWLOCATION, 1);
        curl_multi_add_handle($mh, $conn[$k]);
    }

    // Drive all transfers to completion. Between rounds, block in
    // curl_multi_select() until there is socket activity instead of spinning
    // on curl_multi_exec(), which would burn a full CPU core.
    $active = null;
    do {
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        if ($active && $mrc == CURLM_OK) {
            if (curl_multi_select($mh, 1.0) === -1) {
                // select() could not wait on any descriptor; back off briefly
                // so this loop does not degrade into a busy loop.
                usleep(1000);
            }
        }
    } while ($active && $mrc == CURLM_OK);

    // Per-transfer results are only reported through the multi handle's
    // message queue; log failures instead of silently dropping them.
    while (($info = curl_multi_info_read($mh)) !== false) {
        if ($info['result'] !== CURLE_OK) {
            $failed_key = array_search($info['handle'], $conn, true);
            error_log("curl_http: request [" . var_export($failed_key, true) . "] failed with cURL error " . $info['result']);
        }
    }

    foreach ($conn as $k => $ch) {
        $res[$k] = curl_multi_getcontent($ch);
        // Detach from the multi handle before closing the easy handle.
        curl_multi_remove_handle($mh, $ch);
        curl_close($ch);
    }
    curl_multi_close($mh);

    return $res;
}
下面是运行测试用的接口,会把抓取到的数据写入数据库。注:写入数据库的部分是 ThinkPHP 5(TP5)框架的写法。
/**
 * Crawl trade info for every stock code in batches and insert each returned
 * record into the "sale_info" table (ThinkPHP 5 Db facade).
 *
 * Codes are fetched 100 at a time through curl_http(); the method pauses
 * 10 seconds between batches because the remote endpoint can only serve a
 * few hundred requests in a short period. The id of every inserted row is
 * echoed, surrounded by newlines.
 */
public function index1(){
    header("Content-type: text/html; charset=utf-8");
    set_time_limit(0); // never time out; the full crawl is long-running

    // NEEQ stock codes (about 12,000 in total; list abbreviated here).
    $all_code_list=[
        400002,
        400005,
        400006,
        400007,
        400008,
        400009,
        400010,
        400011,
        400012,
        400013,
        400016, // ... about 12,000 codes in total
    ];

    $num = 100; // batch size
    $count_i = ceil(count($all_code_list) / $num);

    for ($i = 0; $i < $count_i; $i++) {
        $now_code_list = array_slice($all_code_list, $i * $num, $num);
        $res = $this->curl_http($now_code_list, '10');

        foreach ($res as $k => $v) {
            // Bodies of 155 bytes or fewer are treated as "no data" responses.
            if (!is_string($v) || strlen($v) <= 155) {
                continue;
            }

            // The endpoint wraps its JSON as null([...]); strip the wrapper
            // characters from both ends before decoding.
            $data = ltrim($v, 'null([');
            $data = rtrim($data, "])");
            $data = json_decode($data, true);
            if (!is_array($data) || !isset($data["content"]) || !is_array($data["content"])) {
                continue;
            }

            foreach ($data["content"] as $k1 => $v1) {
                $add = array();
                $add["stock_code"] = $v1["HQZQDM"];
                $add["name"] = $v1["HQZQJC"];
                $add["price"] = $v1["HQCJJG"];
                $add["number"] = $v1["HQCJSL"];
                $add["jine"] = $v1["HQCJJE"];
                $add["buy"] = $v1["HQBJYDY"];
                $add["sale"] = $v1["HQSJYDY"];
                $add["time"] = $v1["HQJSRQ"];
                $add["add_time"] = time();

                // Keep the insert id in its own variable so the batch result
                // array being iterated is not overwritten.
                $insert_id = Db::name("sale_info")->insertGetId($add);
                echo("
".$insert_id."
");
            }
        }

        // Throttle between batches; no need to wait after the last one.
        if ($i + 1 < $count_i) {
            sleep(10);
        }
    }
}
注:最开始的做法是用 for 循环一万多次、逐个调用下面的 curl_get_jyinfo,但发现一个问题:每次运行到四百多条的时候就会报 500 错误,然后只能手动从断点处重新请求。网上百度到的说法不一,有人说是 Apache 的问题,也有人说和内存占用之类的有关;试过 PHP 的 ob_flush、flush 等方法都没有用。后来在群友的建议下百度了一个多进程(即上文 curl_multi 并发请求)的教程,附上原 blog 的链接: https://www.cnblogs.com/zhanghu/p/5635519.html
顺便附上最开始单次 curl 请求的写法:
/**
 * Fetch trade info for a single stock code with one blocking cURL POST.
 *
 * @param mixed $code Stock code sent as the "zqzm" form field.
 * @return array|false The decoded response when it reports totalElements > 0;
 *                     false on a cURL error, an undecodable body, or an empty result.
 */
public function curl_get_jyinfo($code){
    $timeout = 5; // seconds

    // Form fields to send.
    $post_data = [
        "page" => "0",
        "zqzm" => $code,
    ];

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, 'http://www.neeq.com.cn/nqxyxxController/nqxygkxxPage.do');
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);
    // Return the body instead of printing it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_TIMEOUT, $timeout);

    $output = curl_exec($ch);
    $errNo = curl_errno($ch);
    $errMsg = curl_error($ch);
    // Close the handle on every path so no early return can leak it.
    curl_close($ch);

    if ($errNo) {
        error_log("Error [$errNo]: " . $errMsg);
        return false;
    }

    // The endpoint wraps its JSON as null([...]). Strip the wrapper from BOTH
    // ends: the right trim must run on the already left-trimmed string,
    // otherwise the prefix survives and json_decode() fails.
    $data = ltrim($output, 'null([');
    $data = rtrim($data, "])");
    $ret = json_decode($data, true);

    // Anything that did not decode to an array counts as "no result".
    if (!is_array($ret)) {
        return false;
    }

    if (isset($ret["totalElements"]) && $ret["totalElements"] > 0) {
        return $ret;
    }
    return false;
}