php脚本爬取头像图片

因为要插入系统用户,所以用 PHP 爬取百度图片上的头像 URL,再把图片存储到本地。速度一般,1000 张图片差不多要花半个多小时,不知道是不是因为 file_get_contents 函数的缘故,或者是因为没有并发抓取(例如同时开多个 php-fpm 进程),没有仔细研究优化。要提高脚本速度,可以从多进程/多线程,以及异步网络请求加回调这两个方向去解决。以下是代码:
<?php
// Entry script: query the Baidu image-search JSON endpoint a few times and
// hand every thumbnail URL found in the responses to putImgToLocal().
error_reporting(E_ALL ^ E_NOTICE);
set_time_limit(0); // downloads are slow; do not let the script time out

// Number of API requests (result pages) to issue in this run.
$pageNum = 3;

// Candidate values for the API's "pn" (result offset) parameter.
// Hoisted out of the loop because it never changes between iterations.
$pnArr = array('0','30','60','90','120','150','180','210','240','270','300','330','360');

for ($page = 0; $page < $pageNum; $page++) {
    // Random sampling: pick a random offset and (below) a random query so that
    // repeated runs are less likely to fetch the same pictures. Adding more
    // query URLs to $urlArr lowers the chance of duplicates further.
    // NOTE(review): rand(0, 5) only ever selects the first six offsets of
    // $pnArr (0..150); kept as in the original - confirm whether the remaining
    // offsets were meant to be reachable.
    $rand = rand(0, 5);

    // Baidu avatar image-search API URLs, one per search phrase.
    $urlArr = array(
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%B5%B7%E8%BE%B9&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%B5%B7%E8%BE%B9&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=3c&1476431870063=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E5%8D%8A%E8%BA%AB&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E5%8D%8A%E8%BA%AB&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=1e&1476431965788=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E8%90%9D%E8%8E%89&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E8%90%9D%E8%8E%89&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=3c&1476432025419=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E7%94%B7%E7%94%9F&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%BA%BA%E7%89%A9%E5%BD%A2%E8%B1%A1+%E7%94%B7%E7%94%9F&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=5a&1476432073843=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E7%BB%8F%E5%85%B8%E5%8A%A8%E4%BD%9C+%E5%98%9F%E5%98%B4&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E7%BB%8F%E5%85%B8%E5%8A%A8%E4%BD%9C+%E5%98%9F%E5%98%B4&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=3c&1476432128601=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%A4%E7%8B%AC&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%A4%E7%8B%AC&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=1e&1476432211175=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+90%E5%90%8E&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+90%E5%90%8E&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=1e&1476432252392=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%A3%AE%E7%B3%BB&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%A3%AE%E7%B3%BB&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=1e&1476432318321=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%97%E6%AF%8D&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E5%AD%97%E6%AF%8D&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=5a&1476432384197=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E5%8D%A1%E9%80%9A%E5%8A%A8%E6%BC%AB+%E7%BE%8E%E5%B0%91%E5%A5%B3%E6%88%98%E5%A3%AB&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E5%8D%A1%E9%80%9A%E5%8A%A8%E6%BC%AB+%E7%BE%8E%E5%B0%91%E5%A5%B3%E6%88%98%E5%A3%AB&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=78&1476432429562=",
        "http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%80%A7%E6%84%9F&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E5%A4%B4%E5%83%8F+%E4%B8%8D%E5%90%8C%E9%A3%8E%E6%A0%BC+%E6%80%A7%E6%84%9F&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=". $pnArr[$rand] ."&rn=30&gsm=3c&1476432470204="
        );

    $index = array_rand($urlArr, 1);

    // A failed request returns false (PHP has already emitted a warning);
    // skip this page instead of feeding false into json_decode().
    $rawJson = file_get_contents($urlArr[$index]);
    if ($rawJson === false) {
        continue;
    }

    // json_decode() yields null on malformed input, and the payload is not
    // guaranteed to carry a "data" list, so validate before iterating.
    $imgJsonData = json_decode($rawJson, true);
    if (!is_array($imgJsonData) || !isset($imgJsonData['data']) || !is_array($imgJsonData['data'])) {
        error_log('avatar crawler: unexpected API response on request ' . $page);
        continue;
    }

    foreach ($imgJsonData['data'] as $dataArr) {
        // Entries without a usable thumbURL are skipped; !empty() keeps the
        // original truthiness test but does not warn on a missing key.
        if (is_array($dataArr) && !empty($dataArr['thumbURL'])) {
            putImgToLocal($dataArr['thumbURL']);
        }
    }
}

/**
 * Download one image over HTTP(S) and save it as a .jpg file inside
 * ./userSysAvatorUrl (the directory is created on first use).
 *
 * The file name has the form "<3 letters>_<millisecond timestamp>.jpg".
 *
 * @param string $url Absolute http:// or https:// URL of the image.
 * @return bool True when the image was downloaded and written; false when
 *              the URL was rejected, the directory could not be created, the
 *              download failed or returned no data, or the file could not be
 *              written. (The original returned nothing, so callers that
 *              ignore the result are unaffected.)
 */
function putImgToLocal($url) {
    // The URL comes from a remote JSON payload. Only accept http/https so a
    // bad value can never make file_get_contents() read a local path or use
    // another stream wrapper.
    if (!is_string($url) || $url === '') {
        return false;
    }
    $scheme = parse_url($url, PHP_URL_SCHEME);
    if (!is_string($scheme) || !in_array(strtolower($scheme), array('http', 'https'), true)) {
        return false;
    }

    // Make sure the download directory exists. The trailing is_dir() covers
    // the case where something else created it between the check and mkdir().
    $saveDir = './userSysAvatorUrl';
    if (!is_dir($saveDir) && !mkdir($saveDir, 0777, true) && !is_dir($saveDir)) {
        return false;
    }

    // Browser-like request headers.
    // - No explicit "Host" header: the HTTP wrapper derives it from $url. The
    //   previous hard-coded "img0.imgtn.bdimg.com" was sent even when the
    //   thumbnail lived on a different host.
    // - "Connection: close" instead of "keep-alive": each file_get_contents()
    //   call opens its own connection, so keep-alive cannot be reused and may
    //   leave the read waiting for the server to drop the idle connection.
    $headers = array(
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3",
        "Connection: close",
        "Referer: http://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gbk&word=%CD%B7%CF%F1&fr=ala&oriquery=%E5%A4%B4%E5%83%8F&ala=1&alatpl=portait&pos=0",
        "User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
        "X-Requested-With: XMLHttpRequest",
    );
    $context = stream_context_create(array(
        'http' => array(
            'header'  => implode("\r\n", $headers),
            // Bound each download so one unresponsive host cannot stall the run.
            'timeout' => 15,
        ),
    ));

    // Download first; a failed or empty response must not leave a 0-byte
    // .jpg behind.
    $imgBin = file_get_contents($url, false, $context);
    if ($imgBin === false || $imgBin === '') {
        return false;
    }

    // File name: three distinct lowercase letters plus the current time in
    // milliseconds. The timestamp is kept as the string produced by sprintf()
    // so its digits do not depend on float-to-string conversion settings.
    $millis = sprintf('%.0f', microtime(true) * 1000);
    $letters = range('a', 'z');
    $picked = array_rand($letters, 3);
    $file_name = $letters[$picked[0]] . $letters[$picked[1]] . $letters[$picked[2]] . '_' . $millis . '.jpg';

    return file_put_contents($saveDir . '/' . $file_name, $imgBin) !== false;
}
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值