异步爬虫类

1.主要使用了curl_multi 这个特性.

2.对整个操作进行了封装,调用方法如下:

// $urls is a list of URL strings. $ret collects the decoded body of every
// successfully fetched page, keyed by its URL; it is initialised up front so
// it is an array even when no page could be fetched.
$ret = [];
SCatcher::batch($urls, function (array $info) use (&$ret) {
    $ret[$info['url']] = $info['result'];
});
3.类定义如下:

<?php

/**
 * Concurrent ("asynchronous") page fetcher built on PHP's curl_multi API.
 *
 * Requests are queued with push() (or all at once through batch()) and fetched
 * with at most $synNum transfers in flight. Every response body is passed
 * through a GBK-to-UTF-8 conversion; for each transfer that finishes without a
 * cURL error and with HTTP status 200, the request's callback is invoked.
 *
 * User: 蓝冰大侠
 * Date: 2017/11/9
 * Time: 10:10
 */
class SCatcher
{
    // Per-request timeout in seconds (applied as CURLOPT_TIMEOUT).
    private static $timeout = 10;

    // Maximum number of simultaneous transfers. When one finishes, the next
    // queued request takes its slot. Values below 1 are treated as 1.
    public $synNum = 20;

    // Pending requests. run() takes entries from the end of this array (LIFO).
    private $urlStack = [];

    // User-agent strings; one is picked at random for every request.
    private static $agents = [
        'Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 1.1.4322; CIBA; InfoPath.2; SE 2.X MetaSr 1.0; AskTB5.6; SE 2.X MetaSr 1.0)',
        'ia_archiver (+http://www.alexa.com/site/help/webmasters; crawler@alexa.com)',
        'Mozilla/5.0 (compatible; YoudaoBot/1.0; http://www.youdao.com/help/webmaster/spider/; )',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; MyIE9; .NET CLR 2.0.50727; InfoPath.1; SE 2.X MetaSr 1.0)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; InfoPath.2)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Trident/4.0; EmbeddedWB 14.52 from: http://www.bsalsa.com/ EmbeddedWB 14.52; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; Shuame; Shuame)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.5 Safari/536.11',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; .NET CLR 1.1.4322; InfoPath.1)',
        'Mozilla/5.0 (Windows NT 5.1; rv:27.0) Gecko/20100101 Firefox/27.0',
        'Mozilla/5.0 (compatible; JikeSpider; +http://shoulu.jike.com/spider.html)',
        'Mozilla/4.0 (compatible; MSIE 6.0b; Windows NT 5.1; DigExt)',
        'Mozilla/5.0 (compatible; MJ12bot/v1.4.4; http://www.majestic12.co.uk/bot.php?+)',
        'msnbot-media/1.1 (+http://search.msn.com/msnbot.htm)',
        // The value is sent as the User-Agent header, so it must not carry its
        // own "User-Agent:" prefix.
        'Mozilla/5.0 (compatible; MSIE 6.0;Windows XP)',
        'Mozilla/5.0 (compatible; CompSpyBot/1.0; +http://www.compspy.com/spider.html)',
        '360spider-image',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.21 (KHTML, like Gecko) spider6 Safari/537.21',
        'NIS Nutch Spider/Nutch-1.7',
        'Baiduspider',
        'Mozilla/5.0 (compatible; CompSpyBot/1.0; +http://www.compspy.com/spider.html)',
        'Mozilla/5.0 (compatible; Ezooms/1.0; help@moz.com)',
        'Mozilla/5.0(compatible;+Sosospider/2.0;++http://help.soso.com/webspider.htm)',
        'Mozilla/5.0 (compatible; YYSpider; +http://www.yunyun.com/spider.html)',
        'Mozilla/5.0 (compatible; ZumBot/1.0; http://help.zum.com/inquiry)',
    ];

    /**
     * Create a cURL easy handle for a URL with the crawler's default options.
     *
     * @param string $url URL to fetch
     * @return resource|\CurlHandle|false the handle, or false if curl_init() failed
     */
    private static function curlInit($url)
    {
        $ch = curl_init($url);
        if ($ch === false) {
            return false;
        }

        // Pick a random user agent for this request.
        $agent = self::$agents[array_rand(self::$agents)];

        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true, // return the body instead of printing it
            CURLOPT_TIMEOUT => self::$timeout,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_USERAGENT => $agent,
            CURLOPT_REFERER => '', // no referer
            CURLOPT_COOKIE => '', // no cookies
            CURLOPT_FOLLOWLOCATION => false, // redirects are not followed

            // Request headers modelled on those of an ordinary browser request.
            CURLOPT_HTTPHEADER => [
                "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language:zh-CN,zh;q=0.8",
                "Connection:keep-alive",
            ],
        ]);

        return $ch;
    }

    /**
     * Build a fully configured handle for one queued request.
     *
     * Used for the initial batch and for refills alike, so the proxy and HTTPS
     * settings are applied consistently to every request.
     *
     * @param array $info request description; 'url' is required, 'proxy' is optional
     * @return resource|\CurlHandle|false
     */
    private static function createHandle(array $info)
    {
        $url = $info['url'];
        $ch = self::curlInit($url);
        if ($ch === false) {
            return false;
        }

        // Route through a proxy server when the request names one.
        if (isset($info['proxy'])) {
            curl_setopt($ch, CURLOPT_PROXY, $info['proxy']);
        }

        // For https URLs certificate verification is switched off, as in the
        // original implementation. NOTE(review): this accepts any certificate;
        // confirm that is acceptable for the sites being crawled.
        if (strncasecmp($url, 'https', 5) === 0) {
            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        }

        return $ch;
    }

    /**
     * Key that identifies a cURL handle in the in-flight pool.
     *
     * cURL handles are resources before PHP 8 and CurlHandle objects from
     * PHP 8 on; converting an object to a string would throw, so the resource
     * id or the object id is used instead.
     *
     * @param resource|\CurlHandle $ch
     * @return int
     */
    private static function handleKey($ch)
    {
        return is_object($ch) ? spl_object_id($ch) : (int)$ch;
    }

    /**
     * Report a failed request.
     *
     * Uses the dump() helper when one is defined (it is not declared in this
     * file and is presumably provided by the surrounding project); otherwise
     * the message goes to PHP's error log.
     *
     * @param string $url    URL of the failed request
     * @param string $detail failure description
     */
    private static function report($url, $detail)
    {
        $message = "\r\n" . 'URL : ' . $url . "\r\n" . $detail . "\r\n";
        if (function_exists('dump')) {
            dump($message);
        } else {
            error_log($message);
        }
    }

    /**
     * Queue one request.
     *
     * @param array $info request description: 'url' (required by run()), optionally
     *                    'proxy' (passed to CURLOPT_PROXY) and 'callback' (invoked with
     *                    the result array after a successful fetch)
     */
    public function push(array $info)
    {
        array_push($this->urlStack, $info);
    }

    /**
     * Take queued requests until one has been added to the multi handle.
     *
     * Entries without a 'url', and entries whose handle cannot be created or
     * added, are skipped so that a single bad entry cannot stall the queue.
     *
     * @param resource|\CurlMultiHandle $multi the multi handle
     * @param array $pool in-flight transfers keyed by handleKey(); updated in place
     * @return bool true if a transfer was started, false if the queue is empty
     */
    private function startNext($multi, array &$pool)
    {
        while (($info = array_pop($this->urlStack)) !== null) {
            if (!isset($info['url'])) {
                continue;
            }

            $ch = self::createHandle($info);
            if ($ch === false) {
                self::report($info['url'], 'curl_init failed');
                continue;
            }

            if (curl_multi_add_handle($multi, $ch) !== CURLM_OK) {
                curl_close($ch);
                self::report($info['url'], 'curl_multi_add_handle failed');
                continue;
            }

            $pool[self::handleKey($ch)] = ['ch' => $ch, 'info' => $info];
            return true;
        }

        return false;
    }

    /**
     * Evaluate one finished transfer and invoke its callback on success.
     *
     * The array handed to the callback is the queued request plus 'info'
     * (curl_getinfo()), 'errno', 'error', 'code' (HTTP status) and 'result'
     * (body after the GBK-to-UTF-8 conversion).
     *
     * @param resource|\CurlHandle $ch finished handle
     * @param array $params the queued request description
     */
    private static function complete($ch, array $params)
    {
        $params['info'] = curl_getinfo($ch);
        $params['errno'] = curl_errno($ch);
        $params['error'] = curl_error($ch);
        $params['code'] = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $params['result'] = self::iconv(curl_multi_getcontent($ch));

        // Transport-level failure (possibly caused by the proxy).
        if ($params['errno']) {
            // 28 is cURL's "operation timed out" error code.
            $error = $params['errno'] == 28
                ? 'timeout of ' . self::$timeout . ' s'
                : $params['error'];
            self::report($params['url'], 'Curl errno:' . $params['errno'] . '    Curl error: ' . $error);
            return;
        }

        // Any status other than 200 (redirects, 4xx, 5xx) counts as a failure.
        if ((int)$params['code'] !== 200) {
            self::report($params['url'], 'Code : ' . $params['code'] . ' Length : ' . strlen($params['result']));
            return;
        }

        // Success: hand the result to the request's callback, if it has one.
        if (isset($params['callback']) && is_callable($params['callback'])) {
            $callback = $params['callback'];
            $callback($params);
        }
    }

    /**
     * Fetch everything that is currently queued.
     *
     * @return bool false if the queue was empty and nothing was done, true after a
     *              crawl pass has run (callers loop until false is returned)
     */
    public function run()
    {
        // Nothing queued.
        if (empty($this->urlStack)) {
            return false;
        }

        $multi = curl_multi_init();

        // In-flight transfers: handleKey() => ['ch' => handle, 'info' => request].
        $pool = [];

        // Start the initial window of transfers. A limit below 1 would start
        // nothing and never drain the queue, so it is clamped to 1.
        $limit = max(1, (int)$this->synNum);
        while (count($pool) < $limit && $this->startNext($multi, $pool)) {
            // startNext() does the work.
        }

        $active = 0;

        // The loop is driven by the pool rather than by $active, because
        // $active is stale right after a refill: a transfer added after the
        // last running one finished must still be executed.
        while (count($pool) > 0) {
            do {
                $status = curl_multi_exec($multi, $active);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            // A multi-level error will not go away by retrying; stop here
            // instead of spinning.
            if ($status !== CURLM_OK) {
                break;
            }

            // Collect every transfer that has finished.
            $handled = 0;
            while ($done = curl_multi_info_read($multi)) {
                $handled++;
                $ch = $done['handle'];
                $key = self::handleKey($ch);

                if (isset($pool[$key])) {
                    $info = $pool[$key]['info'];
                    unset($pool[$key]);
                    self::complete($ch, $info);
                }

                // Always release the handle, whether the request succeeded or
                // failed, then give its slot to the next queued request.
                curl_multi_remove_handle($multi, $ch);
                curl_close($ch);
                $this->startNext($multi, $pool);
            }

            if ($active > 0) {
                // Wait up to 0.05 s (50 ms) for socket activity instead of
                // busy-looping. When select itself fails (-1), sleep briefly.
                if (curl_multi_select($multi, 0.05) === -1) {
                    usleep(1000);
                }
            } elseif ($handled === 0) {
                // Nothing is running and nothing finished: no further progress
                // is possible.
                break;
            }
        }

        // Release any transfers still registered after an early exit.
        foreach ($pool as $entry) {
            curl_multi_remove_handle($multi, $entry['ch']);
            curl_close($entry['ch']);
        }

        curl_multi_close($multi);

        return true;
    }

    /**
     * Convert a GBK-encoded body to UTF-8.
     *
     * Empty input and input that is already valid UTF-8 (which includes plain
     * ASCII) is returned unchanged, so UTF-8 pages are not mangled by a second
     * conversion. If the conversion yields nothing, the input is returned.
     *
     * @param string|null $str raw response body
     * @return string
     */
    private static function iconv($str)
    {
        $str = (string)$str;
        if ($str === '' || mb_check_encoding($str, 'UTF-8')) {
            return $str;
        }

        $ret = mb_convert_encoding($str, 'utf-8', 'gbk');
        if ($ret) {
            return $ret;
        }

        return $str;
    }

    /**
     * Fetch a list of URLs concurrently.
     *
     * @param array $rows        URLs to fetch
     * @param callable $callback invoked once per successfully fetched URL with the
     *                           result array (see complete())
     */
    public static function batch(array $rows, callable $callback)
    {
        $syn = new self();
        foreach ($rows as $v) {
            $syn->push(['url' => $v, 'callback' => $callback]);
        }

        $begin = microtime(true);

        // run() blocks while transfers are in flight; repeat until it reports
        // an empty queue.
        while ($syn->run()) {
            // keep going
        }

        // SDebug is not declared in this file (presumably a project helper);
        // timing is recorded only when the class is available.
        if (class_exists('SDebug')) {
            SDebug::setNet('Batch URL', count($rows), 'something', microtime(true) - $begin);
        }
    }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值