nodejs使用async来进行优化

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/davidsu33/article/details/52770355

原来的爬取方式会同时向服务器发起大量连接,从而导致连接错误。改为使用 async 的 queue 之后,始终最多只有两个下载任务处于激活状态,就不会再出现这个问题了。

未使用async前的代码:

/*
 使用request + cheerio来爬取zngirls网站上的数据
 */
const request = require('request');
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const url = require('url');
const util = require('util');
const path = require('path');
const process = require('process');
const events = require('events');
const EventEmitter = events.EventEmitter;

/**
 * Crawler for the galleries of a single model on zngirls.com.
 *
 * An earlier experiment kept five download slots tracked by boolean flags,
 * a pending-request cache and an EventEmitter that released a slot when a
 * download finished; none of that is wired in any more.
 *
 * @param {number} girlID - ID substituted into the `%d` of `girlUrlFmt`.
 */
function Crawl(girlID) {
    Object.assign(this, {
        girlID: girlID,
        girlUrlFmt: 'http://www.zngirls.com/girl/%d/',
    });
}

Crawl.prototype = {
    start: function () {
        //爬取个人主页
        var self = this;
        request.get(this.U(this.getGirlUrl()), function (err, response, body) {
            if (err) {
                console.error('错误信息:', err);
            } else {
                var $ = cheerio.load(body);

                $('.igalleryli_link').each(function (i) {
                    var link = $(this);
                    var href = link.attr('href');

                    //根据gallery的地址来继续爬取gallery
                    var hrefID = href.match(/\/g\/(\d+)/)[1];
                    var downDir = path.join('' + self.girlID, hrefID);
                    var hostname = url.parse(response.request.href).hostname;
                    var galleryUrl = url.format({
                        hostname: hostname,
                        pathname: href,
                        protocol: 'http',
                    });

                    //创建下载目录(如果不存在)
                    var arrDir = downDir.split(path.sep);
                    var startDir = arrDir.shift();
                    while (true) {
                        if (!fs.existsSync(startDir)) {
                            fs.mkdirSync(startDir);
                        }

                        if (arrDir.length === 0) break;
                        startDir = startDir + path.sep + arrDir.shift();
                    }

                    //爬取影集
                    request.get(self.U(galleryUrl), function (err, response, body) {
                        if (err) {
                            console.error('下载错误:' + response.url, err);
                            process.exit(-1);
                        }

                        var $ = cheerio.load(body);
                        var images = $('#hgallery > img');
                        if (images) {
                            var im = $(images[0]);
                            var src = im.attr('src')
                            var preUrl = src.slice(0, src.lastIndexOf('/') + 1);

                            //爬取所有的图片并异步下载
                            $('#dinfo > span').each(function (i) {
                                    var span = $(this);
                                    var matched = span.text().match(/(\d+).*/);
                                    if (matched) {
                                        //该影集的数目
                                        var count = matched[1];
                                        for (var i = 0; i < count; ++i) {
                                            var jpgFile = self.formatIndex(i) + '.jpg';
                                            var jpgUrl = preUrl + jpgFile;
                                            var jpgDownFile = path.join(downDir, jpgFile);

                                            //console.log('下载图片从: ' + jpgUrl + '到: ' + jpgDownFile);
                                            //将下载文件加入到下载队列中
                                            //self.put(jpgUrl, jpgDownFile);
                                            //如果这么下载会导致,和服务器存在大量的连接而导致无法同时下载这么多
                                            //需要想办法减少连接数量
                                            /*
                                             request(self.U(jpgUrl), function (err, response, body) {
                                             //fs.writeFile(jpgDownFile, body, function (err) {
                                             //    if (err) {
                                             //        console.error('写入文件错误:' + jpgDownFile, err);
                                             //    }
                                             //});
                                             fs.writeFileSync(jpgDownFile, body);
                                             });
                                             */
                                            //如果下载的图片太多的话,这边是会出问题的
                                            //var opt = self.U(jpgUrl);
                                            //设置最大的socket连接数目
                                            //opt.pool = {maxSockets: 2};
                                            //request.get(opt).on('error', function (err){
                                            //    console.error('下载错误:', err);
                                            //    process.exit(-1000);
                                            //}).pipe(fs.createWriteStream(jpgDownFile))
                                            function down(jpgUrl, jpgDownFile) {
                                                //=====================================================
                                                /*
                                                 request(self.U(jpgUrl), function (err, response, body) {
                                                 //fs.writeFile(jpgDownFile, body, function (err) {
                                                 //    if (err) {
                                                 //        console.error('写入文件错误:' + jpgDownFile, err);
                                                 //    }
                                                 //});
                                                 fs.writeFileSync(jpgDownFile, body);
                                                 console.log('完成图片下载:' + jpgDownFile);
                                                 });
                                                 */

                                                //=====================================================
                                                //scheme02
                                                console.log('jpgUrl=' + jpgUrl + ' jpgFile=' + jpgDownFile);
                                                var opt = self.U(jpgUrl);
                                                //opt.pool = {maxSockets: 2};
                                                request.get(opt).on('error', function (err) {
                                                    console.error('下载错误:', err);
                                                    process.exit(-1000);
                                                }).pipe(fs.createWriteStream(jpgDownFile));
                                                console.log('完成图片下载:' + jpgDownFile);


                                                //=====================================================
                                                //增加一个函数闭包后没有问题(因为数据进行了复制)
                                                /*scheme03
                                                 var jpgUrlP = url.parse(jpgUrl);
                                                 http.get({
                                                 host: jpgUrlP.host,
                                                 port: 80,
                                                 headers: {
                                                 referer: 'http://www.baidu.com',
                                                 },
                                                 path: jpgUrl,
                                                 }, function (res) {
                                                 var buffers = [];
                                                 res.on('data', function (data) {
                                                 buffers.push(data);
                                                 });

                                                 res.on('end', function () {
                                                 var body = Buffer.concat(buffers);
                                                 fs.writeFileSync(jpgDownFile, body);
                                                 console.log('完成图片下载:' + jpgDownFile);
                                                 });
                                                 });
                                                 */
                                            };

                                            //down(jpgUrl, jpgDownFile);
                                            //使用函数来下载
                                            self.download2(jpgUrl, jpgDownFile);
                                        }
                                        ;
                                    }
                                }
                            );
                        }
                    });
                    console.log('爬取影集执行完毕');
                });
            }
        });

        console.log('个人全部影集执行完毕');
    },

    formatIndex: function (i) {
        var si = i + '';
        if (i === 0) {
            return si;
        }

        while (si.length < 3) {
            si = '0' + si;
        }

        return si;
    },

    getGirlUrl: function () {
        return util.format(this.girlUrlFmt, this.girlID);
    },

    U: function (_url) {
        return {
            url: _url,
            headers: {
                referer: 'http://www.baidu.com',
                connection: 'keep-alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            }
        };
    },

    //将下载内容存放到cache中

    /*
     put: function (_url, filename) {
     this.cache.push({url: _url, filename: filename});
     //process.nextTick(this.downloadTick);
     this.downloadTick();
     },
     downloadTick: function () {
     var self = this;
     if (this.cache.length <= 0) {
     console.log('当前没有缓存任何请求..');
     return;
     }
     for (var i = 0; i < this.cacheFlag.length; ++i) {
     if (!this.cacheFlag[i]) {
     var first = this.cache.shift();
     var flag = i;
     if (first) {
     this.cacheFlag[i] = true;
     this.cacheData[i] = first;
     console.log(first);
     request(self.U(first.url), function (err, response, body) {
     var cacheData = self.cacheData[flag];
     console.log(cacheData);
     console.log('下载文件到:' + cacheData.filename);
     fs.writeFileSync(cacheData.filename, body);
     self.emitter.emit('resetflag', flag)
     });
     }
     }
     }

     },

     resetFlag: function (flagIndex) {
     console.log(this);
     console.log(this.cacheFlag);
     if (flagIndex >= 0 && flagIndex < this.cacheFlag.length)
     this.cacheFlag[flagIndex] = false;
     else
     console.error('错误的标志位' + flagIndex);
     //process.nextTick(this.downloadTick);
     this.downloadTick();
     },
     */

    download: function (_url, filename) {
        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
        var opt = this.U(_url);
        //opt.pool = {maxSockets: 2};
        request.get(opt).on('error', function (err) {
            console.error('下载错误:', err);
            process.exit(-1000);
        }).pipe(fs.createWriteStream(filename));
        //console.log('完成图片下载:' + filename);

        //等待
        var t1 = (new Date()).getTime();
        var t2 = (new Date()).getTime();
        //3秒以后再继续执行
        while((t2-t1) < 4000){
            t2 = (new Date()).getTime();
        };
    },

    download2: function(jpgUrl, jpgDownFile){
        var jpgUrlP = url.parse(jpgUrl);
        http.get({
            host: jpgUrlP.host,
            port: 80,
            headers: {
                referer: 'http://www.baidu.com',
            },
            path: jpgUrl,
        }, function (res) {
            var buffers = [];
            res.on('data', function (data) {
                buffers.push(data);
            });

            res.on('end', function () {
                var body = Buffer.concat(buffers);
                fs.writeFileSync(jpgDownFile, body);
                console.log('完成图片下载:' + jpgDownFile);
(原文排版错乱:上面"未使用async前的代码"在此处被截断,其结尾部分出现在文末;以下是使用async的queue优化后的完整代码)
/*
 使用request + cheerio来爬取zngirls网站上的数据
 */
const request = require('request');
const http = require('http');
const fs = require('fs');
const cheerio = require('cheerio');
const url = require('url');
const util = require('util');
const path = require('path');
const process = require('process');
const events = require('events');
const EventEmitter = events.EventEmitter;
const async = require('async');

/**
 * Crawler for the galleries of a single model on zngirls.com.
 *
 * Image downloads are pushed onto an `async.queue` with a concurrency of 2,
 * so at most two downloads are active at any time.
 *
 * @param {number} girlID - ID substituted into the `%d` of `girlUrlFmt`.
 */
function Crawl(girlID) {
    this.girlID = girlID;
    this.girlUrlFmt = 'http://www.zngirls.com/girl/%d/';

    const self = this;
    this.queue = async.queue(function (task, callback) {
        console.log('url:' + task.url + ' file:' + task.file);
        // download2 always invokes `callback`, on success and on failure,
        // so a failed image cannot permanently occupy a worker slot.
        self.download2(task.url, task.file, callback);
    }, 2);
}

Crawl.prototype = {
    /**
     * Crawl the model's home page, follow every gallery link on it and
     * queue a download task for every image of every gallery.
     *
     * Both log lines that announce completion are printed as soon as the
     * requests have been issued, not when the downloads have finished.
     */
    start: function () {
        const self = this;
        request.get(this.U(this.getGirlUrl()), function (err, response, body) {
            if (err) {
                console.error('错误信息:', err);
                return;
            }

            const $ = cheerio.load(body);
            const hostname = url.parse(response.request.href).hostname;

            $('.igalleryli_link').each(function () {
                const href = $(this).attr('href');

                // Only links of the form /g/<id> are galleries; skip anything
                // else instead of crashing on a null match.
                const idMatch = href ? href.match(/\/g\/(\d+)/) : null;
                if (!idMatch) {
                    console.error('跳过无法解析的影集链接:', href);
                    return;
                }

                const downDir = path.join('' + self.girlID, idMatch[1]);
                const galleryUrl = url.format({
                    hostname: hostname,
                    pathname: href,
                    protocol: 'http',
                });

                // Create <girlID>/<galleryID> one level at a time.
                let dir = '';
                downDir.split(path.sep).forEach(function (part) {
                    dir = dir === '' ? part : dir + path.sep + part;
                    if (!fs.existsSync(dir)) {
                        fs.mkdirSync(dir);
                    }
                });

                // Crawl the gallery page.
                request.get(self.U(galleryUrl), function (err, response, body) {
                    if (err) {
                        // `response` is undefined when the request itself
                        // failed, so report the URL we asked for.
                        console.error('下载错误:' + galleryUrl, err);
                        process.exit(-1);
                        return;
                    }

                    const $ = cheerio.load(body);
                    const images = $('#hgallery > img');
                    const src = $(images[0]).attr('src');
                    if (!src) {
                        console.error('未找到影集图片:' + galleryUrl);
                        return;
                    }

                    // All images of a gallery share the directory part of
                    // the first image's URL.
                    const preUrl = src.slice(0, src.lastIndexOf('/') + 1);

                    $('#dinfo > span').each(function () {
                        const matched = $(this).text().match(/(\d+).*/);
                        if (!matched) {
                            return;
                        }

                        // The first number in the span is taken as the
                        // number of images in the gallery.
                        const count = parseInt(matched[1], 10);
                        for (let index = 0; index < count; ++index) {
                            const jpgFile = self.formatIndex(index) + '.jpg';
                            self.queue.push({
                                url: preUrl + jpgFile,
                                file: path.join(downDir, jpgFile),
                            });
                        }
                    });
                });
                console.log('爬取影集执行完毕');
            });
        });

        console.log('个人全部影集执行完毕');
    },

    /**
     * Build the numeric stem of an image file name.
     *
     * Index 0 yields "0"; any other index is left-padded with zeros to at
     * least three characters (1 -> "001", 12 -> "012", 123 -> "123").
     *
     * @param {number} i - Image index.
     * @returns {string} File-name stem for that index.
     */
    formatIndex: function (i) {
        let si = i + '';
        if (i === 0) {
            return si;
        }

        while (si.length < 3) {
            si = '0' + si;
        }

        return si;
    },

    /**
     * Build the home-page URL of the model by substituting `girlID` into
     * `girlUrlFmt`.
     *
     * @returns {string} The formatted URL.
     */
    getGirlUrl: function () {
        return util.format(this.girlUrlFmt, this.girlID);
    },

    /**
     * Wrap a URL in the options object passed to `request`, attaching the
     * fixed referer, keep-alive and browser User-Agent headers.
     *
     * @param {string} _url - Target URL.
     * @returns {{url: string, headers: Object}} Request options.
     */
    U: function (_url) {
        return {
            url: _url,
            headers: {
                referer: 'http://www.baidu.com',
                connection: 'keep-alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            }
        };
    },

    /**
     * Stream one image from `_url` into the file `filename` using `request`.
     * Completion is logged when the write stream closes; a request error is
     * logged and terminates the process.
     *
     * @param {string} _url - Image URL.
     * @param {string} filename - Destination file path.
     */
    download: function (_url, filename) {
        console.log('jpgUrl=' + _url + ' jpgFile=' + filename);
        request.get(this.U(_url)).on('error', function (err) {
            console.error('下载错误:', err);
            process.exit(-1000);
        }).pipe(fs.createWriteStream(filename)).on('close', function () {
            console.log('完成图片下载:' + filename);
        });
    },

    /**
     * Download one image with the core `http` module, buffer it in memory
     * and write it to `jpgDownFile`.
     *
     * `callback`, when given, is invoked exactly once: with no error after
     * the file has been written, or with an Error when the connection
     * fails, the server answers with a status other than 200, or the file
     * cannot be written. Failures are logged and nothing is written for a
     * non-200 response.
     *
     * @param {string} jpgUrl - Absolute http:// URL of the image.
     * @param {string} jpgDownFile - Destination file path.
     * @param {function(?Error=)} [callback] - Completion callback.
     */
    download2: function (jpgUrl, jpgDownFile, callback) {
        let finished = false;
        function finish(err) {
            // Several events (request error, response error, write result)
            // can report the outcome; only the first one counts.
            if (finished) {
                return;
            }
            finished = true;

            if (err) {
                console.error('下载错误:' + jpgUrl, err);
            } else {
                console.log('完成图片下载:' + jpgDownFile);
            }
            if (callback) {
                callback(err);
            }
        }

        const jpgUrlP = url.parse(jpgUrl);
        http.get({
            // `hostname` excludes the port; the port comes from the URL and
            // falls back to 80 when the URL does not name one.
            hostname: jpgUrlP.hostname,
            port: jpgUrlP.port || 80,
            path: jpgUrlP.path || '/',
            headers: {
                referer: 'http://www.baidu.com',
            },
        }, function (res) {
            if (res.statusCode !== 200) {
                // Drain the body so the socket is released.
                res.resume();
                finish(new Error('HTTP ' + res.statusCode + ' ' + jpgUrl));
                return;
            }

            const buffers = [];
            res.on('data', function (data) {
                buffers.push(data);
            });
            res.on('error', finish);
            res.on('end', function () {
                fs.writeFile(jpgDownFile, Buffer.concat(buffers), finish);
            });
        }).on('error', finish);
    }
};

// Entry point: build a crawler for the model with ID 19705 and start it.
var girlID = 19705;
var crawl = new Crawl(girlID);
crawl.start();

(以下是上文"未使用async前的代码"被排版错位的结尾部分)

            });
        });

        //等待
        //var t1 = (new Date()).getTime();
        //var t2 = (new Date()).getTime();
        ////3秒以后再继续执行
        //while((t2-t1) < 4000){
        //    t2 = (new Date()).getTime();
        //};
    }
};

var girlID = 19705;
var crawl = new Crawl(girlID);
crawl.start();
//crawl.download2('http://t1.zngirls.com/gallery/19705/19815/019.jpg', '119.jpg');
console.log('主程序执行完毕');



使用async的queue优化


展开阅读全文

没有更多推荐了,返回首页