1.新建webSpider.js
var http = require('http');
var fs = require('fs');
var iconv = require('iconv-lite');
/**
 * Crawler entry object. Collects the crawl settings and wires up the
 * filter and page helpers that do the actual work.
 *
 * @param {Object} argument
 * @param {string} argument.url        Address of the first page to crawl.
 * @param {string} [argument.outpath]  Output directory; falls back to 'g://temp/'.
 * @param {Object} [argument.filter]   Ready-made filter; when absent one is built
 *                                     from argument.regex / argument.url / argument.custom.
 * @param {Object} [argument.pagin]    Pagination settings; any falsy value is stored as false.
 * @param {*}      [argument.download] Stored as given (no default is applied here) and
 *                                     handed to the page object built below.
 * @param {Object} [argument.page]     Ready-made page object; when absent one is built
 *                                     from the filter, outpath and download values above.
 */
function webSpider(argument) {
    var spider = this;

    spider.url = argument.url;
    spider.outpath = argument.outpath || 'g://temp/';

    // Only build a filter when the caller did not bring one.
    var suppliedFilter = argument.filter;
    if (suppliedFilter) {
        spider.filter = suppliedFilter;
    } else {
        spider.filter = new filter({
            regex: argument.regex,
            url: argument.url,
            custom: argument.custom
        });
    }

    spider.pagin = argument.pagin || false;
    spider.download = argument.download;

    // Same idea for the page object: reuse the caller's, otherwise assemble one.
    var suppliedPage = argument.page;
    if (suppliedPage) {
        spider.page = suppliedPage;
    } else {
        spider.page = new page({
            filter: spider.filter,
            outpath: spider.outpath,
            download: spider.download
        });
    }
}
/**
 * Start crawling. With pagination configured the work is delegated to
 * paginHandle(); otherwise only the start URL is fetched.
 */
webSpider.prototype.start = function () {
    // Loose comparison kept on purpose: it matches the values `== false` matched.
    if (this.pagin != false) {
        this.paginHandle();
        return;
    }
    this.page.getHtml(this.url);
};
/**
 * Crawl a paginated listing.
 *
 * The start URL is always fetched as page 1. After that every page number
 * from pagin.start through pagin.end (inclusive) is fetched, with the URL
 * produced by replacing each "{page}" in pagin.urlRule with that number.
 */
webSpider.prototype.paginHandle = function () {
    var settings = this.pagin;
    var template = settings.urlRule;
    var firstPage = settings.start;
    var lastPage = settings.end;
    var fetcher = this.page;

    // The first page has its own address, so it is requested separately.
    fetcher.getHtml(this.url, 1);

    // Remaining pages follow the URL template.
    for (var pageNo = firstPage; pageNo <= lastPage; pageNo++) {
        var pageUrl = template.replace(/({page})/g, pageNo);
        fetcher.getHtml(pageUrl, pageNo);
    }
};
/**
 * Filter object: decides which parts of a fetched page are of interest.
 *
 * @param {Object}   argument
 * @param {RegExp}   [argument.regex]  Pattern whose first capture group is the
 *                                     wanted URL; defaults to the src of every <img>.
 * @param {Function} [argument.custom] Custom extraction callback; stored as given.
 * @param {string}   argument.url      Absolute page URL; its "<scheme>//<host>" part
 *                                     is stored as domainName.
 */
function filter(argument){
    // `[^>]*?` keeps each match inside a single <img ...> tag. The previous
    // greedy `.*` swallowed everything up to the LAST src on a line, so
    // several images on one line produced a single result.
    this.regex = argument.regex || /<img\s[^>]*?src="(.*?)"/g;
    this.custom = argument.custom;
    this.domainName = this.tools.getDomain(argument.url);
    this.url = argument.url;
}
/**
 * Behaviour shared by all filter instances: URL helpers plus the two
 * extraction strategies (custom callback or regular expression).
 */
filter.prototype={
    // Stateless URL helpers. They never touch `this`, so they can be handed
    // to callbacks unbound (execute() does exactly that).
    tools : {
        /**
         * @param {string} url Absolute URL such as "http://host/path".
         * @returns {string} "<scheme>//<host>", e.g. "http://host".
         */
        getDomain : function _getDomain (url) {
            var parts = url.split('/');
            return parts[0]+'//'+parts[2];
        },
        /**
         * Turn a link found in a page into an absolute URL.
         *
         * Paths are resolved against `domain` only (not against the
         * directory of the page they were found on).
         *
         * @param {string} fileUrl Link as written in the page.
         * @param {string} domain  "<scheme>//<host>", with or without trailing slash.
         * @returns {string} Absolute URL.
         */
        getTrueFileUrl : function _getTrueFileUrl (fileUrl ,domain) {
            // Only a leading scheme makes a link absolute; a path that merely
            // contains "http" (e.g. "img/http_logo.png") is still relative.
            if(/^https?:\/\//i.test(fileUrl)) return fileUrl;
            // Protocol-relative link ("//cdn.host/x"): borrow the domain's scheme.
            if(fileUrl.indexOf('//') === 0){
                var scheme = /^[a-z][a-z0-9+.\-]*:/i.exec(domain);
                return (scheme ? scheme[0] : 'http:') + fileUrl;
            }
            // Join with exactly one slash, whatever the two sides already carry.
            return domain.replace(/\/+$/ ,'') + '/' + fileUrl.replace(/^\/+/ ,'');
        }
    },
    /**
     * Extract the interesting items from a page.
     *
     * @param {string} html Page source.
     * @returns {*} undefined when html is empty; otherwise whatever the custom
     *              callback returns, or the array built by byRegex().
     */
    execute : function _execute (html) {
        if(!html){console.log('html is null');return;}
        // Prefer the caller's custom extractor; fall back to the regex.
        var arr = [];
        if(typeof(this.custom)=='function') {arr = this.custom(html ,this.tools.getTrueFileUrl);}
        else {console.log('file -> regex'); arr = this.byRegex(html);}
        return arr;
    },
    /**
     * Collect every match of this.regex in the page.
     *
     * @param {string} html Page source.
     * @returns {Array<{src: string, id: number}>} Absolute URL built from
     *          capture group 1 of each match, numbered from 1.
     */
    byRegex : function _byRegex (html) {
        var results=[] ,match ,
            _regex = this.regex ,
            _domain = this.domainName,
            getFileUrl = this.tools.getTrueFileUrl ,i=1;
        // exec() on a non-global regex never advances, which would make the
        // loop below spin forever; work on a global copy instead.
        if(!_regex.global){
            _regex = new RegExp(_regex.source ,'g'+(_regex.ignoreCase?'i':'')+(_regex.multiline?'m':''));
        }
        // Always scan from the beginning, even if the regex was used before.
        _regex.lastIndex = 0;
        while ((match = _regex.exec(html)) !== null) {
            // An empty match does not move lastIndex; step over it manually.
            if(match[0] === '') _regex.lastIndex++;
            // No capture group 1 means there is no URL to report.
            if(match[1] === undefined) continue;
            console.log('>>:'+match[1]);
            // Argument order is (fileUrl, domain) - it used to be swapped,
            // which made every result equal to the bare domain.
            results.push({src:getFileUrl(match[1] ,_domain) ,id:i});
            i++;
        }
        return results;
    }
}
/**
 * Page object. Its methods fetch a page's HTML, run the filter over it to
 * obtain the wanted items, and then download them or leave them to a
 * custom callback.
 *
 * @param {Object} argument
 * @param {Object} argument.filter   Filter used on every fetched page.
 * @param {string} argument.outpath  Directory downloads are written to.
 * @param {*}      argument.download Download switch, stored as given.
 */
function page(argument) {
    var copied = ['filter', 'outpath', 'download'];
    for (var k = 0; k < copied.length; k++) {
        var key = copied[k];
        this[key] = argument[key];
    }
}
/**
 * Behaviour shared by all page instances: fetch a page, filter it, and
 * optionally download what the filter found.
 */
page.prototype={
    /**
     * Fetch one page over HTTP, decode it and run the filter on it.
     * When this.download == true the filter result is passed on to
     * downloadFiles().
     *
     * The body is decoded with this.charset when that is set on the
     * instance, otherwise as 'gbk'.
     *
     * @param {string} url   Address of the page.
     * @param {*}      pagei Page index, used as a prefix for downloaded file names.
     */
    getHtml : function _getHtml (url ,pagei) {
        var self = this ,
            chunks = [] ,   // raw bytes; started as null before, which prefixed every page with the text "null"
            download = this.download ,
            charset = this.charset || 'gbk';
        http.get(url, function (res) {
            res.on('data', function (chunk) {
                // Keep the bytes untouched so multi-byte characters split
                // across chunks are decoded correctly in one go at the end.
                chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk ,'binary'));
            }).on('end', function () {
                var html = iconv.decode(Buffer.concat(chunks) ,charset);
                var arr = self.filter.execute(html);
                if(download==true) self.downloadFiles(arr ,pagei);
            });
        }).on('error',function (err) {
            console.log('getHtml is error: ' + url + ' -> ' + (err && err.message));
        });
    },
    /**
     * Download every entry of a result list.
     *
     * @param {Array<{src: string, id: *}>} arr Entries to download; each needs a src link.
     * @param {*} [pagei] Page index; combined with each entry's id into the file-name prefix.
     */
    downloadFiles : function _downloadFiles (arr, pagei) {
        var len, _pagei = pagei || '';
        if(arr && (len=arr.length) > 0){
            for(var i=0,_tele;i<len;i++){
                _tele = arr[i];
                this.downloadFile(_tele.src, this.outpath ,_pagei+'_'+_tele.id);
            }
        }else{
            console.log('results is null');
        }
    },
    /**
     * Download a single file into outpath. Downloads are not queued: each
     * call starts its own request.
     *
     * @param {string} src     Absolute URL of the file.
     * @param {string} outpath Target directory; must end with "/".
     * @param {string} _i      Prefix for the file name.
     */
    downloadFile : function _downloadFile (src ,outpath ,_i) {
        var filename = _i + '_'+ src.substring(src.lastIndexOf('/') + 1);
        // fs.exists() is asynchronous, so testing its return value never
        // worked (and throws without a callback on current Node). A recursive
        // mkdir succeeds whether or not the directory already exists.
        fs.mkdir(outpath, { recursive: true } ,function (mkdirErr) {
            if(mkdirErr && mkdirErr.code !== 'EEXIST'){
                console.log('####download Error:'+src+' (cannot create '+outpath+': '+mkdirErr.message+')');
                return;
            }
            var writestream = fs.createWriteStream(outpath + filename);
            // Attach the handlers before any data can flow.
            writestream.on('finish', function(){
                console.log('download : ' + src);
            }).on('error' ,function(e) {
                console.log('####download Error:'+src);
            });
            try{
                http.get(src, function (res) {
                    res.pipe(writestream);
                }).on('error' ,function (e) {
                    // Without this handler a failed request would crash the process.
                    console.log('####download Error:'+src);
                    writestream.destroy();
                });
            }catch(e){
                // http.get throws synchronously on e.g. a malformed URL.
                console.log('>>>>#######download error:'+e);
                writestream.destroy();
            }
        });
    }
}
// Public API of this module: the webSpider constructor.
module.exports=webSpider;
2.新建example.js
var fs = require('fs');
var cheerio = require('cheerio');
var webSpider = require('./webSpider');
// Module-level list of result lines: getPageLinks pushes entries into it and
// the callback configured in getApk joins them and writes them to disk.
var downloadZips =[];
/**
 * Crawl the duote.com Android game listing (first page plus pages 2-714).
 * Every ".list_item .link" anchor on a listing page is handed to
 * getPageLinks(), which collects result lines in downloadZips; one second
 * after a listing page was processed the lines collected so far are appended
 * to the output file.
 */
function getApk () {
    var ws = new webSpider({
        url:'http://www.duote.com/android/game.html', // first page
        pagin : {
            urlRule : 'http://www.duote.com/android/game_0_down_{page}.html',
            start : 2,
            end : 714
        },
        custom : function (html ,getpathfun) { // custom extraction
            var $ = cheerio.load(html);
            $('.list_item .link').each(function(i){
                var href = $(this).attr('href');
                // An anchor without href has nothing to follow.
                if(!href) return;
                getPageLinks(getpathfun(href ,'http://www.duote.com/') ,i);
            });
            setTimeout(function () {
                // Take the pending lines OUT of the shared list before writing.
                // Every listing page schedules its own flush, so writing the
                // whole list each time appended the same entries over and over.
                var pending = downloadZips.splice(0 ,downloadZips.length);
                if(pending.length > 0) writeFile('E:/webFile/多特apk.txt' ,pending.join(' '));
            },1000);
        },
        download : false
    });
    ws.start();
}
/**
 * Crawl one detail page and record its download links.
 *
 * For every "var sUrl = '...';" statement found in the page source a line
 * made of the page's ".tit_area h1" text and the resolved link is pushed
 * onto downloadZips.
 *
 * @param {string} url Address of the detail page.
 * @param {number} i   Position of the link on the listing page (not used here).
 */
function getPageLinks(url ,i){
    function collectDownloadUrls(html ,getpathfun) {
        var $ = cheerio.load(html);
        var pattern = /var sUrl = '(.*)';/g;
        for (var hit = pattern.exec(html); hit !== null; hit = pattern.exec(html)) {
            var title = $('.tit_area h1').text();
            var link = getpathfun(hit[1] ,'http://app.2345.cn');
            downloadZips.push('\n' + title + "\t\tsrc:" + link);
        }
    }

    var spider = new webSpider({
        url: url, // the only page to fetch
        custom: collectDownloadUrls
    });
    spider.start();
}
/**
 * Append a string to a file and log the outcome.
 *
 * @param {string} outpath File to append to.
 * @param {string} str     Text to append.
 */
function writeFile(outpath ,str){
    // appendFile on purpose: fs.writeFile would discard the existing file
    // and write a fresh one.
    var report = function (err) {
        if (!err) {
            console.log("写入文件ok");
            return;
        }
        console.log("fail " + err);
    };
    fs.appendFile(outpath, str, report);
}
// Script entry point: start the crawl when this file is run.
getApk();
安装依赖:iconv-lite、cheerio
npm install iconv-lite
npm install cheerio