1.新建webSpider.js
var http = require('http');
var fs = require('fs');
var iconv = require('iconv-lite');
/**
 * Crawler entry object: stores the crawl options and delegates page
 * fetching to a `page` object.
 *
 * @param {Object} argument            Crawl options.
 * @param {string} argument.url        Address of the page to start from.
 * @param {string} [argument.outpath]  Output directory; falls back to 'g://temp/'.
 * @param {Object} [argument.filter]   Ready-made filter; when omitted one is built
 *                                     from `regex`, `url` and `custom`.
 * @param {RegExp} [argument.regex]    Forwarded to the fallback filter.
 * @param {*}      [argument.custom]   Forwarded to the fallback filter.
 * @param {Object|boolean} [argument.pagin]  Pagination settings
 *                                     ({ urlRule, start, end }); falsy means
 *                                     only the start page is fetched.
 * @param {*}      [argument.download] Forwarded unchanged to the fallback page.
 * @param {Object} [argument.page]     Ready-made page object exposing
 *                                     `getHtml(url, pageNumber)`.
 */
function webSpider(argument) {
    this.url = argument.url; // address of the page to start from
    this.outpath = argument.outpath || 'g://temp/';

    // Only build a filter when the caller did not supply one.
    this.filter = argument.filter || new filter({
        regex: argument.regex,
        url: argument.url,
        custom: argument.custom
    });

    this.pagin = argument.pagin || false; // pagination is off unless configured

    // No default is applied here: an omitted value stays undefined and is
    // forwarded as-is. NOTE(review): the original comment claimed a default of
    // true - presumably applied inside `page`; confirm.
    this.download = argument.download;

    // Only build a page object when the caller did not supply one.
    this.page = argument.page || new page({
        filter: this.filter,
        outpath: this.outpath,
        download: this.download
    });
}

/**
 * Starts the crawl: fetches just the start page when pagination is off,
 * otherwise hands over to paginHandle().
 */
webSpider.prototype.start = function() {
    if (!this.pagin) {
        this.page.getHtml(this.url);
    } else {
        this.paginHandle();
    }
};

/**
 * Fetches the start page and then every numbered page from `pagin.start`
 * to `pagin.end` (inclusive). Each page URL is produced by replacing every
 * `{page}` placeholder in `pagin.urlRule` with the page number.
 */
webSpider.prototype.paginHandle = function() {
    var settings = this.pagin,
        urlRule = settings.urlRule,
        // Coerce the bounds to numbers: string bounds (e.g. "9" and "11" read
        // from a command line or query string) would otherwise be compared
        // lexicographically and the loop would be skipped entirely.
        current = Number(settings.start),
        last = Number(settings.end),
        pageHandler = this.page;

    // The start page is always fetched from this.url and reported as page 1.
    pageHandler.getHtml(this.url, 1);

    // Remaining pages follow the URL rule.
    for (; current <= last; current++) {
        pageHandler.getHtml(urlRule.replace(/({page})/g, current), current);
    }
};
/**
 * Filter object: holds the pattern used to pick resources out of a page,
 * together with the page URL and the domain derived from it.
 *
 * @param {Object} argument           Filter options.
 * @param {RegExp} [argument.regex]   Extraction pattern; defaults to a pattern
 *                                    whose first capture group is the
 *                                    double-quoted `src` of an <img> tag.
 * @param {*}      [argument.custom]  Stored unchanged on the instance.
 * @param {string} argument.url       Page URL; also passed to
 *                                    `this.tools.getDomain` to fill `domainName`.
 */
function filter(argument) {
    // `[^>]*?` keeps the match inside a single tag. The previous greedy `.*`
    // ran from the first <img to the last src=" on the line, so only the last
    // image of several on one line was reported.
    this.regex = argument.regex || /<img\s[^>]*?src="(.*?)"/g;
    this.custom = argument.custom;
    this.domainName = this.tools.getDomain(argument.url);
    this.url = argument.url;
}
filter.prototype={
tools : {
getDomain : function _getDomain (url) {
url = url.split('/');
return url[0