node.js最强递归爬虫

var request = require("request")
var cheerio = require('cheerio');
var fs = require("fs")


// Registry of links that have already been crawled (URL -> URL).
const urlObj = {};
// Root of the site to crawl. It must not end with a trailing '/'.
const urls = "https://im.qq.com";

// Start working from the site root.
link(urls);


/**
 * Crawl one page: hand its HTML to init(), trigger getStatic() for its
 * assets, then recurse into every same-site link found in its <a> tags.
 *
 * A URL is recorded in `urlObj` before it is requested, so a page that is
 * discovered again (even while its request is still in flight) is fetched
 * only once.
 *
 * @param {string} url - Absolute URL of the page to crawl.
 * @param {number} [retriesLeft=3] - How many more times the request may be
 *     retried after a network error.
 */
function link(url, retriesLeft = 3) {
    if (urlObj[url]) {
        return // already crawled or currently being fetched
    }
    // Mark under the exact key the guard above checks (the encoded URL).
    urlObj[url] = url

    request(url, {}, (err, response, body) => {
        if (err) {
            console.error("request error : " + url + " : " + err)
            // The mark must be cleared before retrying, otherwise the guard
            // at the top turns the retry into a no-op.
            delete urlObj[url]
            if (retriesLeft > 0) {
                link(url, retriesLeft - 1)
            }
            return
        }
        if (response.statusCode != 200) {
            return
        }

        // Mode 4 for URLs that do not contain "html", mode 5 for those that do.
        init(url, url.lastIndexOf('html') < 0 ? 4 : 5, body)
        getStatic(url) // fetch the page's static resources

        const $ = cheerio.load(body)
        const anchors = $("a") // query once instead of on every access
        // Substrings that identify hrefs which are not same-site page links.
        const skipMarkers = ['http', '#', 'javascript', 'mailto', 'tel:']
        for (let i = 0; i < anchors.length; i++) {
            let href = anchors[i].attribs.href
            if (!href || skipMarkers.some((marker) => href.indexOf(marker) >= 0)) {
                continue
            }
            if (href[0] != '/') {
                href = '/' + href
            }
            // NOTE(review): relative links are resolved against the site root
            // (`urls`), not against the current page - confirm this is intended.
            link(encodeURI(urls + href)) // escape the URL
        }
    })
}


/**
 * Download the page at `url` and pass every referenced static resource to
 * init(): <script src> as kind 1, <link href> as kind 2, <img src> as kind 3.
 *
 * Elements without the attribute are skipped, as are inline `data:image/...`
 * URIs, which carry their content in the attribute itself.
 *
 * @param {string} url - Absolute URL of the page whose assets are collected.
 * @param {number} [retriesLeft=3] - How many more times the request may be
 *     retried after a network error.
 */
function getStatic(url, retriesLeft = 3) {
    request(url, {}, (err, res, body) => {
        if (err) {
            console.error("static err url :" + url + '---' + err)
            if (retriesLeft > 0) {
                getStatic(url, retriesLeft - 1)
            }
            return
        }
        if (res.statusCode != 200) {
            return
        }

        const $ = cheerio.load(body)
        // Hand every non-empty `attr` value of the matched elements to init().
        const collect = (selector, attr, kind) => {
            const nodes = $(selector)
            for (let i = 0; i < nodes.length; i++) {
                const ref = nodes[i].attribs[attr]
                if (ref && ref.indexOf("data:image/") < 0) {
                    init(ref, kind)
                }
            }
        }
        collect("script", "src", 1) // JS
        collect("link", "href", 2) // CSS and other <link> targets
        collect("img", "src", 3) // images
    })
}


/**
 * Create the local directories for a resource and store it below __dirname.
 *
 * @param {string} str - Page URL (num 4/5) or resource reference taken from
 *     the HTML (num 1/2/3). The `urls` prefix is stripped to obtain the
 *     local path; paths without a leading '/' are taken relative to the
 *     site root.
 * @param {number} num - Resource kind: 1 script, 2 <link>, 3 image,
 *     4 page whose URL does not contain "html" (stored as `<path>index.html`),
 *     5 page whose URL contains "html" (stored under its own file name).
 * @param {string} [body] - Page HTML; only used for num 4 and 5. Other kinds
 *     are downloaded by writeFile().
 */
function init(str, num, body) { // create files and folders
    // Nothing to store for missing references or inline data: URIs.
    if (typeof str !== 'string' || str === '' || str.indexOf('data:') === 0) {
        return
    }

    if (num == 4) {
        // The query string plays no part in the local file name, so drop it.
        // NOTE(review): no '/' is inserted, so ".../download" is stored as
        // "/downloadindex.html" - confirm whether "/download/index.html" was meant.
        str = str == urls ? "/index.html" : str.split('?')[0] + "index.html"
    }
    str = str.replace(urls, '')

    // Separate the query first so that a '/' inside it cannot be mistaken
    // for a directory separator.
    const queryAt = str.indexOf('?')
    let local = queryAt < 0 ? str : str.substring(0, queryAt)
    const canshu = queryAt < 0 ? undefined : str.substring(queryAt + 1).split('?')[0]

    if (local.indexOf('//') === 0) {
        return // protocol-relative URL: the resource lives on another host
    }
    if (local[0] != '/') {
        local = '/' + local // anchor relative references at the site root
    }

    const slashAt = local.lastIndexOf("/")
    const pathStr = local.substring(0, slashAt + 1)
    if (pathStr.indexOf('http') >= 0) {
        return // absolute URL outside `urls`
    }
    const folders = pathStr.split('/').filter(Boolean)
    const filename = local.substring(slashAt + 1)
    if (!filename) {
        return // a bare directory reference has no file to write
    }
    if (filename === '..' || folders.includes('..')) {
        // Remote paths are untrusted: never step outside __dirname.
        console.error("skip unsafe path : " + local)
        return
    }

    // Create the directory chain level by level.
    let linPath = ""
    for (const folder of folders) {
        linPath += '/' + folder
        if (folder.indexOf("index.html") < 0 && !fs.existsSync(__dirname + linPath)) {
            mkdir(linPath)
        }
    }

    const path = '/' + folders.concat(filename).join('/')
    const target = __dirname + path
    fs.access(target, (err) => {
        if (!err) {
            return // already stored
        }
        if (num == 4 || num == 5) {
            console.log("add file1" + path)
            fs.writeFile(target, body, (writeErr) => {
                if (!writeErr) {
                    return
                }
                console.error(path, writeErr)
                // One more attempt, e.g. when the directory was not ready yet.
                fs.writeFile(target, body, (retryErr) => {
                    if (retryErr) {
                        console.error(path, retryErr)
                    }
                })
            })
            return
        }
        console.log("add file2" + path)
        // The query is handed over separately in `canshu`.
        writeFile(target, urls + local, num, canshu)
    })
}

/**
 * Create the directory `__dirname + path`.
 *
 * The directory is created synchronously so that it exists by the time the
 * caller creates a sub-directory or writes a file into it. A directory that
 * already exists is not treated as an error.
 *
 * @param {string} path - Directory path relative to __dirname, starting with '/'.
 */
function mkdir(path) { // create a folder
    console.log("add font" + path)
    try {
        fs.mkdirSync(__dirname + path)
    } catch (err) {
        if (err.code !== 'EEXIST') {
            console.error(err)
        }
    }
}

/**
 * Download a static resource and store it at `file`.
 *
 * @param {string} file - Absolute path of the local file to write.
 * @param {string} url - Absolute URL of the resource.
 * @param {number} num - Resource kind; 3 (image) is streamed to disk, every
 *     other kind is downloaded as text and then written.
 * @param {string} [canshu] - Query string (without '?') to append to `url`.
 *     Ignored when empty or when `url` already carries a query.
 * @param {number} [retriesLeft=3] - How many more times the download may be
 *     repeated after a failed write.
 */
function writeFile(file, url, num, canshu, retriesLeft = 3) { // create a file
    console.log("addfile--" + url)
    const hasQuery = canshu !== undefined && canshu !== null && canshu !== ''
    const target = hasQuery && url.indexOf('?') < 0 ? url + '?' + canshu : url

    if (num == 3) {
        // Save the image. Both streams need an 'error' listener, otherwise a
        // failed download or write would crash the process.
        const out = fs.createWriteStream(file)
        out.on('error', (err) => {
            console.error(file, err)
        })
        request(target)
            .on('error', (err) => {
                console.error("download failed : " + target + " : " + err)
                out.end()
            })
            .pipe(out)
        return
    }

    request(target, {}, (err, res, body) => {
        if (err || res.statusCode != 200) {
            console.error("download failed : " + target + " : " + (err || res.statusCode))
            return
        }
        fs.writeFile(file, body, (writeErr) => {
            if (!writeErr) {
                return
            }
            console.log(writeErr)
            if (retriesLeft > 0) {
                writeFile(file, url, num, canshu, retriesLeft - 1)
            }
        })
    })
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值