node_爬虫2

最新推荐文章于 2023-05-21 09:57:17 发布

皮卡卡卡

最新推荐文章于 2023-05-21 09:57:17 发布

阅读量84

点赞数 1

分类专栏： node 文章标签： node.js

本文链接：https://blog.csdn.net/qq_44808878/article/details/114677107

版权

node 专栏收录该内容

8 篇文章 0 订阅

订阅专栏

node_爬虫2

小说爬取案例

const request = require("request");
const cheerio = require("cheerio");
const fs = require("fs");

// Entry point of the novel crawler: fetch the chapter index page, then hand
// every chapter link found on it to reqA together with its position.
const CHAPTER_LIST_URL = "http://book.zongheng.com/showchapter/907701.html";

request.get(
    CHAPTER_LIST_URL,
    (err,res,body)=>{
        if (err){
            // Report the failure instead of exiting with no output at all.
            console.error(err);
            return;
        }
        /* Parse the page so it can be queried with selectors */
        const $ = cheerio.load(body);
        /* Every anchor under .col-4 is treated as one chapter link */
        $(".col-4 a").each(function(index){
            const href = $(this).prop("href");
            // Anchors without an href cannot be requested; skip them.
            if (!href)return;
            let chapterUrl;
            try{
                // Resolve relative and protocol-relative links against the list page.
                chapterUrl = new URL(href,CHAPTER_LIST_URL).href;
            }catch(e){
                console.error(e);
                return;
            }
            /* index is the anchor's position; reqA uses it for the file name */
            reqA( chapterUrl,index );
        });
    }
);

/**
 * Fetch a single chapter page and save the text of its `.content` element
 * to `./xiaoshuo/<index+1>.txt`.
 *
 * @param {string} url   Address of the chapter page to request.
 * @param {number} index Zero-based position of the chapter; the output file
 *                       is named with index+1.
 * @returns {void} The work happens asynchronously in the request callback.
 */
function reqA(url,index){
    request.get(
        url,
        (err,res,body)=>{
            if (err){
                // Report the failure instead of dropping the chapter silently.
                console.error(err);
                return;
            }
            /* Each chapter page gets its own parsed document */
            const $ = cheerio.load(body);
            // Prefix every run of two or more whitespace characters with a
            // carriage return so paragraphs are visually separated.
            const txt = $(".content").text().replace(
                /(\s{2,})/g,
                gap=>"\r"+gap
            );
            try{
                // The output directory may not exist yet; create it on demand.
                fs.mkdirSync("./xiaoshuo",{recursive:true});
                fs.writeFileSync(
                    "./xiaoshuo/"+(index+1)+".txt",
                    txt
                );
            }catch(e){
                // A throw here would be an uncaught exception inside an async
                // callback and would abort all chapter requests still in flight.
                console.error(e);
            }
        }
    )
}

案例：爬取百度图片

const request = require("request");
const fs = require("fs");
const path = require("path");


// Script entry: the search keyword and how many images to save.
const searchWord = "不相干的";
const imageCount = 1000;
getImg(searchWord,imageCount);

/**
 * Search Baidu Images for `word` and download the matching .jpg files into
 * an `img/<word>/` directory located next to this script.
 *
 * Results are requested in pages of up to 60 items, and all page requests
 * are started at once. Invalid arguments make the function return without
 * doing anything.
 *
 * @param {string} word Search keyword; also used as the sub-directory name.
 * @param {number} num  Number of results to request; must be a finite
 *                      number greater than 0.
 * @returns {void}
 */
function getImg(word,num){
    // NaN would slip through a plain `num <= 0` check, and Infinity would
    // make the paging loop below never terminate, so reject both.
    if (typeof word !== "string" || typeof num !== "number" || !Number.isFinite(num) || num <= 0)return;

    const PAGE_SIZE = 60;
    // One base path for both creating the directory and writing files.
    // NOTE(review): `word` is used as a path segment unchanged; a caller
    // passing untrusted input should validate it first.
    const targetDir = path.join(__dirname,"img",word);

    /* Make sure the directory for this keyword exists, then start */
    fs.mkdir(targetDir,{recursive:true},err=>{
        if (err){
            console.log(err);
            return;
        }
        init();
    });

    /* Work out how many page requests are needed and fire them all */
    function init(){
        const pages = Math.ceil(num / PAGE_SIZE);
        for (let i=0;i<pages;i++){
            req(
                word,
                i*PAGE_SIZE,
                // The last page only asks for the remainder.
                Math.min(PAGE_SIZE,num-i*PAGE_SIZE)
            );
        }
    }

    /* Request one result page (offset pn, size rn) and download its images */
    function req(word,pn,rn){
        request.get(
            // encodeURIComponent, not encodeURI: the keyword is a query value,
            // so characters such as & = # ? must be escaped as well.
            "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&word="+encodeURIComponent(word)+"&pn="+pn+"&rn="+rn,
            (err,res,body)=>{
                if (err){
                    console.log(err);
                    return;
                }
                // The response is not always valid JSON, so the .jpg URLs are
                // extracted with a regex instead of JSON.parse.
                const found = typeof body === "string" ? body.match(/https.+?\.jpg/g) : null;
                const urls = [...new Set(found || [])];

                /* Download each distinct URL */
                urls.forEach(url=>{
                    /* File name: timestamp plus a random offset, in hex */
                    const name = (Date.now() + Math.floor(Math.random()*999999999999)).toString(16);
                    const file = fs.createWriteStream(path.join(targetDir,name+".jpg"));
                    // Without handlers, a stream "error" event is thrown as an
                    // uncaught exception and stops every other download.
                    file.on("error",e=>console.log(e));
                    request(url)
                        .on("error",e=>{
                            console.log(e);
                            file.destroy();
                        })
                        .pipe(file);
                });
            }
        );
    }
}