Node.js之Cheerio爬数据!~

爬数据!~

本意是想写一些接口,感受一下 Node.js 的魅力,奈何手头没有数据,于是顺势延伸到了爬虫数据采集,让咱的数据库看起来不那么假。

cheerio 数据截取

是的,没想到有一天咱也会爬别人玩~ 哈哈哈!

  • 引入模块为了搞数据
const cheerio = require("cheerio");
const axios = require("axios").default;

飘过注意!cheerio的元素选择方式和jQuery一模一样!

  • 向爬取的url对象发起请求并进行元素选择

了解一下


了解cheerio完毕,下面正式开始咯!

  • 发起一波模块引入
const cheerio = require("cheerio");
const axios = require("axios").default;
const mongoose = require("mongoose");
const Entities = require("html-entities").XmlEntities;
const entities = new Entities();
const { Product, Detail, Category } = require("./models");

  • 连接数据库
// Connect to the local MongoDB instance, empty the three collections so every
// run starts from a clean slate, then start the crawl via loadData().
mongoose.connect("mongodb://localhost:27017/MangoStore-app", {
    useUnifiedTopology: true,
    useNewUrlParser: true
})
    .then(() => Promise.all([
        // deleteMany({}) is the supported replacement for the deprecated
        // Model.remove({}); both delete every document in the collection.
        Product.deleteMany({}),
        Detail.deleteMany({}),
        Category.deleteMany({})
    ]))
    .then(() => loadData())
    .catch(err => {
        // Without this handler a failed connection or cleanup would surface
        // only as an unhandled promise rejection.
        console.error("Failed to prepare the database before crawling:", err);
    });
  • 定义一个“安全”的请求方法准备发起进攻
/**
 * Issue a GET request for `url` after a short random delay, sending a randomly
 * chosen user-agent and randomly generated "X-FORWARDED-FOR" / "CLIENT-IP"
 * headers.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<object>} Resolves with the axios response; rejects with the
 *     axios error when the request fails (previously the promise stayed
 *     pending forever in that case).
 */
function fetchData(url) {
    const userAgents = [
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        // The closing ")" was missing from this entry in the original list.
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
    ];

    // Build a dotted-quad string from four random integers in the range 0-254.
    const randomIp = () =>
        Array.from({ length: 4 }, () => Math.floor(Math.random() * 255)).join(".");

    // Product of two uniform randoms: a delay between 0 and 100 ms, skewed
    // towards the short end.
    const delayMs = Math.random() * 100 * Math.random();

    return new Promise((resolve, reject) => {
        setTimeout(() => {
            axios
                .get(url, {
                    timeout: 500000,
                    headers: {
                        "user-agent": userAgents[Math.floor(Math.random() * userAgents.length)],
                        "X-FORWARDED-FOR": randomIp(),
                        "CLIENT-IP": randomIp()
                    }
                })
                // Forward failures too, so callers can react to them.
                .then(resolve, reject);
        }, delayMs);
    });
}

我的爬取对象是一个服装商城,在此俺先从商品分类下手了。

/**
 * Insert the hard-coded product categories into the Category collection and
 * start a product crawl (loadProduct) for every stored category.
 *
 * @returns {Promise<void>} Settles after the categories have been inserted and
 *     loadProduct has been called for each of them. Insert failures are logged
 *     rather than propagated, so the returned promise does not reject.
 */
function loadData() {
    const types = [
        { name: "女鞋", url: "https://www.yougou.com/f-0-MXZ-0-1.html" },
        { name: "男鞋", url: "https://www.yougou.com/f-0-Y0A-04Y004-1.html" },
        { name: "运动", url: "https://www.yougou.com/f-0-PTK-0-1.html" },
        { name: "箱包", url: "https://www.yougou.com/f-0-6LJ-0-1.html" },
        { name: "儿童", url: "https://www.yougou.com/f-0-9XB-0-1.html" }
    ];

    return Category.insertMany(types)
        .then(savedCategories => {
            // Each crawl is started without waiting for the previous one.
            for (const category of savedCategories) {
                loadProduct(category);
            }
        })
        .catch(err => {
            console.error("Failed to store categories:", err);
        });
}

好嘞,分类表数据有了,下面来商品的数据:

/**
 * Download a category listing page, extract one product record per
 * ".proList li" element, store the records via Product.insertMany and start a
 * detail crawl (loadDt) for every stored product.
 *
 * @param {object} category - Category document.
 * @param {string} category.url - Listing page to download.
 * @param {string} category.name - Category name, copied to `product.category`.
 * @param {*} category._id - Category id, copied to `product.id`.
 * @param {boolean} [isFirst=true] - Has no effect: the two branches that used
 *     to depend on it were identical. Kept so existing callers keep working.
 * @returns {Promise<void>} Settles once the products are stored and loadDt has
 *     been called for each. Failures are logged rather than propagated.
 */
function loadProduct({
    url,
    name,
    _id
}, isFirst = true) {
    return fetchData(url)
        .then(res => {
            const $ = cheerio.load(res.data.toString());
            const products = [];

            $(".proList li").each((_, item) => {
                const $item = $(item);
                products.push({
                    title: $item
                        .find('.srchlst-wrap .bd .nptt a')
                        .text(),
                    imgUrl: $item
                        .find(".srchlst-wrap .goods-desc .collect")
                        .attr("src"),
                    // The current price is read from the "price" attribute of
                    // the third matching <em>.
                    CurrentPrice: $item
                        .find(".srchlst-wrap .bd .price_sc em")
                        .eq(2).attr("price"),
                    OriginPrice: $item
                        .find(".srchlst-wrap .bd .origin-price i")
                        .text(),
                    detailUrl: $item
                        .find(".srchlst-wrap .goods-head a")
                        .attr("href"),
                    category: name,
                    id: _id
                });
            });

            return Product.insertMany(products);
        })
        .then(savedProducts => {
            // Each detail crawl is started without waiting for the previous one.
            for (const product of savedProducts) {
                loadDt(product);
            }
        })
        .catch(err => {
            console.error(`Failed to load products for category "${name}" (${url}):`, err);
        });
}

紧接着,来一些商品详情吧~

/**
 * Download a product detail page, build one detail record per
 * "#goodsContainer" element and store the records via Detail.insertMany.
 *
 * @param {object} product - Product document.
 * @param {string} product.detailUrl - Detail page to download.
 * @param {*} product.name - Destructured but not used by this function.
 * @param {*} product._id - Product id, copied to `detail.id`.
 * @param {string} product.category - Category name, copied to `detail.name`.
 * @returns {Promise<void>} Settles once the details are stored. Failures are
 *     logged rather than propagated.
 */
function loadDt({ detailUrl, name, _id, category }) {
    return fetchData(detailUrl)
        .then(res => {
            const $ = cheerio.load(res.data.toString());
            const proDetails = [];

            $("#goodsContainer").each((_, container) => {
                const $container = $(container);
                const srcOf = selector => $container.find(selector).attr("src");
                const $sizeLinks = $container.find(".size .prosize .prodSpec a");
                const mainImgUrl = srcOf("#goodsImg0 .goodsPic img");

                const proDetail = {
                    imgUrl: mainImgUrl,
                    // Main picture followed by thumbnails 2-6; a missing
                    // thumbnail leaves `undefined` in its slot.
                    imgUrlall: [
                        mainImgUrl,
                        ...[2, 3, 4, 5, 6].map(n =>
                            srcOf(`#spec-list .list-h li .picSmallClass${n}`))
                    ],
                    title: $container
                        .find(".shopping-container h1")
                        .text(),
                    OriginPrice: $container
                        .find(".shopping-container .good_ygprcarea #ygprice_area del")
                        .text(),
                    // Only the first three size options are recorded.
                    sizeAll: [0, 1, 2].map(i => $sizeLinks.eq(i).attr("data-name"))
                };

                console.log(proDetail);
                proDetail.id = _id;
                proDetail.name = category;
                proDetails.push(proDetail);
            });

            return Detail.insertMany(proDetails);
        })
        .then(() => {
            console.log("保存详情成功");
        })
        .catch(err => {
            console.error(`Failed to load product detail (${detailUrl}):`, err);
        });
}

OK , 够咱玩数据库和接口了,收手走起接口!~

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值