node crawler简单使用

最新推荐文章于 2024-08-13 08:08:33 发布

阿一在线

最新推荐文章于 2024-08-13 08:08:33 发布

阅读量1.8k

点赞数

分类专栏： nodejs 文章标签： web crawler nodejs

本文链接：https://blog.csdn.net/wealth_123450/article/details/111572719

版权

nodejs 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

需求：抓取某个网站上商品的名称和价格。

以下以京东商品搜索结果页的 URL 为例进行测试。

1、搭建node环境，此node安装不多介绍

2、安装 node 爬虫工具 crawler，以及下面代码中用到的 lodash：npm install crawler lodash

3、创建index.js，直接贴代码

/**
 * Fetches the page at PATH_URL, walks the parsed HTML for specific tags to
 * collect each product's name and price, and writes the result to a JSON file.
 */
const fs = require('fs');
const Crawler = require('crawler');
const _ = require('lodash')

// Search-result page to crawl (host: https://search.jd.com).
const PATH_URL = 'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA%E6%89%8B%E6%9C%BA&enc=utf-8&suggest=5.def.0.base&wq=huawie%E6%89%8B%E6%9C%BA&pvid=b314d64bbf02446187feba4eed246377';

// Number of responses handled so far; only used for console progress output.
let cnt = 0;

// Records collected from the crawled pages; serialized to JSON once the queue drains.
let listDataRes = [];

// Exact `class` attribute values of the <div>s that hold the price and the name.
const PRICE_CLASS = 'p-price';
const NAME_CLASS = 'p-name p-name-type-2';

/**
 * Returns the direct children of `node` that are element nodes named `tagName`.
 *
 * @param {object} node - Parsed DOM node exposing a `children` array.
 * @param {string} tagName - Element name to keep (e.g. 'div').
 * @returns {object[]} Matching child elements, in document order.
 */
const childTags = (node, tagName) =>
  (node.children || []).filter(
    (child) => child.type === 'tag' && child.name === tagName
  );

/**
 * Collects the price candidates below a price <div>: for every
 * strong > i path, the `data` of the <i>'s first child node.
 * An <i> without any child is skipped instead of raising a TypeError.
 *
 * @param {object} priceDiv - The <div> whose class equals PRICE_CLASS.
 * @returns {Array<string|undefined>} Candidates in document order.
 */
const collectPrices = (priceDiv) => {
  const prices = [];
  for (const strong of childTags(priceDiv, 'strong')) {
    for (const italic of childTags(strong, 'i')) {
      const firstChild = (italic.children || [])[0];
      if (firstChild) {
        prices.push(firstChild.data);
      }
    }
  }
  return prices;
};

/**
 * Collects the name candidates below a name <div>: for every a > em path,
 * the `data` of the <em>'s first direct text node, or '' when it has none.
 *
 * @param {object} nameDiv - The <div> whose class equals NAME_CLASS.
 * @returns {string[]} Candidates in document order.
 */
const collectNames = (nameDiv) => {
  const names = [];
  for (const anchor of childTags(nameDiv, 'a')) {
    for (const em of childTags(anchor, 'em')) {
      const textNode = (em.children || []).find((child) => child.type === 'text');
      names.push(textNode ? textNode.data : '');
    }
  }
  return names;
};

/**
 * Builds the `{ price, name }` record for one list item by scanning
 * li > div > div for the price and name containers.
 * A key is only set when a candidate was found; when several candidates
 * exist, the last one in document order wins.
 *
 * @param {object} li - List-item node taken from the page.
 * @returns {{price?: string, name?: string}} Possibly empty record.
 */
const extractGoods = (li) => {
  const goods = {};
  for (const wrapper of childTags(li, 'div')) {
    for (const section of childTags(wrapper, 'div')) {
      if (section.attribs.class === PRICE_CLASS) {
        const prices = collectPrices(section);
        if (prices.length > 0) {
          goods.price = prices[prices.length - 1];
        }
      }
      if (section.attribs.class === NAME_CLASS) {
        const names = collectNames(section);
        if (names.length > 0) {
          goods.name = names[names.length - 1];
        }
      }
    }
  }
  return goods;
};

// Crawler instance; every fetched page is handled by `callback`.
const c = new Crawler({
  maxConnections: 10,
  retries: 3, // retry a failed request up to 3 times
  /**
   * Handles one response: logs the page title, appends every non-empty
   * product record to `listDataRes`, and logs a progress counter.
   *
   * @param {Error|null} error - Request error, if any.
   * @param {object} res - Response; `res.$` is the selector function for the page.
   * @param {Function} done - Must be called once this callback has finished.
   */
  callback: function (error, res, done) {
    try {
      if (error) {
        console.log(error);
        return;
      }

      // NOTE: the selectors and class names are specific to the target page markup.
      const $ = res.$;
      console.log(' ------------>title: ', $('title').text());

      $('.goods-list-v2 li').each((index, li) => {
        const goods = extractGoods(li);
        if (!_.isEmpty(goods)) {
          listDataRes.push(goods);
        }
      });

      console.log(`${cnt++}`); // progress indicator only
    } finally {
      // Always release the queue slot, even on the error path or if parsing throws.
      done();
    }
  }
});

/**
 * Serializes `listDataRes` to ./jd_data/jd_goods_list.json.
 *
 * The output directory is created first when it is missing: `fs.writeFile`
 * creates a missing file but not a missing parent directory, so writing
 * straight away fails with ENOENT on a fresh checkout.
 *
 * Any file-system error is rethrown from the callback, as before.
 */
const writeListJson = () => {
  const outputDir = './jd_data';
  const outputFile = './jd_data/jd_goods_list.json';

  console.log(' =================> 队列为空时，数据处理完成');

  // `recursive: true` also makes mkdir a no-op when the directory already exists.
  fs.mkdir(outputDir, { recursive: true }, (mkdirErr) => {
    if (mkdirErr) {
      throw mkdirErr;
    }
    fs.writeFile(outputFile, JSON.stringify(listDataRes), (writeErr) => {
      if (writeErr) {
        throw writeErr;
      }
      console.log('all requests done and json saved!');
    });
  });
};

// Queue the single URL to crawl.
// Most sites (all but small niche ones) run anti-crawler checks: a browser can
// fetch the page while a server-side crawler is rejected, because the site tries
// to tell browser requests from server requests. The intended workaround is to
// send headers that make the request look like it comes from a client.
// NOTE(review): 'requests' is not a browser-style User-Agent value - confirm it
// is still accepted by the target site.
const requestOptions = {
  url: PATH_URL,
  headers: { 'User-Agent': 'requests' },
};
c.queue(requestOptions);

// When the queue becomes empty, persist the collected data.
c.on('drain', writeListJson);

4、在命令行（cmd）中进入 index.js 所在目录，执行 node index.js，生成的结果如下：