cheerio
开发文档
cheerio中文文档 - 简书 (jianshu.com)
简单介绍
cheerio是Node.js的页面抓取模块,是专为服务器端定制的快速、灵活、精简的jQuery核心实现,适合各种Web爬虫程序。
环境配置
npm install axios
npm install cheerio
举一个例子
爬取表情包
const cheerio = require('cheerio');
const axios = require('axios');
const fs = require('fs');
const path = require('path');
// Base URL of the article list; the page number is appended to it by the crawler.
let httpUrl = 'https://www.doutula.com/article/list/?page=';
/**
 * Report how many list pages the crawler should visit.
 *
 * The count is currently fixed at 2. A live lookup (load page 1 and read the
 * link text of the second-to-last `.pagination li` item) is disabled.
 *
 * @returns {Promise<number>} The total number of list pages (always 2).
 */
async function getNum(){
  const totalPages = 2;
  return totalPages;
}
/**
 * Crawl one list page: for every sticker-pack link found on it, create the
 * pack's output directory under `./img/` and download the pack's images.
 *
 * The returned promise settles only after every pack on the page has been
 * processed. A failure in one pack is logged and does not abort the others.
 *
 * @param {number|string} pageNum - Page number appended to `httpUrl`.
 * @returns {Promise<void>}
 * @throws If the list page itself cannot be fetched.
 */
async function getListPage(pageNum){
  // Fetch the list page and load its HTML into cheerio.
  const res = await axios.get(`${httpUrl}${pageNum}`);
  const $ = cheerio.load(res.data);

  // Each matching anchor links to one sticker pack.
  const packJobs = $('#home .col-sm-9>a').toArray().map(async (element, i) => {
    const pageUrl = $(element).attr('href');
    if (!pageUrl) {
      // Nothing to crawl without a target URL.
      return;
    }

    // The pack title is the text before the first digit; when the text has no
    // digit the regex does not match, so fall back to the whole text.
    const title = $(element).find('.random_title').text();
    const match = /(.*?)\d/s.exec(title);
    const rawTitle = match ? match[1] : title;

    // The title comes from a remote page and is used as a directory name:
    // replace characters that are invalid in file names and avoid an empty name.
    const pageTitle = rawTitle.replace(/[\\/:*?"<>|]/g, '_').trim() || `untitled-${pageNum}-${i}`;

    try {
      // `recursive` also creates `./img` when missing and tolerates an existing
      // directory. Awaiting guarantees the directory exists before any download.
      await fs.promises.mkdir(`./img/${pageTitle}`, { recursive: true });
      console.log("路径创建成功");
      // Parse the pack page and download its images.
      await paresPage(pageUrl, pageTitle);
    } catch (err) {
      console.log(err);
    }
  });

  await Promise.all(packJobs);
}
/**
 * Download a single image to disk as a stream.
 *
 * The write stream is opened only after the HTTP response arrives, so a failed
 * request does not leave an empty file behind.
 *
 * @param {string} imgUrl - Absolute URL of the image.
 * @param {string} imgPath - Destination file path.
 * @returns {Promise<void>} Resolves once all data has been flushed to the file.
 */
async function downloadImage(imgUrl, imgPath){
  const res = await axios.get(imgUrl, { responseType: 'stream' });
  await new Promise((resolve, reject) => {
    const ws = fs.createWriteStream(imgPath);
    // `pipe` does not close the destination when the source fails, so do it here.
    res.data.on('error', (err) => {
      ws.destroy();
      reject(err);
    });
    ws.on('error', reject);
    ws.on('finish', resolve);
    res.data.pipe(ws);
  });
}

/**
 * Parse one sticker-pack page and download every image in `.pic-content`
 * to `./img/<title>/<title>-<index><ext>`.
 *
 * The returned promise settles after all downloads have finished. A failed
 * image is logged and does not abort the remaining downloads.
 *
 * @param {string} url - URL of the sticker-pack page.
 * @param {string} title - Pack title, used as directory name and file prefix.
 * @returns {Promise<void>}
 * @throws If the pack page cannot be fetched or the directory cannot be created.
 */
async function paresPage(url,title){
  const res = await axios.get(url);
  const $ = cheerio.load(res.data);

  // Make sure the target directory exists regardless of what the caller did.
  await fs.promises.mkdir(`./img/${title}`, { recursive: true });

  const downloads = $('.pic-content img').toArray().map(async (element, i) => {
    const src = $(element).attr('src');
    if (!src) {
      // An <img> without `src` has nothing to download.
      return;
    }

    let imgUrl;
    try {
      // Resolve relative / protocol-relative sources against the page URL.
      imgUrl = new URL(src, url);
    } catch (err) {
      console.log("图片地址无效:" + src);
      return;
    }

    // Take the extension from the URL path only, so a query string is not
    // mistaken for part of the extension.
    const extName = path.extname(imgUrl.pathname);
    const imgPath = `./img/${title}/${title}-${i}${extName}`;

    try {
      await downloadImage(imgUrl.href, imgPath);
      // Logged only after the file has actually been written.
      console.log("图片加载完成:"+imgPath);
    } catch (err) {
      console.log("图片下载失败:" + imgPath);
      console.log(err);
    }
  });

  await Promise.all(downloads);
}
/**
 * Entry point of the crawler: look up the number of list pages and crawl
 * pages 1..N concurrently.
 *
 * The returned promise settles after every page has been processed. A failure
 * on one page is logged and does not stop the other pages.
 *
 * @returns {Promise<void>}
 * @throws If the page count cannot be determined.
 */
async function spider(){
  const allPageNum = await getNum();
  const pageJobs = [];
  for (let page = 1; page <= allPageNum; page++) {
    // Attach a handler to each page so no rejection is left unhandled.
    pageJobs.push(
      getListPage(page).catch((err) => {
        console.log("页面抓取失败:" + page);
        console.log(err);
      })
    );
  }
  await Promise.all(pageJobs);
}
// Start the crawl; report a failure instead of leaving an unhandled rejection.
spider().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});