- node安装: https://nodejs.org/en/download/
- 新建文件夹,打开终端,执行 `npm init -y`,然后新建文件 index.js
- 安装 axios 和 cheerio(cheerio 官网地址:https://cheerio.js.org/ ):
npm install axios cheerio
// 抓取豆瓣读书中的数据信息
const fs = require('fs');
const axios = require('axios').default;
const cheerio = require('cheerio');
/**
 * Requests the Douban Books "latest" page.
 *
 * @returns {Promise<*>} The `data` field of the axios response for
 *   https://book.douban.com/latest.
 */
async function getDouBanHTML() {
  const response = await axios.get('https://book.douban.com/latest');
  return response.data;
}
/**
 * Collects the detail-page links found on the listing page.
 *
 * Loads the HTML returned by `getDouBanHTML()` into cheerio, selects the cover
 * anchors and reads each anchor's `href` attribute.
 *
 * @returns {Promise<Array>} The `href` values of the matched anchors.
 */
async function getBookList() {
  const $ = cheerio.load(await getDouBanHTML());
  const coverLinks = $('#content .grid-16-8 li .media__img a');
  const hrefs = coverLinks.map((_index, anchor) => anchor.attribs.href);
  return hrefs.get();
}
/**
 * Fetches one detail page and extracts the book's basic information.
 *
 * @param {string} url - Address of the detail page to request.
 * @returns {Promise<{name: string, imgUrl: (string|undefined), auth: string, publishDate: string}>}
 *   `name` is the trimmed `h1` text, `imgUrl` the `src` of the cover image,
 *   `auth` the text of up to two anchors that directly follow the "作者" label,
 *   and `publishDate` the trimmed text that follows the "出版年" label. When the
 *   page has no such label, or no text follows it, `publishDate` is an empty
 *   string instead of the lookup throwing.
 */
async function getBookDetail(url) {
  const { data } = await axios.get(url);
  const $ = cheerio.load(data);

  // All field labels inside #info; queried once and filtered per keyword.
  const labels = $('#info span.pl');
  const findLabel = (keyword) => labels.filter((i, item) => $(item).text().includes(keyword));

  const name = $('h1').text().trim();
  const imgUrl = $('#mainpic .nbg img').attr('src');

  // Only the anchor right after the label, plus the anchor right after that
  // one, are read; their texts are concatenated without a separator.
  const firstAuthor = findLabel("作者").next('a');
  const auth = firstAuthor.text() + firstAuthor.next('a').text();

  // The date is read from the node immediately after the label. A missing
  // label or missing sibling must not throw: one failing page would otherwise
  // reject the whole batch of detail requests.
  const publishLabel = findLabel("出版年")[0];
  const publishNode = publishLabel ? publishLabel.nextSibling : null;
  const publishDate = publishNode && typeof publishNode.nodeValue === 'string'
    ? publishNode.nodeValue.trim()
    : '';

  return {
    name,
    imgUrl,
    auth,
    publishDate,
  };
}
/**
 * Main routine: gathers every detail-page link, fetches all detail pages
 * concurrently and writes the collected results to `data.json` as JSON.
 *
 * The returned promise settles only after the file has been written, and it
 * rejects when the link lookup, any detail request or the write fails, so the
 * caller can await it or attach a rejection handler.
 *
 * @returns {Promise<void>}
 */
async function fetchAll() {
  const links = await getBookList();
  // Promise.all is fail-fast: a single rejected detail request rejects the batch.
  const books = await Promise.all(links.map((link) => getBookDetail(link)));
  fs.writeFileSync('data.json', JSON.stringify(books));
}
// Start the crawl. A rejection is reported and turned into a non-zero exit
// code instead of being left as an unhandled promise rejection.
fetchAll().catch((err) => {
  console.error('Failed to fetch book data:', err);
  process.exitCode = 1;
});
运行 `node index.js` 后,成功新增 data.json 文件,打开并格式化后内容如下: