**介绍**:
1.需要创建一个文件夹
2.然后再集中终端中打开
3.然后就是 npm init -y
4.然后就是 npm i nodemon -D
5.然后就是 npm i cheerio -S
6.创建一个index.js文件
7.再你刚才创建的json文件配置一项东西 就是需要启动的选项
8."scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start": "nodemon"
},
9.然后再终端中 npm i 下载全部
10.然后再npm run 能查看到 start这个文件 就是要运行的
11.然后就是 npm start 启动起来
12.然后你创建一个 index.js 文件
13.var data = await getUrl('https://movie.douban.com/chart'); 这里里面需要填写你要爬取的网站,必须是https的的,唯一的不知只能爬出的是静态页面,布局还是很完美的,没有错乱的布局
json文件的配置文件
{
"name": "cheerios",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1",
"start":"nodemon"
},
"dependencies": {
"cheerio": "^1.0.0-rc.12"
},
"devDependencies": {
"nodemon": "^2.0.19"
},
"keywords": [],
"author": "",
"license": "ISC"
}
``
```javascript.
这里就是需爬取网页的 代码粘贴到你的index.js文件里面就可以了
var http = require('http');
var https = require('https');
var cheerio = require('cheerio');
var url = require('url');
var fsp = require('fs/promises');
init();
async function init () {
var data = await getUrl('https://movie.douban.com/chart');
var $ = cheerio.load(data)
$('a').each(async function (i, t) {
var path = $(t).attr('href');
var data = await getUrl(path);
var name = path.split('?')[0].split('/').pop().replace(/\//, "") + ".html"
await fsp.writeFile(name, data);
})
}
function getUrl (path) {
return new Promise((resolve, reject) => {
https.get(path, async (inMsgs) => {
var data = await getDate(inMsgs);
console.log(data);
resolve(data);
})
})
}
function getDate (inMsgs) {
return new Promise(function (resolve, reject) {
var data = ""
inMsgs.on("data", _chunk => data += _chunk);
inMsgs.on("end", () => resolve(data));
})
}