Node 版本 >= 18
C:\Users\admin>nvm use 18.14.0
Now using node v18.14.0 (64-bit)
安装依赖
npm i cheerio request
"dependencies": {
"cheerio": "^1.0.0",
"request": "^2.88.0"
}
实现代码 app.js
const fs = require('fs')
const cheerio = require('cheerio')
const request = require('request')
const path = require('path')
// Number of pages processed so far; advanced by startRequest after each page.
let i = 0
// Index of the paragraph currently being saved; reset to 0 by startRequest
// before every page and advanced by savedContent.
let j = 0
// First article page to crawl.
const url = 'https://www.test.com/en/news/50'
// Pick the core client from the URL scheme. Checking the prefix (rather than
// searching the whole string) keeps an "http://" URL that merely contains
// "https" somewhere in its path from being sent through the https client.
const http = url.startsWith('https:') ? require('https') : require('http')
/**
 * Downloads one article page, saves its content, then moves on to the next page.
 *
 * The page body is collected as UTF-8 text, parsed with cheerio and handed to
 * `savedContent` (after resetting the shared paragraph index `j`). The shared
 * page counter `i` is then incremented and, while it is at most 3, the page
 * `https://www.test.com/en/news/<50 + i>` is requested next.
 *
 * Responses with a non-2xx status are drained and skipped instead of being
 * parsed, so error and redirect bodies are never saved as articles; the crawl
 * still continues with the next page. A request-level error is logged and ends
 * the crawl.
 *
 * @param {string} x - URL of the page to download.
 * @returns {void}
 */
const startRequest = (x) => {
  // Bumps the shared page counter and either requests the next page or ends the crawl.
  const continueCrawl = () => {
    i++
    console.log('抓取:' + i)
    if (i <= 3) {
      startRequest(`https://www.test.com/en/news/${50 + i}`)
    } else {
      console.log('抓取完成')
    }
  }

  http
    .get(x, (res) => {
      const { statusCode } = res
      if (statusCode < 200 || statusCode >= 300) {
        console.log(`***抓取失败***:${x}`, statusCode)
        // Discard the body so the underlying socket is released.
        res.resume()
        continueCrawl()
        return
      }

      let html = ''
      res.setEncoding('utf-8')
      res.on('data', (chunk) => {
        html += chunk
      })
      res.on('error', (err) => {
        console.log(err)
      })
      res.on('end', () => {
        const $ = cheerio.load(html)
        // savedContent starts at the shared paragraph index, so rewind it for each page.
        j = 0
        savedContent($)
        continueCrawl()
      })
    })
    .on('error', (err) => {
      console.log(err)
    })
}
/**
 * Saves the text and images of an already-parsed article page to disk.
 *
 * The article title (`.c-title h1`, with characters that are not allowed in
 * file names removed) names a directory under `<script dir>/data/`. Starting at
 * the module-level paragraph index `j`, the trimmed text of each
 * `.c-info div p` paragraph is appended to `index.txt` in that directory, and
 * every image inside paragraph `j` that has a `src` is downloaded to
 * `<j>-<image index>.jpg` in the same directory. `j` is advanced as paragraphs
 * are processed and equals the paragraph count when the function returns.
 *
 * Write and download failures are logged and do not stop the remaining
 * paragraphs or images from being processed.
 *
 * @param {Function} $ - Cheerio instance loaded with the page HTML.
 * @returns {void}
 */
const savedContent = ($) => {
  const paragraphs = $('.c-info div p')
  const x = $('.c-title')
    .find('h1')
    .text()
    .trim()
    .replace(/[\/\\:*?"<>|]/g, '')
  const file_mkr = path.resolve(__dirname, `./data/${x}`)
  // `recursive: true` also makes this a no-op when the directory already exists.
  fs.mkdirSync(file_mkr, { recursive: true })
  const textFile = path.join(file_mkr, 'index.txt')
  const reportImageError = (error) => {
    console.log(`***保存图片失败***:${x}`, error)
  }

  // Iterate from the shared index instead of recursing once per paragraph.
  for (; j < paragraphs.length; j++) {
    const item = paragraphs[j]
    console.log(paragraphs.length, '--', j)

    const y = $(item).text().trim()
    try {
      // Synchronous append keeps paragraphs in document order; independent
      // asynchronous appends are not guaranteed to complete in call order.
      fs.appendFileSync(textFile, `\n${y}`, 'utf-8')
    } catch (err) {
      console.log(`***写入失败***:${x}`, err)
    }

    const img_obj = $(item).find('img')
    for (let L = 0; L < img_obj.length; L++) {
      const u = $(img_obj[L]).attr('src')
      if (!u) {
        continue
      }
      // Files are named by position rather than by the URL's basename, because
      // Aliyun links that carry format-conversion parameters do not yield a
      // usable file name. Writing into file_mkr (not a cwd-relative path) keeps
      // images next to index.txt regardless of where the script is launched.
      const target = path.join(file_mkr, `${j}-${L}.jpg`)
      try {
        const download = request(u)
        download.on('error', reportImageError)
        const output = fs.createWriteStream(target)
        output.on('error', reportImageError)
        download.pipe(output)
      } catch (error) {
        // request() can throw synchronously, e.g. for a malformed URL.
        reportImageError(error)
      }
    }
  }
}
// Start the crawl with the first page URL.
startRequest(url)
启动脚本
node app