用 Node.js 写了一个爬取电影《心灵奇旅》热评的功能。
//爬取豆瓣心灵奇旅影评,包括用户主页头像
let request = require('request')
let fs = require('fs')
const path = require('path');
// Offset of the first comment to crawl; shared by the functions below.
let startNum = 0;
/**
 * Sends a request for `url` and adapts the callback-style `request`
 * function to a Promise.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<{response: *, body: *}>} Resolves with the response
 *   object and body handed to the callback; rejects with the callback's
 *   error when one is reported.
 */
function reqData(url) {
  return new Promise((resolve, reject) => {
    const onDone = (error, response, body) => {
      // Guard clause: a truthy error means the request failed.
      if (error) {
        reject(error);
        return;
      }
      resolve({ response, body });
    };
    request(url, onDone);
  });
}
/**
 * Fetches one page through `reqData` and extracts comment/reviewer records
 * from its body.
 *
 * Matches are paired purely by order: the n-th comment match goes with the
 * n-th reviewer match, and extraction stops as soon as either pattern has
 * no further match. Record ids continue from the module-level `startNum`
 * (first record is `startNum + 1`).
 *
 * @param {string} url - Page address to fetch.
 * @returns {Promise<Array<{id: number, userName: string, userHome: string,
 *   userIMage: string, comment: string}>>} The extracted records. Rejects
 *   with the string "爬取结果为空!" when nothing could be extracted.
 */
async function req(url) {
  const page = await reqData(url);
  const html = page.body;

  // Text inside each <span class="short"> element.
  const commentPattern = /<span class="short">(.*?)<\/span>/igs;
  // title and href of a reviewer link, plus the src of the <img> after it.
  const reviewerPattern = /<a title="(.*?)" href="(.*?)".*?<img src="(.*?)".*?<\/a>/igs;

  const users = [];
  let seq = startNum;

  for (;;) {
    const commentMatch = commentPattern.exec(html);
    if (!commentMatch) {
      break;
    }
    // Only advance the reviewer pattern once a comment has been found.
    const reviewerMatch = reviewerPattern.exec(html);
    if (!reviewerMatch) {
      break;
    }

    const [, userName, userHome, userIMage] = reviewerMatch;
    seq++;
    users.push({
      id: seq,
      userName,
      userHome,
      userIMage,
      comment: commentMatch[1],
    });
  }

  if (users.length === 0) {
    return Promise.reject("爬取结果为空!");
  }
  return users;
}
/**
 * Crawls the comment pages one after another (20 comments per page, up to
 * offset 200) and saves everything that was collected as ONE JSON array.
 *
 * Writing a separate array per page would leave the output file as
 * `[...][...]...`, which is not parseable JSON, so the records are
 * accumulated and handed to `writeData` once. If a page fails, the records
 * gathered so far are still saved before the error propagates.
 *
 * Advances the module-level `startNum` by 20 after each page; `req` reads
 * it to number the records.
 *
 * Note: `writeData` appends to the target file, so remove an existing
 * comments.json before re-running to keep it a single JSON document.
 *
 * @returns {Promise<void>} Resolves after all pages are crawled and saved;
 *   rejects with whatever `req` rejected with.
 */
async function scrapyComments() {
  const pageSize = 20;
  const maxComments = pageSize * 10;
  const collected = [];

  try {
    while (startNum < maxComments) {
      const goalUrl = `https://movie.douban.com/subject/24733428/comments?start=${startNum}&limit=${pageSize}&status=P&sort=new_score`;
      const users = await req(goalUrl);
      collected.push(...users);
      startNum += pageSize;
    }
  } finally {
    // Runs on success and on failure, so partial results are not lost.
    if (collected.length > 0) {
      writeData("/心灵奇旅/comments.json", JSON.stringify(collected));
    }
  }

  console.log("爬取成功!");
}
// Start the crawl. The returned promise is handled explicitly so that a
// failed run is reported and ends with a failing exit code instead of
// surfacing as an unhandled rejection.
scrapyComments().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Appends `data` to a file located relative to this module's directory,
 * creating any missing parent directories first.
 *
 * The work is done synchronously so the directory is guaranteed to exist
 * before the write starts, and so consecutive calls append in call order.
 * Failures are reported with `console.error` rather than thrown.
 *
 * @param {string} fileName - Target path, joined onto `__dirname`.
 * @param {string|Buffer} data - Content to append to the file.
 * @returns {void}
 */
function writeData(fileName, data) {
  const filePath = path.join(__dirname, fileName);
  try {
    // `recursive: true` creates every missing ancestor and does nothing
    // when the directory is already there.
    fs.mkdirSync(path.dirname(filePath), { recursive: true });
    // Appending creates the file when it does not exist yet.
    fs.appendFileSync(filePath, data);
  } catch (err) {
    console.error(err);
  }
}
注:代码小白,写得不好,请多指教。