Posts in the 《Node爬虫》 (Node Crawler) series:
- Node Crawler: Batch-Downloading Images
- Node Crawler: Batch-Downloading Web Pages
- Node Crawler: Extracting Data from Web Pages
- Node Crawler: Crawling Sina Blog
With the image downloader (pictureDownload.js), the web page downloader (htmlDownload.js), and the page data extractor (htmlExtractor.js) built in the earlier posts, crawling a Sina Blog becomes fairly straightforward (a minimal sketch of the picture downloader is included at the end of this post for reference):
```js
#!/usr/bin/env node
const fs = require('fs')
const ejs = require('ejs')
const util = require('util')
const path = require('path')
const mkdirp = require('mkdirp')
const join = path.join
const readFile = util.promisify(fs.readFile)
const writeFile = util.promisify(fs.writeFile)
const extract = require('./lib/extract')
const pictureDownloader = require('./lib/pictureDownloader')
const blogDir = join(process.cwd(), './blog')
const uid = process.argv[2]

async function fetch(uid) {
  // Create the output directories
  mkdirp.sync(blogDir)
  mkdirp.sync(blogDir + '/imgs')
  mkdirp.sync(blogDir + '/post')
  // Extract the blog data
  let data = await extract(uid)
  // Persist the raw data as JSON
  await writeFile(join(blogDir, 'data.json'), JSON.stringify(data), 'utf8')
  // Batch-download the images
  data.imgs.forEach(({url, name}) => {
    pictureDownloader(url, join(blogDir, 'imgs', name))
  })
  // Load the templates and generate the HTML
  let tplIndex = await readFile(__dirname + '/tpl/index.html', 'utf8')
  let tplBlog = await readFile(__dirname + '/tpl/blog.html', 'utf8')
  // index.html
  await writeFile(join(blogDir, 'index.html'), ejs.render(tplIndex, data), 'utf8')
  // post/[n].html, one page per post
  for (let i = 0; i < data.post.length; i++) {
    await writeFile(join(blogDir, `/post/${i + 1}.html`), ejs.render(tplBlog, data.post[i]), 'utf8')
  }
}

// A Sina Blog UID is a 10-digit number
if (/^\d{10}$/.test(uid)) {
  fetch(uid).catch(err => console.error(err))
} else {
  console.info(`\nPlease provide a valid Sina Blog UID, e.g. dature 1263917762\n`)
}
```
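From the way `data` is consumed above, `extract(uid)` presumably resolves to an object shaped roughly like the sketch below. This is inferred from the snippet, not taken from the actual extractor; all field values are placeholders:

```js
// Hypothetical shape of the object that extract(uid) resolves to,
// inferred from how `data` is used above; values are illustrative only
const data = {
  imgs: [
    { url: 'http://example.com/a.jpg', name: 'a.jpg' } // saved into blog/imgs/
  ],
  post: [
    { title: '...', content: '...' } // rendered into post/1.html via tpl/blog.html
  ]
  // ...plus any top-level fields that tpl/index.html expects
}
```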
The script then regenerates the HTML pages from the two templates, tpl/index.html and tpl/blog.html.
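For illustration only, a minimal tpl/index.html could look like the sketch below. It assumes each entry in `post` has a `title` field, which the snippet above does not confirm:

```html
<!-- Minimal sketch of tpl/index.html (hypothetical markup).
     ejs.render(tplIndex, data) exposes data's properties as locals,
     so `post` below refers to data.post. -->
<ul>
<% post.forEach((p, i) => { %>
  <li><a href="post/<%= i + 1 %>.html"><%= p.title %></a></li>
<% }) %>
</ul>
```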
The complete code is available at https://github.com/junyiz/dature.
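For readers who haven't followed the earlier posts, lib/pictureDownloader can be as small as the following sketch. This is a minimal stand-in, not the implementation from the repo, and it assumes plain http/https image URLs:

```js
const fs = require('fs')
const http = require('http')
const https = require('https')

// Fetch the image at `url` and stream it to the local path `dest`
module.exports = function pictureDownloader(url, dest) {
  const get = url.startsWith('https') ? https.get : http.get
  get(url, res => {
    res.pipe(fs.createWriteStream(dest))
  }).on('error', err => console.error(`failed to download ${url}:`, err.message))
}
```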