思路:
1.从需要被爬的网站里获取数据
2.依据获取到的数据结构(字段)创建数据库表结构
3.创建数据表
4.插入数据前整理数据
5.将整理好的数据插入到数据库中
6.插入操作结束后关闭数据库连接或者收集错误信息
待优化部分:
1.特殊字符(如表情符号)存入数据库中时会报错,需要优化编码方式
2.错误信息可以通过邮件发给管理员
3.优化同步异步操作,把好多层callback换成Promise
4.包含发邮件和promise的优化版本https://blog.csdn.net/tengxi_5290/article/details/103368115
公共资源:
import {connection} from './../../common/dbConnect.js' // 连接数据库(代码在文章最底端)
const https = require('https') // 网络请求
// 需要被爬的网站网址
const url = 'https://douban.fm/j/v2/songlist/explore?type=hot&genre=0&limit=20&sample_cnt=5' // 这是获取数据的接口url
const urlDir = 'https://douban.fm/explore/songlists' // 这是浏览器访问页面的url
let errorInfoGroup = [] // 存放报错的数据条目
let successInfoGroup = [] // 存放没报错的数据条目
let testInterval // 计时器
实例代码:
// 从需要被爬的网站里获取数据
let getWebData = function (callback) {
https.get(url, (res) => {
let chunks = []
let size = 0
res.on('data', (trunk) => {
chunks.push(trunk)
size += trunk.length
})
res.on('end', () => {
callback(chunks, size)
})
})
}
// 依据获取到的数据结构(字段)创建数据库表结构
let getTableSkelon = function (chunks, size, callback) {
let data = Buffer.concat(chunks, size)
let html = JSON.parse(data.toString()).reverse()
let createString = 'CREATE TABLE IF NOT EXISTS doubanAlbum ('
let createParams = ''
for(let j in html[0]) {
if(j !== 'id') {
createParams = createParams + j + ' VARCHAR (255) DEFAULT NULL, '
} else {
createParams = createParams + 'id INT (10) NOT NULL, '
}
}
createString = createString + createParams + 'PRIMARY KEY (id))'
callback(createString, html)
}
// 创建数据表
let createTable = function (createString, html, callback) {
connection.query(createString, (err, data) => {
if(err) {
throw err
} else {
callback(html)
}
})
}
// 插入数据前整理数据
let fillTable = function (newInfo) {
let tempSongs = []
for (let i in newInfo) {
if(newInfo[i] !== null) {
if(Array.prototype.isPrototypeOf(newInfo[i]) == true) {
if(i == 'sample_songs') {
newInfo['sample_songs'].forEach((item, index, array) => {
tempSongs.push(item.sid)
})
newInfo[i] = tempSongs.join(',')
} else {
newInfo[i] = JSON.stringify(newInfo[i])
}
} else if(typeof(newInfo[i]) == 'object') {
if(newInfo[i].hasOwnProperty('id')) {
newInfo[i] = newInfo[i].id
} else {
newInfo[i] = null
}
} else {
newInfo[i] = newInfo[i]
}
}
}
fillbd(newInfo)
}
// 将整理好的数据插入到数据库中
let fillbd = function (newInfo) {
connection.query('INSERT INTO doubanAlbum SET ?', newInfo, (err, result) => {
if(err) {
errorInfoGroup.push(newInfo)
// throw err
} else {
successInfoGroup.push(newInfo)
}
})
}
// 插入操作结束后关闭数据库连接或者收集错误信息
let endConnection = function (html) {
if(successInfoGroup.length + errorInfoGroup.length == html.length) {
connection.end()
clearInterval(testInterval)
// TODO: 错误数据发邮件告知管理员并写入log留作记录
} else {
console.log('错误数据')
console.log(errorInfoGroup)
}
}
// 调用方法
getWebData((chunks, size) => {
getTableSkelon(chunks, size, (createString, html) => {
createTable(createString, html, () => {
html.forEach((item, index, array) => {
fillTable(item)
})
testInterval = setInterval(endConnection,1000, html);
})
})
})
// codes in dbConnect.js
const mysql = require('mysql')
const dbName = YOUR DATABASE NAME
const connection = mysql.createConnection({
host: 'localhost',
user: 'root',
password: YOUR PASSWORD FOR YOUR DATABASE,
port: 3306,
autoReconnect: true
})
connection.query('USE ' + dbName)
module.exports = {
connection
}