1. 数据爬取
爬虫部分主要是调用官方API,本次用到的API主要有两个:
工具:
Python3.6
sublime3
MySQL(数据存储)
scrapy(数据清洗)
pyecharts(可视化工具库)
* 关于网易云音乐官方API,后期会整理一下做个汇总放在GitHub上。
1.1 评论爬取
实际操作过程中,网易云官方对API的请求频率是有限制的,有条件的可以通过更换代理IP来应对反爬。本次采用的是单线程爬取,所以IP被封得并不频繁;后面会对代码进行重构,实现多线程+更换IP来加快爬取速度。
根据获取评论的API,请求URL有3个可变部分:歌曲ID、每页限制数limit和评论总偏移量offset。通过API分析得知:当offset=0时,返回的JSON数据中包含评论总数量total,所以根据API可设计爬虫如下:
# -*- coding:utf8 -*-
# python3.6
from urllib import request
import json
import pymysql
from datetime import datetime
import re
# Comment-list endpoint template; the three placeholders are the song ID,
# the page size (limit) and the comment offset.
ROOT_URL = 'http://music.163.com/api/v1/resource/comments/R_SO_4_%s?limit=%s&offset=%s'

# Maximum number of comments requested per page.
LIMIT_NUMS = 50

# Database name.
DATABASE = ''

# Database table name.
TABLE = ''

# Table layout:
#   id(int)          commentId(varchar)
#   content(text)    likedCount(int)
#   userId(varchar)  time(datetime)

# Removes newline, tab, carriage-return and '/' characters from comment text;
# the author's stated intent is to avoid errors when inserting into the database.
PATTERN = re.compile(r'[\n\t\r\/]')
def getData(url, timeout=10):
    """Download one page of comments and extract the fields to be stored.

    Args:
        url: Fully formatted request URL. A falsy value short-circuits
            without any network access.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        A ``(total, datas)`` tuple. ``total`` is the integer value of the
        response's ``total`` field and ``datas`` is a list of dicts with the
        keys ``commentId``, ``content``, ``time``, ``likedCount`` and
        ``userId``. ``(None, None)`` is returned when ``url`` is falsy or
        when the download/parsing fails, so callers can always unpack the
        result into two names.
    """
    if not url:
        return None, None
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        "Host": "music.163.com",
    }
    print('Crawling>>> ' + url)
    try:
        req = request.Request(url, headers=headers)
        # The context manager closes the HTTP response even if reading fails.
        with request.urlopen(req, timeout=timeout) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        total = int(js['total'])
        datas = [
            {
                'commentId': c['commentId'],
                # Strip characters matched by PATTERN before storing the text.
                'content': PATTERN.sub('', c['content']),
                # The raw value is divided by 1000 before conversion
                # (presumably a millisecond timestamp -- confirm against the API).
                'time': datetime.fromtimestamp(c['time'] // 1000),
                'likedCount': c['likedCount'],
                'userId': c['user']['userId'],
            }
            for c in js['comments']
        ]
        return total, datas
    except Exception as e:
        # Best-effort crawl: report the failure and hand back an unpackable
        # pair instead of an implicit bare None (which crashed the caller's
        # ``total, data = getData(...)`` with a TypeError).
        print('Down err>>> ', e)
        return None, None
def saveData(data):
    """Insert crawled comments into the MySQL table named by ``TABLE``.

    Each row is given ``max(id) + 1`` as its id and is committed on its own,
    so one failing row is reported and skipped without affecting the others.

    Args:
        data: Iterable of dicts with the keys ``commentId``, ``content``,
            ``likedCount``, ``time`` and ``userId``. A falsy value (``None``
            or an empty list) is a no-op.

    Returns:
        None.
    """
    if not data:
        return None
    # The charset must be utf8mb4 so that emoji in comments can be stored.
    conn = pymysql.connect(host='localhost', user='****', passwd='****', db='****', charset='utf8mb4')
    try:
        cursor = conn.cursor()
        try:
            sql = 'insert into ' + TABLE + ' (id,commentId,content,likedCount,time,userId) VALUES (%s,%s,%s,%s,%s,%s)'
            max_id_sql = 'SELECT max(id) FROM ' + TABLE
            for d in data:
                try:
                    cursor.execute(max_id_sql)
                    id_ = cursor.fetchone()[0]
                    # max(id) is NULL (None) on an empty table; start from 0
                    # so the very first insert gets id 1 instead of failing.
                    next_id = (id_ or 0) + 1
                    cursor.execute(sql, (next_id, d['commentId'], d['content'], d['likedCount'], d['time'], d['userId']))
                    conn.commit()
                except Exception as e:
                    # Best-effort: report the failing row and continue.
                    print('mysql err>>> ', d['commentId'], e)
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if something unexpected escapes.
        conn.close()
if __name__ == '__main__':
songId = input('歌曲ID:').strip()
total,data = getData(ROOT_URL%(songId, LIMIT_NUMS, 0))
saveData(data)
if total:
for i in range(1, tot