转载自: CDA数据分析师
1、数据获取
https://y.qq.com/n/yqq/album/0009C3rp3Kfwg0.html#comment_box
评论区的内容被封装在 JSON 数据中
复制这条 JSON 数据,粘贴到在线 JSON 解析工具中
分析数据结构
# The comment records live under comment -> commentlist in the response
comment_list = json_data['comment']['commentlist']
# Nickname of each commenter
nick_name = [item.get('nick') for item in comment_list]
# Comment text
content = [item.get('rootcommentcontent') for item in comment_list]
# Comment time
comment_time = [item.get('time') for item in comment_list]
# Number of likes
praise_num = [item.get('praisenum') for item in comment_list]
整个代码
# 导入包
import pandas as pd
import time
import requests
import json
from fake_useragent import UserAgent
def get_qq_comment(page_num):
    """Scrape QQ Music album comments page by page.

    Requests ``page_num`` consecutive pages (page index starting at 0,
    ``pagesize=25``) from the comment endpoint and collects the nickname,
    comment text, comment time and like count of every comment.

    Args:
        page_num: Number of pages to request.

    Returns:
        A ``pandas.DataFrame`` with the columns ``nick_name``, ``content``,
        ``comment_time`` and ``praise_num``. Pages that cannot be downloaded
        or parsed are reported and skipped, so fewer rows than requested may
        be returned; if nothing was fetched the frame is empty but still has
        the four columns.
    """
    columns = ['nick_name', 'content', 'comment_time', 'praise_num']
    # The URL previously contained "¬ice=0": that is "&notice=0" after a
    # stray HTML-entity decoding ("&not" -> "¬"), restored here.
    url_template = (
        'https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg'
        '?g_tk_new_20200303=5381&g_tk=5381&loginUin=0&hostUin=0'
        '&format=json&inCharset=utf8&outCharset=GB2312&notice=0'
        '&platform=yqq.json&needNewCode=0&cid=205360772&reqtype=2&biztype=2'
        '&topid=12924001&cmd=8&needmusiccrit=0&pagenum={}&pagesize=25'
    )
    # Per-page frames are collected and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0.
    frames = []
    for page in range(page_num):
        # Progress output
        print('我正在获取第{}页的信息'.format(page))
        url = url_template.format(page)
        # A random user agent is sent with every request
        headers = {
            'user-agent': UserAgent().random
        }
        # Send the request; a timeout keeps a stalled connection from
        # blocking the whole run.
        try:
            r = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(e)
            continue
        # Parse the response body; skip the page if it is not valid JSON
        try:
            json_data = json.loads(r.text)
        except ValueError as e:
            print(e)
            continue
        # A missing or null comment list is treated as an empty page
        comment_list = (json_data.get('comment') or {}).get('commentlist') or []
        if comment_list:
            frames.append(pd.DataFrame({
                'nick_name': [c.get('nick') for c in comment_list],
                'content': [c.get('rootcommentcontent') for c in comment_list],
                'comment_time': [c.get('time') for c in comment_list],
                'praise_num': [c.get('praisenum') for c in comment_list],
            }))
        # Pause for one second between pages
        time.sleep(1)
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, ignore_index=True)
# Fetch 5 pages of comments and write them to an Excel file (no index column)
df = get_qq_comment(page_num=5)
df.to_excel(
    'C:\\Users\\Administrator\\Desktop\\mojito.xlsx',
    index=False,
)
只要5页内容,试一下就行了
2、数据处理
读入数据
# Load the comments back from the Excel file written by the scraping step
df = pd.read_excel('C:\\Users\\Administrator\\Desktop\\mojito.xlsx')
查看重复值和空值
# Number of fully duplicated rows
print(df.duplicated().sum()) # 0
# Number of missing values per column
print(df.isnull().sum())
'''
nick_name 0
content 0
comment_time 0
praise_num 0
dtype: int64
'''
# info is a method: without the call parentheses no summary is printed
df.info()
时间格式转换
# Inspect the comment_time column before conversion
df['comment_time']
# Original format
'''
0 1592708649
1 1592708347
2 1592708274
3 1592708154
4 1592708011
120 1592636465
121 1592636412
122 1592636120
123 1592636047
124 1592636017
Name: comment_time, Length: 125, dtype: int64
'''
def transform_time(time_second):
    """Format a timestamp in seconds as a local-time 'YYYY-MM-DD HH:MM:SS' string."""
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time_second))
# Convert every timestamp in the column to a formatted date-time string
df['comment_time'] = df['comment_time'].apply(transform_time)
'''
0 2020-06-21 11:04:09
1 2020-06-21 10:59:07
2 2020-06-21 10:57:54
3 2020-06-21 10:55:54
4 2020-06-21 10:53:31
120 2020-06-20 15:01:05
121 2020-06-20 15:00:12
122 2020-06-20 14:55:20
123 2020-06-20 14:54:07
124 2020-06-20 14:53:37
Name: comment_time, Length: 125, dtype: object
'''
content 评论内容初步处理(正则表达式)
import re
# Matches [em]...[/em] tags embedded in the comment text
pattern = re.compile(r'\[em\](.*?)\[/em\]')
# regex=True is required: since pandas 2.0 the default is regex=False, and
# passing a compiled pattern with regex=False raises ValueError.
df['content'] = df.content.str.replace(pattern, '', regex=True)
df.head()
按小时统计评论数,并按时间排序
# The text before the first ':' is 'YYYY-MM-DD HH', used as the hour bucket
hour_bucket = df.comment_time.str.split(':').str[0]
# Count comments per bucket and order the result by the bucket label
comment_num = hour_bucket.value_counts().sort_index()
'''
索引
年 - 月- 日 小时 这个时点评论人数
2020-06-20 14 3
2020-06-20 15 15
2020-06-20 16 6
2020-06-20 17 15
2020-06-20 18 11
2020-06-20 19 12
2020-06-20 20 4
2020-06-20 21 10
2020-06-20 22 5
2020-06-20 23 10
2020-06-21 00 9
2020-06-21 01 1
2020-06-21 02 3
2020-06-21 05 1
2020-06-21 06 1
2020-06-21 07 1
2020-06-21 08 3
2020-06-21 09 7
2020-06-21 10 7
2020-06-21 11 1
Name: comment_time, dtype: int64
'''
产生时间序列数据
# X axis: the hour labels with the '2020-' year prefix removed
hour_labels = comment_num.index.to_list()
x_line1 = [label.replace('2020-', '') for label in hour_labels]
'''
['06-20 14',
'06-20 15',
'06-20 16',
'06-20 17',
'06-20 18',
...
'''
# Y axis: number of comments in each hour
y_line1 = comment_num.values.tolist()
# [3, 15, 6, 15, 11, 12, 4, 10, 5, 10, 9, 1, 3, 1, 1, 1, 3, 7, 7, 1]
3、数据可视化
导入所需包
import jieba
import stylecloud
from pyecharts.charts import Pie, Bar, Map, Line, WordCloud, Page
from pyecharts import options as opts
from pyecharts.globals import SymbolType
Mojito评论人数走势图
# Mark-point options: flag the maximum and the minimum of the series
mark_points = opts.MarkPointOpts(data=[
    opts.MarkPointItem(type_='max', name='最大值'),
    opts.MarkPointItem(type_='min', name='最小值'),
])

# Build the chart step by step instead of one chained expression
line_chart = Line()
line_chart.add_xaxis(x_line1)  # X axis
line_chart.add_yaxis(
    '',       # series name
    y_line1,  # series data
    markpoint_opts=mark_points,
)
# Global options: chart title and rotated X-axis labels
line_chart.set_global_opts(
    title_opts=opts.TitleOpts('Mojito评论人数走势图'),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate='30')),
)
# Series options: hide the value labels and draw the line with width 3
line_chart.set_series_opts(
    label_opts=opts.LabelOpts(is_show=False),
    linestyle_opts=opts.LineStyleOpts(width=3),
)
# As before, c holds the value returned by render()
c = line_chart.render("line_base.html")
QQ音乐评论词云图
def get_cut_words(content_series):
    """Tokenize comment texts with jieba and filter out stop words.

    Args:
        content_series: Iterable of comment texts (for example ``df.content``).
            Every item is converted with ``str`` before the texts are joined.

    Returns:
        A list of the tokens that are at least two characters long and are
        neither in the stop-word file nor in the custom stop-word list.
    """
    # Read the stop-word file, one entry per line. A set gives constant-time
    # membership checks in the filter below.
    # NOTE(review): the file is opened with the platform default encoding,
    # as before - confirm this matches the encoding of stop_words.txt.
    with open(r"C:\\Users\\Administrator\\Desktop\\stop_words.txt", 'r') as f:
        stop_words = {line.strip() for line in f}

    # Add custom keywords to jieba's dictionary
    my_words = ['周杰伦', '一首歌', '好好听', '方文山', '30多岁']
    for word in my_words:
        jieba.add_word(word)

    # Custom stop words on top of the ones read from the file
    my_stop_words = ['歌有', '真的', '这首', '一首', '一点',
                     '反正', '一段', '一句', '首歌', '啊啊啊',
                     '哈哈哈', '转发', '微博', '那段', '他会'
                     ]
    stop_words.update(my_stop_words)

    # Segment the text passed in by the caller; the previous code ignored
    # the parameter and always read the global ``df``.
    content = ';'.join(str(c) for c in content_series)
    word_num = jieba.lcut(content)

    # Keep tokens of two or more characters that are not stop words
    return [w for w in word_num if len(w) >= 2 and w not in stop_words]
# Tokenize the comments, then render the word cloud image from the tokens
text1 = get_cut_words(content_series=df.content)
stylecloud.gen_stylecloud(
    text=' '.join(text1),
    max_words=1000,
    collocations=False,
    font_path='C:\\Windows\\Fonts\\simhei.ttf',
    icon_name='fas fa-music',
    size=624,
    gradient='vertical',
    palette='cartocolors.diverging.TealRose_2',
    output_name='C:\\Users\\Administrator\\Desktop\\QQ音乐评论词云图.png',
)