Tencent Danmu (Bullet-Comment) Data Analysis in Practice

General-purpose crawler code:

'''
Date: 2020-12-21 23:27:59
LastEditTime: 2020-12-22 16:36:43
'''
import requests
import json
import time
import pandas as pd
  

def get_danmu_all_page(target_id, vid, filename):
    df = pd.DataFrame()
    # Danmu are served in 30-second windows; an upper bound of 20000 s is enough
    # to cover a full episode (tested by the original author)
    for page in range(15, 20000, 30):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
        # each target_id ends with '%26' (an encoded '&') and each vid starts with '%3D'
        # (an encoded '='), so the concatenation below expands to '...target_id=...&vid=...'
        url = 'https://mfm.video.qq.com/danmu?otype=json&timestamp={0}&target_id={1}vid{2}&count=80'.format(page, target_id, vid)
        print('Fetching window starting at {} s'.format(page))
        html = requests.get(url, headers=headers)
        bs = json.loads(html.text, strict=False)  # strict=False tolerates control characters that break strict JSON parsing
        time.sleep(0.5)
        # walk the comments and pull out the target fields
        try:
            for i in bs['comments']:
                opername = i['opername']           # user id
                content = i['content']             # danmu text
                upcount = i['upcount']             # like count
                user_degree = i['uservip_degree']  # VIP level
                timepoint = i['timepoint']         # position in the video (seconds)
                comment_id = i['commentid']        # danmu id
                cache = pd.DataFrame({'用户id': [opername], '弹幕': [content], '会员等级': [user_degree],
                                      '发布时间': [timepoint], '弹幕点赞': [upcount], '弹幕id': [comment_id]})
                df = pd.concat([df, cache])
        except Exception:
            break  # no 'comments' key: we are past the end of the video
    # df.to_csv(f'{filename}.csv', encoding='utf-8')
    return df
# add your own target_id/vid pairs as needed
target_id = ['6130942571%26', '6164313448%26', '6194952391%26', '6227063464%26']
vid = ['%3Dt0034o74jpr', '%3Dr00346rvwyq', '%3Dd0035rctvoh', '%3Db0035j0tgo0']
filename = ['面试篇', '第1期', '第2期', '第3期']
df = pd.DataFrame()
for i in range(len(vid)):
    df = get_danmu_all_page(target_id=target_id[i], vid=vid[i], filename=filename[i])
    df.insert(0, '所属期数', filename[i])  # tag each row with its episode
    df.to_csv(f'{filename[i]}.csv', index=False)
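The format string looks broken at first glance (no '&' before vid, no '=' after it), but the id lists above already carry those separators percent-encoded. A quick offline check (standard library only) of how one query string assembles and decodes:

from urllib.parse import unquote

query = 'target_id={0}vid{1}&count=80'.format('6130942571%26', '%3Dt0034o74jpr')
print(query)           # target_id=6130942571%26vid%3Dt0034o74jpr&count=80
print(unquote(query))  # target_id=6130942571&vid=t0034o74jpr&count=80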

Visual analysis of 令人心动的offer 2

import pandas as pd
import numpy as np
import pyecharts.options as opts
from pyecharts.charts import Bar, Pie, Line
from pyecharts.globals import ThemeType

Batch-load and merge the data

# import os  # needed for os.listdir below
# path = 'D:/Pandas/csv/'
# # directory that holds the per-episode CSV files
# df_all = pd.DataFrame()
# for i in os.listdir(path):
#     df_one = pd.read_csv(path + f'{i}', engine='python', encoding='utf-8')
#     df_all = df_all.append(df_one, ignore_index=False)  # ignore_index=True would renumber instead of keeping each file's index
# df_all.shape
# # save the merged result
# df_all.to_excel('offer.xlsx', index=False)
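DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a minimal modern equivalent with pd.concat and glob, assuming the same 'D:/Pandas/csv/' layout:

import glob
import pandas as pd

# read every per-episode CSV and stack them into one frame
files = glob.glob('D:/Pandas/csv/*.csv')
df_all = pd.concat((pd.read_csv(f, encoding='utf-8') for f in files), ignore_index=True)
df_all.to_excel('offer.xlsx', index=False)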

Reading the data

df = pd.read_excel('offer.xlsx')
df[:10]
   所属期数  用户id       弹幕                          会员等级  发布时间  弹幕点赞  弹幕id
0  第1期    NaN         47,第一来了                   4        5        23       6732257356663684096
1  第1期    NaN         第一                          1        8        5        6732257393620750336
2  第1期    NaN         哈哈哈,我还以为我是第一个呢    1        7        32       6732257548063721472
3  第1期    NaN         YEYEYEY                       0        8        16       6732257672952315904
4  第1期    NaN         来了来了,                     6        6        18       6732258003787425792
5  第1期    叶湘伦      来了                          0        8        0        6732258629720189952
6  第1期    NaN         来了来了                      0        8        2        6732258673319944192
7  第1期    海浪        来啦                          0        8        0        6732258806970479616
8  第1期    NaN         来咯                          0        5        4        6732258885488543744
9  第1期    熙崽是神仙  第一!                         0        7        9        6732260786762655744

Data processing and cleaning

Renaming columns

Column mapping: 所属期数 → episodes, 用户id → user, 弹幕 → comment, 会员等级 → grade, 发布时间 → date, 弹幕点赞 → likecounts, 弹幕id → dmid

df.rename(columns={'所属期数':'episodes','用户id':'user','弹幕':'comment','会员等级':'grade','发布时间':'date','弹幕点赞':'likecounts','弹幕id':'dmid'},inplace=True)
df.head()
   episodes  user  comment                       grade  date  likecounts  dmid
0  第1期     NaN   47,第一来了                    4      5     23          6732257356663684096
1  第1期     NaN   第一                           1      8     5           6732257393620750336
2  第1期     NaN   哈哈哈,我还以为我是第一个呢     1      7     32          6732257548063721472
3  第1期     NaN   YEYEYEY                        0      8     16          6732257672952315904
4  第1期     NaN   来了来了,                      6      6     18          6732258003787425792
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70798 entries, 0 to 70797
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   episodes    70798 non-null  object
 1   user        26409 non-null  object
 2   comment     70797 non-null  object
 3   grade       70798 non-null  int64 
 4   date        70798 non-null  int64 
 5   likecounts  70798 non-null  int64 
 6   dmid        70798 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 3.8+ MB

Filtering fields

# fill missing usernames with the placeholder 未知用户 ("unknown user")
df['user'] = df['user'].fillna('未知用户')
# keep only the first run of Chinese characters in each danmu
df['comment'] = df['comment'].str.extract(r"([\u4e00-\u9fa5]+)")
df = df.dropna()  # danmu with no Chinese at all (pure emoji/ASCII) become NaN and are dropped
# keep the analysis fields only (drop the trailing dmid column)
df = df.iloc[:, :-1]
df.head()
   episodes  user      comment     grade  date  likecounts
0  第1期     未知用户  第一来了    4      5     23
1  第1期     未知用户  第一        1      8     5
2  第1期     未知用户  哈哈哈      1      7     32
4  第1期     未知用户  来了来了    6      6     18
5  第1期     叶湘伦    来了        0      8     0
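To see what the extraction step does to raw danmu like those above, a toy check:

# the first run of Chinese characters is kept; pure-ASCII danmu become NaN
pd.Series(['47,第一来了', 'YEYEYEY']).str.extract(r"([\u4e00-\u9fa5]+)")
#          0
# 0  第一来了
# 1     NaN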

Time format conversion

def time_change(seconds):
    # convert a raw second count into an H:MM:SS string
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    ss_time = "%d:%02d:%02d" % (h, m, s)
    return ss_time

# apply time_change to the date field
df["date"] = df["date"].apply(time_change)
# normalise to a fixed HH:MM:SS format
df['date'] = pd.to_datetime(df['date'])
df['date'] = df['date'].apply(lambda x: x.strftime('%H:%M:%S'))
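A quick sanity check; the round trip through to_datetime only normalises the zero-padding of the hour:

print(time_change(8888))                                       # 2:28:08
print(pd.to_datetime(time_change(8888)).strftime('%H:%M:%S'))  # 02:28:08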

Mechanical compression of repeated text in comment

# "mechanical compression": collapse consecutively repeated substrings, e.g. 哈哈哈哈 -> 哈
def yasuo(st):
    for i in range(1, int(len(st) / 2) + 1):       # candidate repeat-unit lengths
        for j in range(len(st)):                   # candidate start positions
            if st[j:j + i] == st[j + i:j + 2 * i]:
                k = j + i
                while st[k:k + i] == st[k + i:k + 2 * i] and k < len(st):
                    k = k + i
                st = st[:j] + st[k:]               # keep a single copy of the repeated unit
    return st

# apply the compression to every danmu
df["comment"] = df["comment"].astype("str").apply(yasuo)

Labelling VIP levels

df.grade.value_counts().index.tolist()
[0, 3, 1, 4, 2, 5, 6, 7, 8]
df['grade'] = 'v' + df['grade'].astype('str')  # label the levels as v0..v8
df.sample(2)
       episodes  user      comment             grade  date      likecounts
9235   第1期     未知用户  王颖飞好像一个演员  v0     00:27:42  5
57421  面试篇    zxz       彩虹袜              v3     00:13:07  12

Data analysis

# shared plotting helper
def get_pyechart(x, y, chart, title, size, pos, theme):
    if chart == 'bar':
        c = (
            Bar(init_opts=opts.InitOpts(theme=theme))
            .add_xaxis(x)
            .add_yaxis("", y)
            .set_series_opts(label_opts=opts.LabelOpts(font_size=size, position=pos))
            )
    elif chart == 'barh':
        c = (
            Bar(init_opts=opts.InitOpts(theme=theme))
            .add_xaxis(x)
            .add_yaxis("", y).reversal_axis()  # swap the axes for horizontal bars
            .set_series_opts(label_opts=opts.LabelOpts(font_size=size, position=pos))
            )
    elif chart == 'pie':
        c = (
            Pie(init_opts=opts.InitOpts(theme=theme))
            .add("", list(zip(x, y)))
            .set_series_opts(label_opts=opts.LabelOpts(formatter="等级{b}占比:{d}%", font_size=size))
            )
    elif chart == 'line':
        c = (
            Line(init_opts=opts.InitOpts(theme=theme))
            .add_xaxis(x)
            .add_yaxis('情感倾向', y, is_smooth=True, is_connect_nones=True,
                       areastyle_opts=opts.AreaStyleOpts(opacity=0.5))
            )
    # global options shared by every chart type
    c.set_global_opts(title_opts=opts.TitleOpts(title=title, subtitle="数据来源:腾讯视频", pos_left='left'),
                      xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # x tick label size
                      yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),  # y tick label size
                      )
    return c.render_notebook()

1. Danmu volume per episode

df_e = df['episodes'].value_counts()
df_e
面试篇    18603
第3期    18107
第1期    17551
第2期    15844
Name: episodes, dtype: int64
x = df_e.index.tolist()
y = df_e.values.tolist()
df_e = get_pyechart(x=x,y=y,chart='bar',title='各期弹幕数量',size=16,pos='top',theme=ThemeType.DARK)
df_e

[Chart: 各期弹幕数量 (bar chart)]

2. Who is the danmu machine?

df_u = df['user'].value_counts()[1:10].sort_values(ascending=True)  # [1:10] skips the aggregated 未知用户 bucket at rank 0
df_u
神马不是浮云     78
.          79
圣雪天使       80
。          89
白龙吟        92
为时不晚i      93
momo       93
ベ☆小强呐     103
想太多de猫    135
Name: user, dtype: int64
x = df_u.index.tolist()
y = df_u.values.tolist()
df_u = get_pyechart(x=x,y=y,chart='barh',title='弹幕发送数量TOP10',size=16,pos='right',theme=ThemeType.DARK)
df_u

[Chart: 弹幕发送数量TOP10 (horizontal bar chart)]

df[df["user"]=="想太多de猫"].sample(10)
       episodes  user        comment                 grade  date      likecounts
17104  第1期     想太多de猫  爱吃辣的都是美女        v4     00:51:54  9
20881  第2期     想太多de猫  如果这都是咸鱼          v4     00:09:45  10
1489   第1期     想太多de猫  领带神马的太高端        v4     00:04:42  12
5408   第1期     想太多de猫  谦虚不等于诚实哦        v4     00:15:49  12
57919  面试篇    想太多de猫  大多数父母的想法        v4     00:14:54  7
60506  面试篇    想太多de猫  越看越自卑              v4     00:21:36  13
45714  第3期     想太多de猫  这是绝对的高质量对抗    v0     00:37:00  2
21397  第2期     想太多de猫  瞿泽林这句对人性的关怀  v4     00:11:05  49
59906  面试篇    想太多de猫  假发了解一下            v4     00:19:53  13
39564  第3期     想太多de猫  个人觉得直接问更好      v0     00:21:49  1

3. VIP level distribution

df_g = df['grade'].value_counts().sort_values(ascending=True)
df_g
v8       12
v7      135
v6     1496
v5     2097
v2     2499
v4     2957
v1     3220
v3     3403
v0    54286
Name: grade, dtype: int64
x = df_g.index.tolist()
y = df_g.values.tolist()
df_g = get_pyechart(x=x,y=y,chart='pie',title='会员等级分布',size=14,pos='right',theme=ThemeType.DARK)
df_g

[Chart: 会员等级分布 (pie chart)]

Word cloud

import stylecloud
import jieba
import os
from IPython.display import Image  # display a local image inside Jupyter/JupyterLab

# tokenising helper
def get_cut_words(content_series):
    # load the stop-word list
    stop_words = []
    with open(r"D:/Pandas/已学习/如何制作stylecloud词云?/stop_words.txt", 'r', encoding='utf-8') as f:
        lines = f.readlines()
        for line in lines:
            stop_words.append(line.strip())
    # register custom words so jieba keeps them whole
    my_words = ['撒老师', '范丞丞', '第一季']
    for i in my_words:
        jieba.add_word(i)
    # extra stop words specific to this corpus
    my_stop_words = ['好像', '真的', '感觉']
    stop_words.extend(my_stop_words)
    # tokenise the concatenated danmu
    word_num = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)
    # keep tokens of length >= 2 that are not stop words
    word_num_selected = [i for i in word_num if i not in stop_words and len(i) >= 2]
    return word_num_selected

# draw the word cloud
text1 = get_cut_words(content_series=df['comment'])
stylecloud.gen_stylecloud(text=' '.join(text1), max_words=100,
                          collocations=False,
                          font_path='字酷堂清楷体.ttf',
                          icon_name='fas fa-dog',
                          size=512,
                          # palette='matplotlib.Inferno_9',
                          output_name='offer.png')
Image(filename='offer.png')
Image(filename='offer.png')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.044 seconds.
Prefix dict has been built successfully.

[Word cloud rendered from the danmu text: offer.png]

4. Cast mentions

The eight interns: 丁辉, 詹秋怡, 王骁, 朱一暄, 瞿泽林, 李晋晔, 王颖飞, 刘煜成

# Series.str.contains returns a boolean Series; summing it counts the matching danmu
df_talk = ['丁辉', '詹秋怡', '王骁', '朱一暄', '瞿泽林', '李晋晔', '王颖飞', '刘煜成']
dft = [df['comment'].str.contains(i).sum() for i in df_talk]
df_t = pd.DataFrame(data={'people': df_talk, 'count': dft})
df_t = df_t.sort_values('count', ascending=False)
x = df_t['people'].tolist()
y = df_t['count'].tolist()
df_t = get_pyechart(x=x,y=y,chart='bar',title='被提及次数',size=16,pos='top',theme=ThemeType.DARK)
df_t

[Chart: 被提及次数 (bar chart)]
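The counting trick used above, shown on a toy Series:

# a boolean Series sums to the number of True values
print(pd.Series(['丁辉加油', '王骁好强', '丁辉稳了']).str.contains('丁辉').sum())  # 2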

Sentiment analysis

import paddlehub as hub

# use Baidu's open-source pretrained senta_bilstm model to score sentiment
senta = hub.Module(name="senta_bilstm")
texts = df['comment'].tolist()
input_data = {'text': texts}
res = senta.sentiment_classify(data=input_data)
df['pos'] = [x['positive_probs'] for x in res]
# resample to 15-minute buckets along the video timeline
df.index = pd.to_datetime(df['date'])
data = df.resample('15min').mean().reset_index()

# # optional: shade the table with a seaborn palette
# import seaborn as sns
# color_map = sns.light_palette('orange', as_cmap=True)
# data.style.background_gradient(color_map)
[2020-12-22 16:27:46,410] [    INFO] - Installing senta_bilstm module
[2020-12-22 16:27:46,436] [    INFO] - Module senta_bilstm already installed in C:\Users\Administrator\.paddlehub\modules\senta_bilstm
[2020-12-22 16:27:57,111] [    INFO] - Installing lac module
[2020-12-22 16:27:57,579] [    INFO] - Module lac already installed in C:\Users\Administrator\.paddlehub\modules\lac
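Side note: on pandas >= 2.0, the plain df.resample('15min').mean() above raises a TypeError because the frame still holds string columns (user, comment, grade); a sketch of the equivalent that aggregates only the numeric sentiment score:

# pandas >= 2.0: select the numeric column before taking the mean
data = df.resample('15min')[['pos']].mean().reset_index()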
df[:10]
x = data["date"].to_list()
y = list(data["pos"].round(2))
df_p = get_pyechart(x=x,y=y,chart='line',title='情感倾向',size=16,pos='top',theme=ThemeType.DARK)
df_p

[Chart: 情感倾向 (line chart)]

Dataset:

Link: https://pan.baidu.com/s/1p4O8-SF-IVqnJkH-5Fp3jw
Extraction code: love

Summary

The sentiment-analysis step takes quite a while to run.
