def time_change(seconds):
    """Format a duration given in seconds as an ``H:MM:SS`` string.

    Args:
        seconds: Non-negative number of seconds (int or float; fractional
            parts are truncated by the ``%d`` conversions).

    Returns:
        A string such as ``"2:28:08"``. Minutes and seconds are zero-padded
        to two digits; the hour field is not padded and is not wrapped at 24.
    """
    minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return "%d:%02d:%02d" % (hours, minutes, secs)
# Example: time_change(seconds=8888)
# Convert the raw second offsets in the "date" column to H:MM:SS text.
df["date"] = df["date"].apply(time_change)
# Parse that text as datetimes, then format each value back to zero-padded
# HH:MM:SS so the column ends up in the desired time format.
df["date"] = pd.to_datetime(df["date"]).apply(
    lambda stamp: stamp.strftime("%H:%M:%S")
)
机械压缩函数处理comment
# Mechanical compression: collapse immediately repeated substrings.
def yasuo(st):
    """Collapse runs of an immediately repeated substring down to one copy.

    For every unit length ``i`` from 1 up to half of the *original* length,
    the string is scanned left to right; whenever the ``i`` characters at
    position ``j`` are followed by an identical copy, the whole run of
    copies is replaced by a single one.

    Args:
        st: The text to compress.

    Returns:
        The compressed string, e.g. ``"abab" -> "ab"``.
    """
    for i in range(1, len(st) // 2 + 1):
        # The scan range is fixed before the string starts shrinking, so
        # later indices may fall past the end of the shortened string.
        for j in range(len(st)):
            if j >= len(st):
                # Nothing left to compare at or beyond this position.
                break
            if st[j:j + i] == st[j + i:j + 2 * i]:
                # Advance k over every further copy; k ends on the last one.
                k = j + i
                while k < len(st) and st[k:k + i] == st[k + i:k + 2 * i]:
                    k += i
                # Drop all copies before the last, keeping exactly one.
                st = st[:j] + st[k:]
    return st
# Example: yasuo(st='')
# Run the mechanical-compression function over every comment, casting the
# column to str first so each value can be sliced.
df["comment"] = df["comment"].astype("str").map(yasuo)
会员等级打标
# List the distinct membership grades in value_counts() order.
df.grade.value_counts().index.tolist()
# NOTE(review): the next line looks like pasted notebook output of the call above — confirm.
[0, 3, 1, 4, 2, 5, 6, 7, 8]
# Tag each grade with a "v" prefix; the cast to str makes the concatenation valid.
df['grade']='v'+df['grade'].astype('str')# alternative form: ['v'+i for i in df['grade']]
# Show two random rows.
df.sample(2)
episodes
user
comment
grade
date
likecounts
9235
第1期
未知用户
王颖飞好像一个演员
v0
00:27:42
5
57421
面试篇
zxz
彩虹袜
v3
00:13:07
12
数据分析
# General-purpose plotting helper.
def get_pyechart(x, y, chart, title, size, pos, theme):
    """Build a pyecharts chart of the requested kind and render it inline.

    Args:
        x: Category labels (x-axis values, or slice names for a pie).
        y: Values paired element-wise with ``x``.
        chart: One of ``'bar'``, ``'barh'``, ``'pie'`` or ``'line'``.
        title: Chart title.
        size: Font size of the series labels (unused for ``'line'``).
        pos: Position of the series labels (used by ``'bar'``/``'barh'``).
        theme: Theme handed to ``opts.InitOpts``.

    Returns:
        The result of ``render_notebook()`` on the built chart.

    Raises:
        ValueError: If ``chart`` is not one of the supported kinds.
    """
    supported = ('bar', 'barh', 'pie', 'line')
    if chart not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # local further down.
        raise ValueError(
            "unsupported chart type %r; expected one of %s"
            % (chart, ", ".join(supported))
        )

    init_opts = opts.InitOpts(theme=theme)
    if chart in ('bar', 'barh'):
        c = Bar(init_opts=init_opts).add_xaxis(x).add_yaxis("", y)
        if chart == 'barh':
            # Swap the x and y axes to get horizontal bars.
            c.reversal_axis()
        c.set_series_opts(
            label_opts=opts.LabelOpts(font_size=size, position=pos)
        )
    elif chart == 'pie':
        c = (
            Pie(init_opts=init_opts)
            .add("", list(zip(x, y)))
            .set_series_opts(
                label_opts=opts.LabelOpts(
                    formatter="等级{b}占比:{d}%", font_size=size
                )
            )
        )
    else:  # chart == 'line'
        c = (
            Line(init_opts=init_opts)
            .add_xaxis(x)
            .add_yaxis(
                '情感倾向',
                y,
                is_smooth=True,
                is_connect_nones=True,
                areastyle_opts=opts.AreaStyleOpts(opacity=0.5),
            )
        )

    # Global options shared by every chart kind; set once here.
    c.set_global_opts(
        title_opts=opts.TitleOpts(
            title=title, subtitle="数据来源:腾讯视频", pos_left='left'
        ),
        # Font size of the x-axis tick labels.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),
        # Font size of the y-axis tick labels.
        yaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(font_size=13)),
    )
    return c.render_notebook()
# Horizontal bar chart built from df_u: index as labels, values as bar lengths.
# NOTE: df_u is rebound to the rendered chart afterwards, as in the original.
x, y = df_u.index.tolist(), df_u.values.tolist()
df_u = get_pyechart(
    x=x,
    y=y,
    chart='barh',
    title='弹幕发送数量TOP10',
    size=16,
    pos='right',
    theme=ThemeType.DARK,
)
df_u
# Pie chart built from df_g: index as slice names, values as slice sizes.
# NOTE: df_g is rebound to the rendered chart afterwards, as in the original.
x, y = df_g.index.tolist(), df_g.values.tolist()
df_g = get_pyechart(
    x=x,
    y=y,
    chart='pie',
    title='会员等级分布',
    size=14,
    pos='right',
    theme=ThemeType.DARK,
)
df_g
词云图讨论
import stylecloud
import jieba
import os
from IPython.display import Image # 用于在jupyter lab中显示本地图
# Word-segmentation helper.
def get_cut_words(
    content_series,
    stop_words_path=r"D:/Pandas/已学习/如何制作stylecloud词云?/stop_words.txt",
):
    """Segment a Series of text with jieba and drop stop words.

    Args:
        content_series: pandas Series of strings; all entries are joined
            with ``'。'`` and segmented in one pass.
        stop_words_path: UTF-8 text file with one stop word per line.
            Defaults to the path originally hard-coded in this function.

    Returns:
        A list of tokens that are at least two characters long and are not
        stop words.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    # Load the stop-word list into a set so each membership test is O(1).
    with open(stop_words_path, 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}

    # Register domain keywords so jieba keeps them as single tokens.
    for word in ('撒老师', '范丞丞', '第一季'):
        jieba.add_word(word)

    # Custom stop words on top of the file contents.
    stop_words.update(['好像', '真的', '感觉'])

    # Segment the whole corpus at once.
    tokens = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

    # Keep only tokens of length >= 2 that are not stop words.
    return [tok for tok in tokens if len(tok) >= 2 and tok not in stop_words]
# Draw the word cloud.
# Segment every comment into the token list fed to the word cloud.
text1 = get_cut_words(content_series=df['comment'])
# stylecloud takes one space-separated string; the image is written to offer.png.
stylecloud.gen_stylecloud(text=' '.join(text1), max_words=100,
collocations=False,
font_path='字酷堂清楷体.ttf',
icon_name='fas fa-dog',
size=512,# optional, currently disabled: palette='matplotlib.Inferno_9',
output_name='offer.png')
# Display the generated image inline in the notebook.
Image(filename='offer.png')
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.044 seconds.
Prefix dict has been built successfully.
4.人物被提及
丁辉,詹秋怡,王骁,朱一暄,瞿泽林,李晋晔,王颖飞,刘煜成
# Series.str.contains('') cannot serve as a plain truth test, but its boolean
# result can be aggregated: summing it counts the comments that match.
df_talk = ['丁辉', '詹秋怡', '王骁', '朱一暄', '瞿泽林', '李晋晔', '王颖飞', '刘煜成']
dft = [df['comment'].str.contains(person).sum() for person in df_talk]
# Tabulate the counts and order them from most to least mentioned.
df_t = pd.DataFrame({'people': df_talk, 'count': dft}).sort_values(
    'count', ascending=False
)
x, y = df_t['people'].tolist(), df_t['count'].tolist()
# NOTE: df_t is rebound to the rendered chart afterwards, as in the original.
df_t = get_pyechart(
    x=x,
    y=y,
    chart='bar',
    title='被提及次数',
    size=16,
    pos='top',
    theme=ThemeType.DARK,
)
df_t
情感分析
import paddlehub as hub
# Use Baidu's open-source pretrained NLP model to predict sentiment polarity.
senta = hub.Module(name="senta_bilstm")
texts = df['comment'].tolist()
input_data = {'text': texts}
res = senta.sentiment_classify(data=input_data)
# Keep only the positive-class probability of each prediction.
df['pos'] = [x['positive_probs'] for x in res]

# Resample to 15-minute buckets, indexed by the parsed "date" values.
df.index = pd.to_datetime(df['date'])
# Average only the numeric columns: recent pandas versions raise a TypeError
# when mean() meets the string columns instead of silently dropping them.
data = df.select_dtypes(include='number').resample('15min').mean().reset_index()
# Optional table styling, currently disabled:
#   import seaborn as sns
#   color_map = sns.light_palette('orange', as_cmap=True)
#   data.style.background_gradient(color_map)
x = data["date"].to_list()
y = list(data["pos"].round(2))
df_p = get_pyechart(x=x, y=y, chart='line', title='情感倾向', size=16, pos='top', theme=ThemeType.DARK)
df_p