电影简介
2021年7月23日,《白蛇2:青蛇劫起》在大陆上映,主要讲述南宋末年,小白为救许仙水漫金山,终被法海压在雷峰塔下。小青则意外被法海打入诡异的修罗城幻境。几次危机中小青被神秘蒙面少年所救,小青带着出去救出小白的执念历经劫难与成长,同蒙面少年一起寻找离开办法的故事。
import pandas as pd
# from PIL import Image
from collections import Counter
from pyecharts.charts import Geo
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie
from pyecharts.charts import Calendar
# from pyecharts.charts import WordCloud
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType,SymbolType,ChartType
# Load the review workbook into a DataFrame; the cells below read its
# '用户名', '评分', '评论', '评论时间' and '城市' columns.
df = pd.read_excel("./白蛇2.xlsx")
数据大小
# Notebook display expression: (number of rows, number of columns).
df.shape
一共有20584条影评数据,评论时间分布在2021-08-01至2021-08-31之间。
查看索引、数据类型和内存信息
# Print the index range, per-column dtypes and non-null counts, and memory usage.
df.info()
用户名存在一条缺失,其他各列数据完整,用“未知”填充空值:
# Replace the missing user name with the placeholder '未知'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on a selected column operates on an intermediate
# object, which pandas does not guarantee to propagate to `df`
# (it is a no-op under copy-on-write).
df['用户名'] = df['用户名'].fillna('未知')
Pyecharts数据可视化
# Left-to-right linear gradient (blue -> red) used to fill the bars.
color_js = """new echarts.graphic.LinearGradient(0, 0, 1, 0,
[{offset: 0, color: '#009ad6'}, {offset: 1, color: '#ed1941'}], false)"""

# Number of reviews per rating, ordered from the rarest to the most common rating.
df_star = df.groupby('评分')['评论'].count().sort_values(ascending=True)
x_data = list(map(str, df_star.index))
y_data = df_star.values.tolist()

# No theme is passed, so the default pyecharts theme applies; only the
# canvas size and a white background are set.
b1 = Bar(
    init_opts=opts.InitOpts(width="1000px", height="600px", bg_color="white")
)
b1.add_xaxis(x_data)
b1.add_yaxis(
    '',
    y_data,
    itemstyle_opts=opts.ItemStyleOpts(color=JsCode(color_js)),
)
# Swap the axes so the ratings run down the side and the bars are horizontal.
b1.reversal_axis()
b1.set_series_opts(label_opts=opts.LabelOpts(position='right'))
b1.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='评分等级'),
    xaxis_opts=opts.AxisOpts(name='人/次'),
    title_opts=opts.TitleOpts(title='评分等级分布', pos_left='45%', pos_top="5%"),
)
b1.render_notebook()
结论:5.0的评分占比达到了56%,超过半数的观众打出了五星好评,四星以上好评更是达到了85%之多,看来大家对这部动漫还是高度认可的。
每日评论量
# Daily review volume.
# Bottom-to-top linear gradient (blue -> red) used as the chart background.
color_js = """new echarts.graphic.LinearGradient(0, 1, 0, 0,
[{offset: 0, color: '#009ad6'}, {offset: 1, color: '#ed1941'}], false)"""

# Parse the timestamp column exactly once; the cells below rely on its
# `.dt` accessor (hour, dayofweek).
df['评论时间'] = pd.to_datetime(df['评论时间'], format='%Y/%m/%d %H:%M:%S')

# Review count per day of month.
df_day = df.groupby(df['评论时间'].dt.day)['评论'].count()
day_x_data = [str(i) for i in df_day.index]
day_y_data = df_day.values.tolist()

line1 = (
    Line(init_opts=opts.InitOpts(bg_color=JsCode(color_js)))
    .add_xaxis(xaxis_data=day_x_data)
    .add_yaxis(
        series_name="",
        y_axis=day_y_data,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(is_show=True, position="top", color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="八月每日评论量",
            pos_top="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        ),
        xaxis_opts=opts.AxisOpts(
            # Colour of the x-axis tick labels (semi-transparent white).
            axislabel_opts=opts.LabelOpts(color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(
                length=2,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(width=2, color="#fff")
            ),
            axistick_opts=opts.AxisTickOpts(
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
    )
)
line1.render_notebook()
每天评论量在8月1日达到峰值(数据不包含7月份),随着时间的推移评论数量逐渐减少,这也符合一般电影观影规律。
每小时评论量
# Review count per hour of day (0-23).
df_hour = df.groupby(df['评论时间'].dt.hour)['评论'].count()
hours_x_data = list(map(str, df_hour.index))
hours_y_data = df_hour.values.tolist()

# Markers for the busiest and the quietest hour, both styled the same way.
_extreme_markers = [
    opts.MarkPointItem(
        type_=kind,
        itemstyle_opts=opts.ItemStyleOpts(color="#06FFD7", border_width=3),
    )
    for kind in ("max", "min")
]

line1 = Line(init_opts=opts.InitOpts(width='1000px', height='400px', theme='dark'))
line1.add_xaxis(xaxis_data=hours_x_data)
line1.add_yaxis(
    series_name="",
    y_axis=hours_y_data,
    symbol_size=6,
    linestyle_opts=opts.LineStyleOpts(color="blue"),
    itemstyle_opts=opts.ItemStyleOpts(
        color="yellow", border_color="green", border_width=3
    ),
)
line1.set_series_opts(
    markpoint_opts=opts.MarkPointOpts(data=_extreme_markers),
)
line1.set_global_opts(
    title_opts=opts.TitleOpts(
        title="每小时评论量",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="red", font_size=29),
    ),
)
line1.render_notebook()
从小时分布来看,大家一般选择在下午到晚上评论的比较多,尤其是在17:00以后,大家在工作时段还都是比较敬业的。第二次评论峰值在22:00,这个时间段是熬夜青年比较活跃的时段,小伙伴们的作息时间都比较靠后。
一周各天评论量
# Review volume per weekday.
# Weekday number (1 = Monday ... 7 = Sunday) -> Chinese label for the x-axis.
dic = {1:'星期一',2:'星期二',3:'星期三',4:'星期四',5:'星期五',6:'星期六',7:'星期日'}

# Add the '星期' column as a weekday NUMBER so that grouping yields
# Monday..Sunday order; the numbers are translated to labels only for display.
df['星期'] = df['评论时间'].dt.dayofweek + 1
df1 = df.sort_values('星期', ascending=True)
df_week = df1.groupby(['星期'])['评论'].count()
week_x_data = [dic[i] for i in df_week.index]
week_y_data = df_week.values.tolist()

line1 = (
    Line(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis(xaxis_data=week_x_data)
    .add_yaxis(
        series_name="",
        y_axis=week_y_data,
        is_smooth=True,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        # Value labels drawn next to each point.
        label_opts=opts.LabelOpts(color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
        # Tooltip is switched off for this series.
        tooltip_opts=opts.TooltipOpts(is_show=False),
        # Fill the area under the line with cornsilk.
        areastyle_opts=opts.AreaStyleOpts(color='#FFF8DC', opacity=1),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="一周各天评论量",
            pos_top="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        ),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="pink"),
            # Hide the axis line (it is shown by default).
            axisline_opts=opts.AxisLineOpts(is_show=False),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="yellow"),
            axisline_opts=opts.AxisLineOpts(
                # Deliberately thick and red so it is obvious which element
                # this option controls.
                linestyle_opts=opts.LineStyleOpts(width=5, color="red")
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
    )
)
line1.render_notebook()
日历图
# Calendar heat map of the daily review counts for August 2021.
times = [x.strftime('%Y-%m-%d') for x in pd.date_range('20210801', '20210831')]

# Align the counts with the calendar by day of month instead of by position:
# `df_day` only contains days that actually have reviews, so a plain
# zip(times, counts) would shift every later count onto the wrong date as soon
# as one day is missing. Missing days are reported as 0.
_daily_counts = df_day.reindex(range(1, len(times) + 1), fill_value=0)
data = list(zip(times, _daily_counts.tolist()))

Cal = (
    Calendar(init_opts=opts.InitOpts(width="800px", height="500px"))
    .add(
        series_name="八月份每日评论量分布情况",
        yaxis_data=data,
        calendar_opts=opts.CalendarOpts(
            pos_top='20%',
            pos_left='15%',
            range_="2021-08",
            cell_size=30,
        ),
        # tooltip_opts takes a TooltipOpts (or dict), not a bare format string.
        tooltip_opts=opts.TooltipOpts(formatter='{c}'),
    )
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(
            orient="horizontal",
            max_=2000,
            pos_bottom='10%',
            is_piecewise=True,
            pieces=[
                {"min": 1200},
                {"min": 800, "max": 1200},
                {"min": 500, "max": 800},
                {"min": 300, "max": 500},
                {"min": 80, "max": 300},
                {"max": 80},
            ],
            range_color=["#F5F5F5", "#FFE4E1", "#FFCC99", "#F08080", "#CD5C5C", "#990000"],
        ),
        legend_opts=opts.LegendOpts(
            is_show=True,
            pos_top='5%',
            item_width=20,   # width of the legend marker
            item_height=10,  # height of the legend marker
            textstyle_opts=opts.TextStyleOpts(font_size=16, color='#EB1934'),
        ),
    )
)
Cal.render_notebook()
角色热度
主要人物:小白、小青、许仙、法海、司马、孙姐、牛头帮主、蒙面男子、宝青坊主、书生
# How often each main character is mentioned across all reviews.
roles = ['小白', '小青', '许仙', '法海', '司马', '孙姐', '牛头帮主', '蒙面男子', '宝青坊主', '书生']

# Concatenate every review into one string; str() guards against
# non-string cells (numbers, NaN) in the column.
content = ''.join(str(i) for i in df['评论'])

# One (name, substring occurrence count) row per character, most mentioned first.
roles_num = pd.DataFrame(
    [(role, content.count(role)) for role in roles],
    columns=['名称', '出现次数'],
)
roles_num = roles_num.sort_values(by='出现次数', ascending=False)
roles_num = roles_num.reset_index(drop=True)  # renumber rows after sorting

# Plain red bars (no gradient is used for this chart).
b2 = (
    Bar()
    .add_xaxis(list(roles_num['名称']))
    .add_yaxis(
        '频次',
        list(roles_num['出现次数']),
        itemstyle_opts=opts.ItemStyleOpts(color='red'),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='影评角色频次分布', pos_top='2%', pos_left='center'),
        legend_opts=opts.LegendOpts(is_show=False),  # hide the legend
        yaxis_opts=opts.AxisOpts(
            name="频次",
            name_gap=30,
            name_textstyle_opts=opts.TextStyleOpts(font_size=16, color='red'),
        ),
    )
)
b2.render_notebook()
观众地域分布
# Geographic heat map of where the reviewers are located.
cities = df['城市'].to_list()
data = Counter(cities).most_common(80)  # the 80 most frequent cities

# (lower bound, upper bound, colour) for each legend band, highest band first.
_bands = [
    (401, 500, '#990000'),
    (301, 400, '#CD5C5C'),
    (201, 300, '#F08080'),
    (101, 200, '#FFCC99'),
    (0, 100, '#FFE4E1'),
]
_pieces = [
    {'max': upper, 'min': lower, 'label': f'{lower}-{upper}', 'color': colour}
    for lower, upper, colour in _bands
]

_geo_title = opts.TitleOpts(
    title="地理位置分布",
    pos_top="2%",
    pos_left="center",
    title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
)
_geo_visualmap = opts.VisualMapOpts(
    is_show=True,
    is_piecewise=True,
    min_=0,
    max_=500,
    split_number=5,
    pos_bottom='5%',
    pos_left='5%',
    textstyle_opts=opts.TextStyleOpts(color="#fff"),
    pieces=_pieces,
)

geo = Geo(init_opts=opts.InitOpts(width="1000px", height="600px", bg_color="#404a59"))
geo.add_schema(maptype="china")
geo.add("评论数量", data, type_=ChartType.HEATMAP)
geo.set_global_opts(
    title_opts=_geo_title,
    legend_opts=opts.LegendOpts(is_show=False),
    visualmap_opts=_geo_visualmap,
)
geo.render_notebook()
从地域分布图来看,观众主要分布在北京、天津、上海、重庆、四川、广东、云南等地。
影评词云
# Word cloud built from the text of all reviews.
from matplotlib import colors
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Join all reviews with spaces. astype(str) keeps this consistent with the
# role-frequency cell, which also coerces each review with str(): a cell that
# Excel stored as a number (or a missing value) would otherwise make
# str.join raise TypeError.
corpus = ' '.join(df['评论'].astype(str))
corpus = corpus.replace(',', '')

# Custom four-colour palette for the words.
color_list = ['#FFB6C1', '#FFFF00', '#B22222', '#000000']
c = colors.ListedColormap(color_list)

# NOTE(review): font_path points at a Windows system font, which Chinese
# glyphs need in order to render; adjust the path on other platforms.
wordcloud = WordCloud(
    stopwords=STOPWORDS,
    background_color='white',
    width=2400,
    height=2000,
    colormap=c,
    font_path=r'C:\Windows\Fonts\simsun.ttc',
).generate(corpus)

# Render the review word cloud.
plt.figure(figsize=(10, 10))
# imshow() only draws the image onto the figure; outside a notebook it is
# not displayed until plt.show() is called.
plt.imshow(wordcloud)
plt.axis('off')
plt.show()