基于pyecharts的白蛇2影评分析

电影简介

2021年7月23日,《白蛇2:青蛇劫起》在大陆上映。影片讲述了这样一个故事:南宋末年,小白为救许仙水漫金山,终被法海压在雷峰塔下;小青则意外被法海打入诡异的修罗城幻境。几次危机中小青被神秘蒙面少年所救,她带着出去救出小白的执念历经劫难与成长,同蒙面少年一起寻找离开的办法。

import pandas as pd
# from PIL import Image
from collections import Counter
from pyecharts.charts import Geo
from pyecharts.charts import Bar
from pyecharts.charts import Line
from pyecharts.charts import Pie
from pyecharts.charts import Calendar
# from pyecharts.charts import WordCloud
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
from pyecharts.globals import ThemeType,SymbolType,ChartType

# Load the review data from the Excel workbook in the current directory into a DataFrame.
df = pd.read_excel("./白蛇2.xlsx")

数据大小

# Dimensions of the loaded table as (rows, columns).
df.shape

 

 一共有20584条影评数据,时间范围为2021-08-01至2021-08-31。

查看索引、数据类型和内存信息

# Print the index, the dtype and non-null count of every column, and the memory usage.
df.info()

用户名存在一条缺失,其他各列数据完整,用“未知”填充空值:

# Replace missing user names with the placeholder '未知'.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the in-place form is a chained assignment, which raises a
# warning in pandas 2.x and leaves `df` unchanged once copy-on-write is active.
df['用户名'] = df['用户名'].fillna('未知')

Pyecharts数据可视化

# Horizontal blue-to-red linear gradient used as the bar fill.
color_js = """new echarts.graphic.LinearGradient(0, 0, 1, 0,
    [{offset: 0, color: '#009ad6'}, {offset: 1, color: '#ed1941'}], false)"""

# Number of reviews per rating, ordered from the rarest to the most common rating.
df_star = df.groupby('评分')['评论'].count().sort_values(ascending=True)
x_data = list(map(str, df_star.index))
y_data = df_star.tolist()

# Chart canvas: fixed size on a white background (no explicit theme is set).
bar_init = opts.InitOpts(width="1000px", height="600px", bg_color="white")
gradient_fill = opts.ItemStyleOpts(color=JsCode(color_js))

b1 = Bar(init_opts=bar_init)
b1.add_xaxis(x_data)
b1.add_yaxis('', y_data, itemstyle_opts=gradient_fill)
# Swap the axes so that the ratings are listed vertically (horizontal bars).
b1.reversal_axis()
# Print each bar's value to the right of the bar.
b1.set_series_opts(label_opts=opts.LabelOpts(position='right'))
b1.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='评分等级'),
    xaxis_opts=opts.AxisOpts(name='人/次'),
    title_opts=opts.TitleOpts(title='评分等级分布', pos_left='45%', pos_top="5%"),
)

b1.render_notebook()

 结论:5.0的评分占比达到了56%,超过半数的观众打出了五星好评,四星以上好评更是达到了85%之多,看来大家对这部动漫还是高度认可的。

每日评论量

# Daily review volume
# Vertical blue-to-red linear gradient used as the chart background.
color_js = """new echarts.graphic.LinearGradient(0, 1, 0, 0,
    [{offset: 0, color: '#009ad6'}, {offset: 1, color: '#ed1941'}], false)"""

# Parse the review timestamps once; later cells rely on the datetime dtype
# through the `.dt` accessor.
df['评论时间'] = pd.to_datetime(df['评论时间'], format='%Y/%m/%d %H:%M:%S')

# Number of reviews per day of month.
# NOTE: grouping by day-of-month assumes the data covers a single month
# (the article states 2021-08-01 to 2021-08-31) -- confirm before reusing.
df_day = df.groupby(df['评论时间'].dt.day)['评论'].count()
day_x_data = [str(i) for i in df_day.index]
day_y_data = df_day.values.tolist()

line1 = (
    Line(init_opts=opts.InitOpts(bg_color=JsCode(color_js)))
    .add_xaxis(xaxis_data=day_x_data)
    .add_yaxis(
        series_name="",
        y_axis=day_y_data,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(is_show=True, position="top", color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="八月每日评论量",
            pos_top="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        ),
        xaxis_opts=opts.AxisOpts(
            # Semi-transparent white for the x-axis labels.
            axislabel_opts=opts.LabelOpts(color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(is_show=False),
            axistick_opts=opts.AxisTickOpts(
                length=2,
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="#ffffff63"),
            axisline_opts=opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(width=2, color="#fff")
            ),
            axistick_opts=opts.AxisTickOpts(
                linestyle_opts=opts.LineStyleOpts(color="#ffffff1f"),
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
    )
)
line1.render_notebook()

 每天评论量在8月1日达到峰值(数据不包含7月份),随着时间的推移评论数量逐渐减少,这也符合一般电影观影规律。

每小时评论量

# Number of reviews per hour of day.
# Assumes '评论时间' has already been converted to datetime by an earlier cell.
df_hour = df.groupby(df['评论时间'].dt.hour)['评论'].count()
hours_x_data = list(map(str, df_hour.index))
hours_y_data = df_hour.tolist()

# One highlighted marker each for the maximum and the minimum of the series.
extreme_markers = [
    opts.MarkPointItem(
        type_=kind,
        itemstyle_opts=opts.ItemStyleOpts(color="#06FFD7", border_width=3),
    )
    for kind in ("max", "min")
]

line1 = Line(init_opts=opts.InitOpts(width='1000px', height='400px', theme='dark'))
line1.add_xaxis(xaxis_data=hours_x_data)
line1.add_yaxis(
    series_name="",
    y_axis=hours_y_data,
    symbol_size=6,
    linestyle_opts=opts.LineStyleOpts(color="blue"),
    itemstyle_opts=opts.ItemStyleOpts(
        color="yellow", border_color="green", border_width=3
    ),
)
line1.set_series_opts(markpoint_opts=opts.MarkPointOpts(data=extreme_markers))
line1.set_global_opts(
    title_opts=opts.TitleOpts(
        title="每小时评论量",
        pos_left="center",
        title_textstyle_opts=opts.TextStyleOpts(color="red", font_size=29),
    ),
)
line1.render_notebook()

 从小时分布来看,下午到晚上的评论比较多,尤其是在17:00以后,可见大家在工作时段还都是比较敬业的。第二个评论峰值出现在22:00,这个时间段是熬夜青年比较活跃的时段,小伙伴们的作息时间都比较靠后。

一周各天评论量

# Reviews per weekday
# Weekday number (1 = Monday ... 7 = Sunday) -> Chinese label shown on the x axis.
dic = {1:'星期一',2:'星期二',3:'星期三',4:'星期四',5:'星期五',6:'星期六',7:'星期日'}

# Add the '星期' column as a number; pandas' dayofweek is 0-based (Monday = 0).
# The column stays numeric so that the groups come out in Monday..Sunday order;
# the labels are applied only when the axis data is built. (The original also
# mapped the column to labels first and then overwrote it straight away.)
df['星期'] = df['评论时间'].dt.dayofweek + 1
df1 = df.sort_values('星期', ascending=True)
df_week = df1.groupby(['星期'])['评论'].count()
week_x_data = [dic[i] for i in df_week.index]
week_y_data = df_week.values.tolist()


line1 = (
    Line(init_opts=opts.InitOpts(theme='dark'))
    .add_xaxis(xaxis_data=week_x_data)
    .add_yaxis(
        series_name="",
        y_axis=week_y_data,
        is_smooth=True,
        symbol="circle",
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(color="white"),  # value labels on the points
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
        # Tooltip is switched off for this series.
        tooltip_opts=opts.TooltipOpts(is_show=False),
        areastyle_opts=opts.AreaStyleOpts(color='#FFF8DC', opacity=1),  # cornsilk fill
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="一周各天评论量",
            pos_top="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        ),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="pink"),
            axisline_opts=opts.AxisLineOpts(is_show=False),  # shown by default
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
        yaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(color="yellow"),
            axisline_opts=opts.AxisLineOpts(
                # Deliberately thick and red so the effect of this option is easy to spot.
                linestyle_opts=opts.LineStyleOpts(width=5, color="red")
            ),
            splitline_opts=opts.SplitLineOpts(
                is_show=True, linestyle_opts=opts.LineStyleOpts(color="#ffffff1f")
            ),
        ),
    )
)
line1.render_notebook()

 日历图

# Every calendar day of August 2021 as a 'YYYY-MM-DD' string.
times = [x.strftime('%Y-%m-%d') for x in pd.date_range('20210801', '20210831')]

# Look the counts up by day-of-month instead of pairing the two lists by
# position: if any day had no reviews, a positional zip would shift every
# later count onto the wrong date. Days without reviews are reported as 0.
# `day_x_data` / `day_y_data` come from the daily-volume cell.
counts_by_day = dict(zip(day_x_data, day_y_data))
data = [(t, counts_by_day.get(str(int(t[-2:])), 0)) for t in times]


Cal = (
    Calendar(init_opts=opts.InitOpts(width="800px", height="500px"))
    .add(
        series_name="八月份每日评论量分布情况",
        yaxis_data=data,
        calendar_opts=opts.CalendarOpts(
            pos_top='20%',
            pos_left='15%',
            range_="2021-08",
            cell_size=30,
        ),
        # The tooltip must be an options object, not a bare format string.
        tooltip_opts=opts.TooltipOpts(formatter='{c}'),
    )
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(
            orient="horizontal",
            max_=2000,
            pos_bottom='10%',
            is_piecewise=True,
            pieces=[{"min": 1200},
                    {"min": 800, "max": 1200},
                    {"min": 500, "max": 800},
                    {"min": 300, "max": 500},
                    {"min": 80, "max": 300},
                    {"max": 80}],
            range_color=["#F5F5F5", "#FFE4E1", "#FFCC99", "#F08080", "#CD5C5C", "#990000"]
        ),
        legend_opts=opts.LegendOpts(is_show=True,
                                    pos_top='5%',
                                    item_width=20,  # width of the legend marker
                                    item_height=10,
                                    textstyle_opts=opts.TextStyleOpts(font_size=16, color='#EB1934'),
                                    ),
    )
)
Cal.render_notebook()

 角色热度

主要人物:小白、小青、许仙、法海、司马、孙姐、牛头帮主、蒙面男子、宝青坊主、书生

# Character names whose mentions are counted in the review text.
roles = ['小白', '小青', '许仙', '法海', '司马', '孙姐', '牛头帮主', '蒙面男子', '宝青坊主', '书生']

# Put all reviews into one string. They are joined with a newline so that a
# name cannot be matched across the end of one review and the start of the
# next one; str() guards against non-string cells.
content = '\n'.join(str(i) for i in df['评论'])

# One row per character: (name, number of occurrences), most mentioned first.
roles_num = pd.DataFrame(
    [(role, content.count(role)) for role in roles],
    columns=['名称', '出现次数'],
)
roles_num = roles_num.sort_values(by='出现次数', ascending=False)
roles_num = roles_num.reset_index(drop=True)  # renumber rows after sorting


# Bar chart with a solid red fill.
b2 = (
        Bar()
        .add_xaxis(list(roles_num['名称']))
        .add_yaxis('频次', list(roles_num['出现次数']), itemstyle_opts=opts.ItemStyleOpts(color='red'))
        .set_global_opts(title_opts=opts.TitleOpts(title='影评角色频次分布', pos_top='2%', pos_left='center'),
            legend_opts=opts.LegendOpts(is_show=False),  # hide the legend
            yaxis_opts=opts.AxisOpts(name="频次",
                                     name_gap=30,
                                     name_textstyle_opts=opts.TextStyleOpts(font_size=16, color='red')))

    )
b2.render_notebook()

 观众地域分布

# (city, review count) pairs for the 80 most frequent cities.
cities = df['城市'].to_list()
data = Counter(cities).most_common(80)

# Colour buckets for the piecewise visual map, from the highest to the lowest range.
count_pieces = [
    {'max':500, 'min':401, 'label':'401-500', 'color': '#990000'},
    {'max':400, 'min':301, 'label':'301-400', 'color': '#CD5C5C'},
    {'max':300, 'min':201, 'label':'201-300', 'color': '#F08080'},
    {'max':200, 'min':101, 'label':'101-200', 'color': '#FFCC99'},
    {'max':100, 'min':0, 'label':'0-100', 'color': '#FFE4E1'},
]
white_title = opts.TextStyleOpts(color="#fff", font_size=16)

# Heat map of the review counts on a map of China with a dark background.
geo = Geo(init_opts=opts.InitOpts(width="1000px", height="600px", bg_color="#404a59"))
geo.add_schema(maptype="china")
geo.add("评论数量", data, type_=ChartType.HEATMAP)
geo.set_global_opts(
    title_opts=opts.TitleOpts(
        title="地理位置分布",
        pos_top="2%",
        pos_left="center",
        title_textstyle_opts=white_title,
    ),
    legend_opts=opts.LegendOpts(is_show=False),
    visualmap_opts=opts.VisualMapOpts(
        is_show=True,
        is_piecewise=True,
        min_=0,
        max_=500,
        split_number=5,
        pos_bottom='5%',
        pos_left='5%',
        textstyle_opts=opts.TextStyleOpts(color="#fff"),
        pieces=count_pieces,
    ),
)
geo.render_notebook()

 从地域分布图来看,观众主要分布在北京、天津、上海、重庆、四川、广东、云南等地。

影评词云

# Build the word cloud of the review text

from matplotlib import colors
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

# Join all reviews into one corpus. astype(str) keeps the join from raising
# TypeError on non-string cells (e.g. a review that Excel stored as a number),
# in line with the str() guard used for the character-frequency count.
corpus = ' '.join(df['评论'].astype(str))

corpus = corpus.replace(',', '')

# Custom four-colour palette for the words.
color_list = ['#FFB6C1','#FFFF00','#B22222','#000000']
c = colors.ListedColormap(color_list)

# NOTE: the font path is Windows-specific; on other systems point it at an
# installed font that contains CJK glyphs.
wordcloud = WordCloud(stopwords=STOPWORDS,
                      background_color='white',
                      width=2400,
                      height=2000,
                      colormap=c,
                      font_path=r'C:\Windows\Fonts\simsun.ttc').generate(corpus)


# Draw the review word cloud.
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)  # imshow() only draws the image; plt.show() below displays it
plt.axis('off')
plt.show()

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值