为了给电影制作提供更好的数据支持,要求大家以 TMDB 5000 Movie Dataset 数据集为研究对象,使用 pyecharts 库完成以下数据可视化任务:比较 2012-2016 年期间 Universal Pictures 和 Paramount Pictures 两家影视公司每年制作各类型电影的数量。(参考课本 P208 时间线轮播多图)
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Line,Timeline
import warnings
warnings.filterwarnings('ignore')
# Load the preprocessed workbook into a DataFrame; the path is relative to
# the current working directory.
data_path = '第13周课后作业T2预处理后的数据.xlsx'
df = pd.read_excel(data_path)
# Bare expression: displays the frame when run as a notebook cell.
df
df结果
# Discard rows with any missing value, then keep films released in 2012-2016
# (both bounds inclusive).
df = df.dropna()
# drop=True discards the old row labels instead of inserting them as an
# extra 'index' column, which would clutter df and could clash with an
# existing column of that name.
df = df[df['year'].between(2012, 2016)].reset_index(drop=True)

# Literal (non-regex) substring tests on the production-company text.
company_text = df['production_companies']
has_universal = company_text.str.contains('Universal Pictures', regex=False)
has_paramount = company_text.str.contains('Paramount Pictures', regex=False)

# Build the label by concatenation: 'U' (Universal only), 'P' (Paramount
# only), 'UP' (both studios), '' (neither studio).
company_code = (has_universal.map({True: 'U', False: ''})
                + has_paramount.map({True: 'P', False: ''}))
df = df.assign(production_companies=company_code)

# Remove films made by neither studio and renumber the rows from 0.
df = df[df['production_companies'] != ''].reset_index(drop=True)
df
处理之后的df
# Collect every distinct genre name; the 'genres' column stores the names
# of one film joined by '|'.
genre_tokens = df['genres'].str.split('|')
genres_set = {name for tokens in genre_tokens for name in tokens}
genres_list = sorted(genres_set)

for genre in genres_list:
    # Add one 0/1 indicator column per genre. Membership is tested against
    # the split tokens rather than with str.contains, so a genre whose name
    # is a substring of another (or contains regex metacharacters) is not
    # miscounted. `g=genre` binds the current name into the lambda.
    df[genre] = genre_tokens.apply(lambda tokens, g=genre: int(g in tokens))
# Rows labelled 'U' or 'UP' in production_companies, i.e. every film
# counted for Universal Pictures.
df_Uni = df[df['production_companies'].isin(['U', 'UP'])]
# Keep only the per-genre indicator columns, indexed by release year.
genre_year_Uni = df_Uni.set_index('year')[genres_list]
# Sum the indicators per year: one row per year, one column per genre.
genresdf_Uni = genre_year_Uni.groupby(level='year').sum()
genresdf_Uni
genresdf_Uni显示
# Rows labelled 'P' or 'UP' in production_companies, i.e. every film
# counted for Paramount Pictures.
df_Par = df[df['production_companies'].isin(['P', 'UP'])]
# Keep only the per-genre indicator columns, indexed by release year.
genre_year_Par = df_Par.set_index('year')[genres_list]
# Sum the indicators per year: one row per year, one column per genre.
genresdf_Par = genre_year_Par.groupby(level='year').sum()
genresdf_Par
genresdf_Par显示
# Years shown on the timeline (2012-2016 inclusive).
chart_years = range(2012, 2017)

# Make sure each studio has a row for every charted year. Without this, a
# year in which a studio released no film would be absent from the grouped
# table, and positional lookup would plot another year's numbers (or raise
# IndexError). Missing years are filled with zero counts.
uni_by_year = genresdf_Uni.reindex(chart_years, fill_value=0)
par_by_year = genresdf_Par.reindex(chart_years, fill_value=0)

tl = Timeline(init_opts=opts.InitOpts(width='1200px', height='650px'))

# One line chart per year: genres on the x axis, film counts on the y axis.
for year in chart_years:
    line = (
        Line()
        .add_xaxis(genres_list)
        # .tolist() converts numpy integers to native Python ints, which
        # pyecharts requires for the values it serialises.
        .add_yaxis("Universal Pictures", uni_by_year.loc[year].tolist())
        .add_yaxis("Paramount Pictures", par_by_year.loc[year].tolist())
        .set_global_opts(
            title_opts=opts.TitleOpts("两家公司每年制作各类型电影数量对比"),
            xaxis_opts=opts.AxisOpts(
                name_location="start",
                # Rotate the genre labels so they do not overlap.
                axislabel_opts=opts.LabelOpts(rotate=30),
                name_textstyle_opts=opts.TextStyleOpts(font_size=30),
            ),
            yaxis_opts=opts.AxisOpts(
                name="电影数量(部)",
                name_location="center",
                name_gap=30,
            ),
        )
    )
    tl.add(line, "{}年".format(year))

# Vertical timeline control placed at the right edge of the canvas.
tl.add_schema(pos_right='10px', orient='vertical', width='80px', height='500px')
tl.render("两公司对比图轮播多图T2.html")
生成结果图: