import matplotlib.pylab as plt
import pandas as pd
from matplotlib.font_manager import FontProperties
from pyecharts.charts import Bar,Pie,Grid
from pyecharts import options
from pyecharts import options as opts
import re
# Font handle for the SimSun TrueType collection in the Windows fonts
# directory (size 12); passed explicitly via ``fontproperties=`` by the
# plotting functions below.
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=12)
# Movie dataset shared by the analysis functions. It is read once, at import
# time, from a hard-coded absolute path and decoded as GB18030.
df = pd.read_csv(r"E:\movie_info(1.3).csv",encoding='gb18030')
# Global matplotlib defaults: use the sans-serif family and resolve it to
# SimHei (presumably so CJK titles/labels render without an explicit font).
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['SimHei']
# Release-volume analysis: line chart of films per year.
def FaXingLiang():
    """Plot the number of films released per year as a line chart.

    Reads the ``年份`` column of the module-level ``df``, drops missing
    values, casts the rest to ``int`` and counts how often each year occurs.
    The chart is drawn onto the current pyplot figure.

    The analysis caption is rendered with ``plt.figtext`` rather than a
    second ``plt.xlabel`` call, so the x-axis keeps its ``年份`` label
    instead of having it overwritten by the caption.

    Returns:
        The ``matplotlib`` pyplot module (``plt``); call ``.show()`` on it to
        display the figure.
    """
    years = df['年份'].dropna().astype(int)
    # value_counts() orders by frequency; re-order by year so the line runs
    # chronologically from left to right.
    films_per_year = years.value_counts().sort_index(ascending=True)

    plt.plot(films_per_year.index, films_per_year.values)
    plt.ylabel('数量', fontproperties=font_set)
    plt.xlabel('年份', fontproperties=font_set)
    plt.title('年份发行量分析', fontproperties=font_set)
    # Caption centred along the bottom edge of the figure (figure coordinates).
    plt.figtext(
        0.5,
        0.01,
        '数据分析:1888年开始数据一致处于上升趋势,且2000年到2016年上升趋势最大,2016年之后呈下降趋势',
        ha='center',
        color='red',
        fontsize=15,
        fontproperties=font_set,
    )
    # Put a tick on every year that appears in the data.
    plt.xticks(films_per_year.index)
    return plt
# Genre analysis: pie chart.
def LeiXing():
    """Plot the share of each film genre as a pie chart.

    Uses the ``类型`` column of the module-level ``df`` (the same frame the
    other analysis functions use) instead of re-reading the CSV from disk on
    every call. A cell may list several genres separated by ``/``; each genre
    is counted once per film it appears on. Missing cells are skipped.

    Returns:
        The ``matplotlib`` pyplot module (``plt``); call ``.show()`` on it to
        display the figure.
    """
    genres = []
    for cell in df["类型"].dropna():
        # "a/b/c" -> ["a", "b", "c"]; a cell without "/" yields itself.
        genres.extend(str(cell).split("/"))
    genre_counts = pd.Series(genres).value_counts()

    # autopct prints each wedge's percentage inside the wedge.
    plt.pie(genre_counts, autopct='%.2f%%')
    plt.legend(labels=genre_counts.index, ncol=2)
    # Equal axis scaling keeps the pie circular rather than elliptical.
    plt.axis("equal")
    plt.title("各个类型电影分析")
    # The x-axis label doubles as the analysis caption under the pie.
    plt.xlabel("数据分析:剧情和喜剧依然是最爱出的电影", color='red', fontsize=15)
    return plt
# Rating analysis: pyecharts bar chart.
def PingFen():
    """Build a bar chart of how many films received each rating.

    Reads the ``评分`` column of the module-level ``df``, drops missing
    values, counts the occurrences of each rating and orders the bars by
    rating value. As a side effect the chart is rendered to HTML and a PNG
    snapshot is written to ``评分分析.png``.

    The title is passed as ``title=...`` (matching ``GuoJiaE``) so the call
    does not depend on the positional parameter order of ``TitleOpts``.

    Returns:
        The configured ``pyecharts`` ``Bar`` instance.
    """
    # Imported inside the function so the snapshot tooling is only required
    # when this chart is actually produced.
    from pyecharts.render import make_snapshot
    from snapshot_selenium import snapshot

    rating_counts = df["评分"].dropna().value_counts().sort_index(ascending=True)

    bar = (
        Bar()
        # Convert to plain Python floats/ints before handing data to pyecharts.
        .add_xaxis([float(score) for score in rating_counts.index])
        .add_yaxis('评分', [int(total) for total in rating_counts])
        .set_global_opts(title_opts=options.TitleOpts(title='评分分析'))
        # Hide the per-bar value labels.
        .set_series_opts(label_opts=options.LabelOpts(is_show=False))
    )
    # bar.render() writes the HTML file and returns its path, which
    # make_snapshot then converts into a PNG image.
    make_snapshot(snapshot, bar.render(), "评分分析.png")
    return bar
# Running-time analysis: pyecharts pie chart.
def _clean_duration(raw):
    """Reduce one raw ``长度`` cell to the characters left after noise removal.

    The first matching rule selects the part of the text to keep:
    text before `` (``, text after ``: ``, text before ``(``, or text before
    the full-width ``(``. Whitespace, ASCII letters, non-word characters and
    CJK ideographs (U+4E00..U+9FA5) are then deleted from that part.
    """
    text = str(raw)
    if " (" in text:
        text = text.split(" (")[0]
    elif ": " in text:
        text = text.split(": ")[1]
    elif "(" in text:
        text = text.split("(")[0]
    elif "(" in text:
        text = text.split("(")[0]
    # Deleting these character classes in one pass is equivalent to deleting
    # them one class at a time.
    return re.sub(r'[\Wa-zA-Z\u4e00-\u9fa5]+', '', text)


def ShiChang():
    """Build a pie chart of how many films have each running time.

    Reads the ``长度`` column of the module-level ``df``, drops missing
    values, cleans every cell with ``_clean_duration`` and converts the
    result to ``int``. Cells that do not reduce to a plain decimal number are
    skipped instead of aborting the whole analysis. Every other cell is
    counted, including those that use a full-width parenthesis.

    As a side effect the chart is rendered to ``地区总数饼图.html``.
    NOTE(review): that file name reads "region totals pie chart" and looks
    copied from the region analysis; it is kept so existing outputs do not
    move.

    Returns:
        The configured ``pyecharts`` ``Pie`` instance.
    """
    minutes = []
    for cell in df['长度'].dropna():
        cleaned = _clean_duration(cell)
        if cleaned.isdecimal():
            minutes.append(int(cleaned))

    counts = pd.Series(minutes, dtype="int64").value_counts()
    counts = counts.sort_values(ascending=True)
    # pyecharts expects plain Python values: string names, int totals.
    pairs = [(str(length), int(total)) for length, total in counts.items()]

    pie = Pie()
    pie.add('总数', data_pair=pairs)
    pie.render("地区总数饼图.html")
    return pie
# Films released per country/region: matplotlib pie chart.
def GuoJia():
    """Plot the share of films per country/region as a pie chart.

    Reads the ``国家`` column of the module-level ``df``. A cell may list
    several countries separated by ``/`` (with or without surrounding
    spaces); every listed country is counted, spaces are removed from the
    names, and missing or empty entries are skipped.

    Countries with 11 films or fewer are merged into a single ``少数国家``
    wedge whose size is the sum of their film counts. Wedge sizes and legend
    labels are taken from the same series, so they always line up.

    As a side effect the full (unmerged) per-country totals are written to
    ``地区.csv`` in the current working directory, with columns ``地区`` and
    ``总数``.

    Returns:
        The ``matplotlib`` pyplot module (``plt``); call ``.show()`` on it to
        display the figure.
    """
    minor_threshold = 11  # counts at or below this are grouped together

    countries = []
    for cell in df['国家'].dropna():
        # Splitting on "/" and then removing spaces handles both "a/b" and
        # "a / b".
        for part in str(cell).split("/"):
            name = part.replace(' ', '')
            if name:
                countries.append(name)

    totals = pd.Series(countries).value_counts()
    pd.DataFrame({"地区": totals.index, "总数": totals.values}).to_csv(
        '地区.csv', index=False
    )

    is_minor = totals <= minor_threshold
    grouped = totals[~is_minor].copy()
    if is_minor.any():
        grouped["少数国家"] = totals[is_minor].sum()
    grouped = grouped.sort_values(ascending=False)

    plt.pie(grouped.values, autopct="%.2f%%")
    plt.title('地区饼图')
    plt.legend(labels=grouped.index, ncol=5, loc=10, bbox_to_anchor=(-0.09, 0.5))
    return plt
# Films released per country/region: pyecharts rose (pie) chart.
def GuoJiaE():
    """Build a rose-type pie chart of film totals per country/region.

    Loads ``E:\\地区.csv`` (decoded as GB18030) and pairs each value of its
    ``地区`` column with the matching value of its ``总数`` column.

    Returns:
        The configured ``pyecharts`` ``Pie`` instance; the caller renders it.
    """
    table = pd.read_csv(r'E:\地区.csv', encoding='gb18030')
    # One [name, total] pair per row, in file order.
    pairs = [[region, total] for region, total in zip(table['地区'], table['总数'])]

    chart = Pie()
    chart.add(
        '',
        pairs,
        radius=["30%", "70%"],
        center=["50%", "50%"],
        rosetype="radius",
    )
    chart.set_global_opts(title_opts=opts.TitleOpts(title="不同国家或地区发行电影数量"))
    # Label each wedge as "<name>: <percentage>%".
    chart.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
    return chart
# Language analysis.
def YuYan():
    """Count films per language, folding dialects into a single bucket.

    Reads the ``语言`` column of the module-level ``df``. A cell may list
    several languages separated by ``/`` (with or without surrounding
    spaces); every listed language is counted and spaces are removed from the
    names.

    After counting:

    * every language other than ``汉语普通话`` whose name contains ``方言``
      or ``话`` is removed and its count is added to one ``方言`` entry;
    * every language whose name contains ``手语`` is removed from the result
      and is not re-added.

    The resulting table is printed.

    Returns:
        A ``pandas.Series`` mapping language name to film count, including
        the aggregated ``方言`` entry.
    """
    dialect_markers = ("方言", "话")
    sign_marker = "手语"

    languages = []
    for cell in df['语言'].dropna():
        # Splitting on "/" and then removing spaces handles both "a/b" and
        # "a / b".
        for part in str(cell).split("/"):
            name = part.replace(" ", "")
            if name:
                languages.append(name)

    counts = pd.Series(languages).value_counts()

    dialect_total = 0
    removed = []
    for name in counts.index:
        if name == "汉语普通话":
            continue
        # Each marker must be tested on its own: ("a" or "b") in name would
        # only ever test "a".
        if any(marker in name for marker in dialect_markers):
            dialect_total += int(counts[name])
            removed.append(name)
        elif sign_marker in name:
            removed.append(name)

    # Drop after the loop, once per label, so no label is removed twice.
    counts = counts.drop(removed)
    counts["方言"] = dialect_total
    print(counts)
    return counts
if __name__ == "__main__":
    # Script entry point. Exactly one analysis is active per run; the other
    # calls are kept commented out so they can be switched on by hand.
    # The matplotlib-based functions return ``plt`` (hence ``.show()``), the
    # pyecharts-based ones return a chart object (hence ``.render(path)``).
    # FaXingLiang().show()
    # LeiXing().show()
    # PingFen().render('评分分析.html')
    # ShiChang().render("电影时长.html")
    # GuoJia().show()
    GuoJiaE().render("不同国家或地区发行电影数量扇形图.html")
    # NOTE(review): YuYan() does not return a chart object, so the
    # ``.render(...)`` variant below would fail as written.
    # YuYan().render("语言分析.html")
    # YuYan()
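# Douban movie data visualization, with charts drawn using both matplotlib and pyecharts.
# (Web-page residue: "latest recommended article published 2023-04-25 18:37:19".)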