对数据集“Netflix电影电视剧及用户观影数据”的分析处理和可视化
一、寻找数据集
from kaggle:《Netflix Movies and TV Shows》 -------- Shivam Bansal
二、数据集分析
1、首先,导入pandas模块,并用它读取csv数据文件
import pandas as pd
# Load the movie dataset from a CSV file (path relative to the working
# directory) into the DataFrame that the analysis steps below read from.
data = pd.read_csv('movie_data.csv')
In [3] data #数据内容
num_critic_for_reviews duration gross genres num_voted_users num_user_for_reviews language country budget title_year imdb_score
0 723.0 178.0 760505847.0 Action|Adventure|Fantasy|Sci-Fi 886204 3054.0 English USA 237000000.0 2009.0 7.9
1 302.0 169.0 309404152.0 Action|Adventure|Fantasy 471220 1238.0 English USA 300000000.0 2007.0 7.1
2 602.0 148.0 200074175.0 Action|Adventure|Thriller 275868 994.0 English UK 245000000.0 2015.0 6.8
3 813.0 164.0 448130642.0 Action|Thriller 1144337 2701.0 English USA 250000000.0 2012.0 8.5
4 NaN NaN NaN Documentary 8 NaN NaN NaN NaN NaN 7.1
... ... ... ... ... ... ... ... ... ... ... ...
5038 1.0 87.0 NaN Comedy|Drama 629 6.0 English Canada NaN 2013.0 7.7
5039 43.0 43.0 NaN Crime|Drama|Mystery|Thriller 73839 359.0 English USA NaN NaN 7.5
5040 13.0 76.0 NaN Drama|Horror|Thriller 38 3.0 English USA 1400.0 2013.0 6.3
5041 14.0 100.0 10443.0 Comedy|Drama|Romance 1255 9.0 English USA NaN 2012.0 6.3
5042 43.0 90.0 85222.0 Documentary 4285 84.0 English USA 1100.0 2004.0 6.6
5043 rows × 11 columns
2、然后我们处理数据集中IMDB电影评分的数据:
# Count films per IMDb-score band.  The bands are half-open so that every
# score lands in exactly one of them:
#   score1: [0, 5)   score2: [5, 7)   score3: [7, 9)   score4: [9, +inf)
# Only the upper edge of each band is tested; a "> low and < high" test would
# let scores of exactly 5.0 or 7.0 fall through to the top band.
score1, score2, score3, score4 = 0, 0, 0, 0
# Iterate over the column itself so the loop follows the real row count.
for score in data.imdb_score:
    if pd.isna(score):
        # A film without a rating belongs to no band.
        continue
    if score < 5:
        score1 += 1
    elif score < 7:
        score2 += 1
    elif score < 9:
        score3 += 1
    else:
        score4 += 1
导入绘图包,这里我使用的是matplotlib
import matplotlib.pyplot as plt
# Pie chart of the four IMDb-score bands counted above.
sizes = (score1, score2, score3, score4)  # one wedge per score band
labels = ('0-5', '5-7', '7-9', '>9')  # wedge captions, in the same order as sizes
colors = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral')  # wedge colours
explode = (0, 0.1, 0, 0)  # offset of each wedge from the centre; only the second one is pulled out
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',  # print each wedge's share with one decimal place
    shadow=True,
    startangle=50,
)
plt.axis('equal')  # equal axis scaling, so the pie is drawn as a circle
plt.show()
3、再统计1987-2019年的中国电影数量,并做成折线图
from pandas import Series,DataFrame
import numpy as np
# Line chart: number of Chinese films released in each year on the x axis.
first_year, last_year = 1987, 2019
x = np.arange(first_year, last_year + 1)  # x axis: one point per release year
y = [0] * len(x)  # y axis: film count for the matching year in x
# Walk country and release year together instead of keeping a manual row counter.
for country, year in zip(data.country, data.title_year):
    if country != 'China' or pd.isna(year):
        # Other countries, and films without a release year, are not plotted.
        continue
    offset = int(year) - first_year
    # Ignore years outside the axis: a negative offset would wrap around and
    # be counted against a year at the end of the range, and an offset past
    # the end would raise IndexError.
    if 0 <= offset < len(y):
        y[offset] += 1
plt.figure(figsize=(10, 4), dpi=100)  # create the canvas
plt.plot(x, y)
plt.show()  # display the chart
4、接下来统计各国电影的数量
# The rcParams below live on the top-level matplotlib package, which the
# visible part of this file never imports under the name ``mpl``.
import matplotlib as mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False  # both settings work around Chinese text display problems
plt.figure(figsize=(8, 6))
# Count the films of every country in a single pass.  value_counts() leaves
# out rows whose country is missing, so no NaN label with a zero count is
# produced.
country_counts = data.country.value_counts()
labels = country_counts.index.tolist()  # country names
fracs = country_counts.tolist()  # film count of the country at the same position in labels (plain ints)
导入画世界地图所需的包
from pyecharts import options as opts
from pyecharts.charts import Map,Geo
import os
作图
# World map coloured by the number of films per country.
# The [country, count] pairs get their own name: rebinding ``data`` here
# would replace the DataFrame that the later analysis steps still read.
# NOTE(review): the dataset spells countries as e.g. 'USA' and 'UK'; confirm
# these match the region names used by the pyecharts "world" map.
country_film_counts = [[country, count] for country, count in zip(labels, fracs)]
c = (
    Map()
    .add("Netflix全球电影分布", country_film_counts, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        # The USA count (3807) dwarfs every other country, so the colour
        # scale is capped at 200 to keep the other countries distinguishable.
        visualmap_opts=opts.VisualMapOpts(max_=200),
    )
)
c.render_notebook()  # show the chart inline in Jupyter
# os.system("render.html")  # alternative: open the rendered HTML file
5、我们再来统计各类电影的占比
PS:这里我采用依次计数的方式,有更好的方法请告诉我。。。。
# Count how many films list each genre.  A film's ``genres`` value is a
# '|'-separated string such as 'Action|Adventure|Fantasy', so one film can
# add to several counters.
GENRE_NAMES = (
    "Action", "Adventure", "Fantasy", "Sci-Fi", "Mystery", "Family",
    "Thriller", "Documentary", "Romance", "Comedy", "Animation", "Musical",
    "Western", "History", "Drama", "Crime",
)
genre_counts = {name: 0 for name in GENRE_NAMES}
for film_genres in data.genres:
    if not isinstance(film_genres, str):
        # A missing value is not a string; an ``in`` test on it would raise TypeError.
        continue
    for name in GENRE_NAMES:
        if name in film_genres:
            genre_counts[name] += 1

# Keep the individual counters available under their original names, in the
# same order as GENRE_NAMES, for the plotting code that follows.
(action, adventure, fantasy, sciencefiction, mystery, family, thriller,
 documentary, romance, comedy, animation, musical, western, history, drama,
 crime) = (genre_counts[name] for name in GENRE_NAMES)
print(action, adventure, fantasy, sciencefiction, mystery, family, thriller,
      documentary, romance, comedy, animation, musical, western, history,
      drama, crime)
out:1153 923 610 616 500 546 1411 121 1107 1872 242 132 97 207 2594 889
print(" 电影类型饼状图")  # caption printed ahead of the genre pie chart
# Wedge captions and sizes, listed in the same genre order.
labels = (
    'action', 'adventure', 'fantasy', 'sciencefiction', 'mystery', 'family',
    'thriller', 'documentary', 'romance', 'comedy', 'animation', 'musical',
    'western', 'history', 'drama', 'crime',
)
sizes = (
    action, adventure, fantasy, sciencefiction, mystery, family,
    thriller, documentary, romance, comedy, animation, musical,
    western, history, drama, crime,
)
# Four colours repeated four times cover the sixteen wedges.
palette = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral')
colors = palette * 4
explode = (0,) * 16  # no wedge is pulled out of the pie
plt.pie(
    sizes,
    radius=2.5,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=50,
)
plt.show()
5、我们再统计Netflix上电影的预算以及对应的总票房和观众的认可度
# Line chart comparing budget and gross for a slice of 25 films.
data = data.loc[data.budget.notnull()]  # drop the rows that have no budget
# Select by position: the filter above leaves gaps in the index labels, so a
# label lookup such as data.budget[100] raises KeyError when that row was dropped.
selected = data.iloc[100:125]
y1 = selected.budget.tolist()
y2 = selected.gross.tolist()
x = np.arange(len(y1))  # x axis: running number of the film within the slice
plt.figure(figsize=(8, 4))  # create the canvas
plt.plot(x, y1, '.-', label='预算/投入')  # first line: budget
plt.plot(x, y2, '.-', label='票房/收入')  # second line: gross
plt.legend()
plt.xlabel('个数')
plt.ylabel('/十亿美元')
plt.ylim((0, 1000000000))  # show the range from 0 to one billion
plt.title('Netflix电影的预算以及对应的总票房')
plt.show()
6、最后我们分析数据集中,观众点赞数的数据
①总数据集观众点赞数饼状图分布
# Reload the full dataset; the previous step narrowed ``data`` to the rows
# that have a budget.
# NOTE(review): the first load in this file reads 'movie_data.csv' while this
# one reads 'movie_metadata.csv' - confirm which file name is intended.
data = pd.read_csv('movie_metadata.csv')
# Count films per vote-count band.  The bands are half-open so that a count
# sitting exactly on an edge (2000, 10000, 20000, 50000) is still counted:
#   score1: [0, 2000)        score2: [2000, 10000)   score3: [10000, 20000)
#   score4: [20000, 50000)   score5: [50000, +inf)
score1, score2, score3, score4, score5 = 0, 0, 0, 0, 0
# Iterate over the column itself so the loop follows the real row count.
for votes in data.num_voted_users:
    if pd.isna(votes):
        # A film without a vote count belongs to no band.
        continue
    if votes < 2000:
        score1 += 1
    elif votes < 10000:
        score2 += 1
    elif votes < 20000:
        score3 += 1
    elif votes < 50000:
        score4 += 1
    else:
        score5 += 1
# Pie chart of the five vote-count bands counted above.
sizes = (score1, score2, score3, score4, score5)  # one wedge per band
labels1 = ('2千以下', '2千-1万', '1万-2万', '2万-5万', '5万以上')  # wedge captions, same order as sizes
colors = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'gold')
explode = (0,) * 5  # no wedge is pulled out of the pie
plt.pie(
    sizes,
    explode=explode,
    labels=labels1,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=50,
)
plt.axis('equal')  # equal axis scaling, so the pie is drawn as a circle
plt.title('观众点赞饼状图')
plt.show()
②从数据集中随机抽取100个数据作成散点图
import random
# Scatter plot of 100 randomly drawn films (drawn with replacement).
# The row positions are kept under a descriptive name; binding them to
# ``list`` would shadow the builtin and break any later ``list(...)`` call.
# Positions are drawn from the whole dataset rather than from a fixed range.
sample_rows = [random.randrange(len(data)) for _ in range(100)]
# NOTE(review): the title below speaks of likes/votes, but the plotted column
# is num_user_for_reviews - confirm which column is intended.
d2 = [data.num_user_for_reviews.iloc[row] for row in sample_rows]
d1 = np.random.randn(100)  # random x positions that spread the points horizontally
plt.scatter(d1, d2)
plt.title("观众点赞/投票数散点图")
plt.show()  # display the chart, like the other figures in this file
数据集分析完毕