对数据集“Netflix电影电视剧及用户观影数据”的分析处理和可视化

最新推荐文章于 2025-04-24 21:53:25 发布

Amar la vida

最新推荐文章于 2025-04-24 21:53:25 发布

阅读量5.4k

点赞数 16

分类专栏：数据分析 数据可视化 数据集处理 文章标签：matplotlib 大数据

本文链接：https://blog.csdn.net/qq_45672511/article/details/116260199

版权

数据集处理同时被 3 个专栏收录

6 篇文章

订阅专栏

数据分析

5 篇文章

订阅专栏

数据可视化

1 篇文章

订阅专栏

对数据集“Netflix电影电视剧及用户观影数据”的分析处理和可视化

一、寻找数据集

from kaggle：《Netflix Movies and TV Shows》 -------- Shivam Bansal

二、数据集分析

1、首先，通过pandas模块读取csv文件

import pandas as pd

# Load the movie table from a CSV file in the current working directory.
data = pd.read_csv('movie_data.csv')
# Notebook cell In [3]: a bare expression, so Jupyter displays the DataFrame.
# (The pasted "In [3]" prompt itself is not Python and was a syntax error.)
data

	num_critic_for_reviews	duration	gross	genres	num_voted_users	num_user_for_reviews	language	country	budget	title_year	imdb_score
0	723.0	178.0	760505847.0	Action|Adventure|Fantasy|Sci-Fi	886204	3054.0	English	USA	237000000.0	2009.0	7.9
1	302.0	169.0	309404152.0	Action|Adventure|Fantasy	471220	1238.0	English	USA	300000000.0	2007.0	7.1
2	602.0	148.0	200074175.0	Action|Adventure|Thriller	275868	994.0	English	UK	245000000.0	2015.0	6.8
3	813.0	164.0	448130642.0	Action|Thriller	1144337	2701.0	English	USA	250000000.0	2012.0	8.5
4	NaN	NaN	NaN	Documentary	8	NaN	NaN	NaN	NaN	NaN	7.1
...	...	...	...	...	...	...	...	...	...	...	...
5038	1.0	87.0	NaN	Comedy|Drama	629	6.0	English	Canada	NaN	2013.0	7.7
5039	43.0	43.0	NaN	Crime|Drama|Mystery|Thriller	73839	359.0	English	USA	NaN	NaN	7.5
5040	13.0	76.0	NaN	Drama|Horror|Thriller	38	3.0	English	USA	1400.0	2013.0	6.3
5041	14.0	100.0	10443.0	Comedy|Drama|Romance	1255	9.0	English	USA	NaN	2012.0	6.3
5042	43.0	90.0	85222.0	Documentary	4285	84.0	English	USA	1100.0	2004.0	6.6
5043 rows × 11 columns

2、然后我们处理数据集中IMDB电影评分的数据：

# Notebook cell In [4]: count how many films fall into each IMDB score bucket.
#
# The buckets are half-open intervals [low, high), so every score lands in
# exactly one of them. With strict comparisons on both sides, a score of
# exactly 5 or 7 matched no branch and was counted in the last ('>9') bucket.
# Counting on the whole column also removes the hard-coded row count (5043).
# Missing scores (NaN) compare False everywhere and are therefore not counted.
imdb_scores = data.imdb_score
score1 = int((imdb_scores < 5).sum())                           # below 5
score2 = int(((imdb_scores >= 5) & (imdb_scores < 7)).sum())    # 5 up to, not including, 7
score3 = int(((imdb_scores >= 7) & (imdb_scores < 9)).sum())    # 7 up to, not including, 9
score4 = int((imdb_scores >= 9).sum())                          # 9 and above

导入绘图包，这里我使用的是matplotlib

import matplotlib.pyplot as plt

# One row per pie wedge: (label, count, colour, offset from the centre).
# Only the '5-7' wedge is pulled out of the pie.
wedges = [
    ('0-5', score1, 'yellowgreen', 0),
    ('5-7', score2, 'gold', 0.1),
    ('7-9', score3, 'lightskyblue', 0),
    ('>9', score4, 'lightcoral', 0),
]
# Transpose the rows into the four parallel tuples that plt.pie expects.
labels, sizes, colors, explode = zip(*wedges)

plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',   # print each share with one decimal place
    shadow=True,
    startangle=50,
)
plt.axis('equal')        # equal axis scaling keeps the pie circular
plt.show()

3、再统计1987-2020的中国电影，并做成折线图

from pandas import Series,DataFrame
import numpy as np

# X axis: one tick per year. The end year is inclusive so the axis matches
# the stated 1987-2020 range (np.arange excludes its stop value).
first_year, last_year = 1987, 2020
x = np.arange(first_year, last_year + 1)

# Release years of the films whose country is 'China'. Rows without a year
# are dropped first: int(NaN) raises ValueError.
china_years = data.loc[data.country == 'China', 'title_year'].dropna().astype(int)

# Films per year, aligned to x. reindex() fills years with no film with 0 and
# discards years outside the axis; indexing a list with `year - 1987` instead
# wrapped earlier years to the end of the list and overflowed on later ones.
y = china_years.value_counts().reindex(x, fill_value=0).tolist()

plt.figure(figsize=(10, 4), dpi=100)     # set up the canvas
plt.plot(x, y)
plt.show()         # display the chart

在这里插入图片描述
4、接下来统计各国电影的数量

import matplotlib as mpl   # `mpl` was used below without ever being imported

# Font settings so that Chinese text in charts is displayed correctly.
mpl.rcParams["font.sans-serif"] = ["SimHei"]
mpl.rcParams["axes.unicode_minus"] = False
plt.figure(figsize=(8,6))

# Number of films per country. value_counts() does the grouping in one pass
# and skips missing countries, so no NaN label (with a count of 0) is produced.
country_counts = data.country.value_counts()
labels = list(country_counts.index)    # country names
fracs = country_counts.tolist()        # film count for the country at the same position

导入画世界地图所需的包

from pyecharts import options as opts  
from pyecharts.charts import Map,Geo
import os

作图

# Build the [country, count] rows passed to the map series.
# The list gets its own name: it was previously assigned to `data`, which
# overwrote the DataFrame that the later genre and budget steps still read.
map_data = [[country, count] for country, count in zip(labels, fracs)]

c = (
    Map()
    .add("Netflix全球电影分布", map_data, "world")
    .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    .set_global_opts(
        title_opts=opts.TitleOpts(),
        # The USA count (3807) is far larger than any other country's, so the
        # colour scale is capped at 200 to keep the other countries readable.
        visualmap_opts=opts.VisualMapOpts(max_=200)
    )
)
c.render_notebook()      # show the chart inline in Jupyter

# os.system("render.html") # 用html打开

在这里插入图片描述

5、我们再来统计各类电影的占比

PS：这里我采用依次计数的方式，有更好的方法请告诉我。。。。

# Genre tags to look for, in the order in which the counters are reported.
genre_tags = (
    "Action", "Adventure", "Fantasy", "Sci-Fi", "Mystery", "Family",
    "Thriller", "Documentary", "Romance", "Comedy", "Animation", "Musical",
    "Western", "History", "Drama", "Crime",
)

# A film is counted once for every tag that occurs in its genres string, so
# a film with several genres adds to several counters.
tally = dict.fromkeys(genre_tags, 0)
for genre_field in data.genres:
    for tag in genre_tags:
        if tag in genre_field:
            tally[tag] += 1

# Unpack into the individual names that the pie-chart step reads.
(action, adventure, fantasy, sciencefiction, mystery, family, thriller,
 documentary, romance, comedy, animation, musical, western, history,
 drama, crime) = (tally[tag] for tag in genre_tags)

print(action,adventure,fantasy,sciencefiction,mystery,family,thriller,documentary,romance,comedy,animation,musical,western,history,drama,crime)

out：1153 923 610 616 500 546 1411 121 1107 1872 242 132 97 207 2594 889

# Print a heading, then draw the genre pie chart in the same way as the score pie.
print("                      电影类型饼状图")

labels = (
    'action', 'adventure', 'fantasy', 'sciencefiction',
    'mystery', 'family', 'thriller', 'documentary',
    'romance', 'comedy', 'animation', 'musical',
    'western', 'history', 'drama', 'crime',
)
sizes = (
    action, adventure, fantasy, sciencefiction,
    mystery, family, thriller, documentary,
    romance, comedy, animation, musical,
    western, history, drama, crime,
)
# The same four colours repeated four times cover the 16 wedges.
colors = ('yellowgreen', 'gold', 'lightskyblue', 'lightcoral') * 4
# No wedge is pulled out of the pie.
explode = (0,) * 16

plt.pie(
    sizes,
    radius=2.5,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=50,
)
plt.show()

在这里插入图片描述
5、我们再统计Netflix上电影的预算以及对应的总票房和观众的认可度
data=data.loc[data.budget.notnull()] # drop the rows whose budget is missing

# Take the rows labelled 100..124 (a label slice includes both ends).
# Filtering keeps the original row labels, so looking each label up with
# data.budget[i] raised KeyError whenever one of those rows had been dropped;
# the slice simply returns the rows in that range that still exist.
sample = data.loc[100:124]
y1 = sample.budget.tolist()
y2 = sample.gross.tolist()
x = np.arange(len(y1))  # one x position per plotted film
plt.figure(figsize=(8,4))   # set up the canvas
plt.plot(x, y1, '.-',label='预算/投入')   # first line: budget
plt.plot(x, y2, '.-',label='票房/收入')   # second line: gross
plt.legend()
plt.xlabel('个数')
plt.ylabel('/十亿美元')
plt.ylim((0,1000000000))
plt.title('Netflix电影的预算以及对应的总票房')
plt.show()

在这里插入图片描述
6、最后我们分析数据集中，观众点赞数的数据
①总数据集观众点赞数饼状图分布

# Reload the table, because the previous step removed the rows without a budget.
# NOTE(review): the first load step reads 'movie_data.csv' — confirm which file name is intended.
data = pd.read_csv('movie_metadata.csv')

# Count the films in each vote-count bucket. The buckets are half-open
# intervals [low, high): with strict comparisons on both sides, a film with
# exactly 2000, 10000, 20000 or 50000 votes was counted in no bucket at all.
# Counting on the whole column also removes the hard-coded row count (5043).
votes = data.num_voted_users
score1 = int((votes < 2000).sum())
score2 = int(((votes >= 2000) & (votes < 10000)).sum())
score3 = int(((votes >= 10000) & (votes < 20000)).sum())
score4 = int(((votes >= 20000) & (votes < 50000)).sum())
score5 = int((votes >= 50000).sum())

labels1 ='2千以下', '2千-1万','1万-2万', '2万-5万', '5万以上'
sizes = score1,score2,score3,score4,score5
colors = 'yellowgreen', 'gold', 'lightskyblue', 'lightcoral','gold'
explode = 0, 0, 0, 0,0
plt.pie(sizes, explode=explode, labels=labels1, colors=colors, autopct='%1.1f%%', shadow=True, startangle=50)
plt.axis('equal')
plt.title('观众点赞饼状图')
plt.show()

在这里插入图片描述

②从数据集中随机抽取100个数据作成散点图

import random

# 100 row labels drawn with replacement from 1..4551 (randint includes both ends).
# The labels were previously stored in a variable named `list`, which shadowed
# the builtin and broke every later list(...) call in the same session.
# NOTE(review): 4551 is hard-coded and smaller than the 5043 rows shown earlier — confirm the intended range.
sample_rows = [random.randint(1, 4551) for _ in range(100)]
# Y values: num_user_for_reviews of each sampled row.
d2 = [data.num_user_for_reviews[row] for row in sample_rows]
# X values: standard-normal random draws, unrelated to the data.
d1 = np.random.randn(100)
plt.scatter(d1,d2)
plt.title("观众点赞/投票数散点图")

在这里插入图片描述

数据集分析完毕