Netflix Shows and Movies - Exploratory Analysis
搬运Kaggle上Shivam Bansal对网飞TV和movie的数据分析
其中数据可视化主要使用了plotly库
导入数据
1.pd.to_datetime 将日期字符串转换为datetime类型(之后可通过 .dt.year / .dt.month 取出日期的年份和月份)
2.新增列可以直接 df['新的列名'] = df['date_added'].dt.month(对整列数据赋值)
3.df.apply搭配lambda函数使用,对固定列的每行元素进行修改
lambda x(输入参数): x['duration'].split(" ")[0](返回值),冒号后面接一个简单的表达式
4.df.head() 返回数据的前五行
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import pandas as pd
# Load the Netflix catalogue and derive the helper columns used by the plots below.
df = pd.read_csv("../input/netflix-shows/netflix_titles_nov_2019.csv")

## add new features in the dataset
# Parse the "date added" text into datetimes so year and month can be extracted.
df["date_added"] = pd.to_datetime(df["date_added"])
df["year_added"] = df["date_added"].dt.year
df["month_added"] = df["date_added"].dt.month

# Split the raw duration text into two mutually exclusive columns:
#   season_count - leading token when the text mentions "Season", else ""
#   duration     - leading token for every other row, else ""
# Column-level string operations replace the previous row-wise df.apply(axis=1):
# they avoid building a Series per row and treat a missing duration as ""
# instead of raising TypeError on `"Season" in nan`.
raw_duration = df["duration"].fillna("")
mentions_season = raw_duration.str.contains("Season", regex=False)
leading_token = raw_duration.str.split(" ").str[0]
df["season_count"] = leading_token.where(mentions_season, "")
df["duration"] = leading_token.where(~mentions_season, "")

df.head()
Content Type on Netflix
1.df['type'].value_counts() 统计各类型出现的次数;reset_index() 将统计结果的索引还原为普通列,得到一个DataFrame
2.df.rename(columns = {col : "count", "index" : col}) 修改df的列名。
3.plotly库的基本使用分为三个模块:trace、layout、figure
trace模块主负责图表的类型和数据
layout模块主负责图表的参数设置(标题,尺寸等)
figure模块负责图表的搭建fig = go.Figure(data = [trace], layout = layout)
图表显示iplot(fig)
4.go.Pie 绘制饼状图
labels= 标签名类
values=各标签的数值
pull=各扇区从圆心向外拉出的比例(取0~1之间的数值,可以是列表,分别对应每个扇区)
(以下为条形图 go.Bar 的常用参数)
marker=dict(color='#483D8B',#设置条形图的颜色
line=dict(color='rgb(255, 255, 255)',width=1.0,)),#设置条形图边框
name='总次数',#设置这个图的名字,和图例对应
orientation='h',#如果水平条形图需设置,竖直条形图不用设置
opacity=0.9)#条形图颜色的不透明度
# Pie chart of how many titles fall under each value of the "type" column.
col = "type"

# Build a two-column frame [col, "count"].
# rename_axis + reset_index(name=...) gives the same column names on every
# pandas version. The previous rename({col: "count", "index": col}) relied on
# the pre-2.0 naming of value_counts() output and yields two "count" columns
# on pandas >= 2.0.
grouped = df[col].value_counts().rename_axis(col).reset_index(name="count")

## plot
trace = go.Pie(
    labels=grouped[col],
    values=grouped["count"],
    pull=[0.05, 0],
    marker=dict(colors=["#6ad49b", "#a678de"]),
)
layout = go.Layout(title="", height=400, legend=dict(x=0.1, y=1.1))
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
Growth in content over the years
1.d1 = df[df["type"] == "TV Show"] 将数据划分成tv和movie
2.vc1 = d1["year_added"].value_counts().reset_index() 按每年的增加量统计
3.vc1 = vc1.sort_values(col) 按年份增长排序
4.go.Scatter 线型图像
5.fig.show() 显示图像
# Line chart: number of TV shows and movies per value of "year_added".
d1 = df[df["type"] == "TV Show"]
d2 = df[df["type"] == "Movie"]

col = "year_added"

# Per-year counts for TV shows as a [col, "count", "percent"] frame.
# rename_axis + reset_index(name=...) keeps the column names stable across
# pandas versions (the old rename of "index" breaks on pandas >= 2.0).
vc1 = d1[col].value_counts().rename_axis(col).reset_index(name="count")
# Vectorised share of total; the previous lambda re-ran sum() for every row.
vc1["percent"] = 100 * vc1["count"] / vc1["count"].sum()
vc1 = vc1.sort_values(col)

# Same computation for movies.
vc2 = d2[col].value_counts().rename_axis(col).reset_index(name="count")
vc2["percent"] = 100 * vc2["count"] / vc2["count"].sum()
vc2 = vc2.sort_values(col)

trace1 = go.Scatter(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Scatter(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
layout = go.Layout(title="Content added over the years", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()
Netflix 上电影数量的增长远远高于电视节目。2018年和2019年新增电影约1300部。内容的增长始于2013年。多年来,Netflix 一直在其平台上添加不同的电影和电视节目。这些内容来自不同的国家,属于不同的类型,发行年份也各不相同。
Original Release Year of the movies
绘制按发行年份统计的电影和电视节目数量的柱状图
# Bar chart: number of TV shows (d1) and movies (d2) per original release year.
col = "release_year"

# Per-year counts as a [col, "count", "percent"] frame.
# rename_axis + reset_index(name=...) keeps the column names stable across
# pandas versions (the old rename of "index" breaks on pandas >= 2.0).
vc1 = d1[col].value_counts().rename_axis(col).reset_index(name="count")
# Vectorised share of total; the previous lambda re-ran sum() for every row.
vc1["percent"] = 100 * vc1["count"] / vc1["count"].sum()
vc1 = vc1.sort_values(col)

vc2 = d2[col].value_counts().rename_axis(col).reset_index(name="count")
vc2["percent"] = 100 * vc2["count"] / vc2["count"].sum()
vc2 = vc2.sort_values(col)

trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
trace2 = go.Bar(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#6ad49b"))
data = [trace1, trace2]
# The title previously read "Content added over the years", copied from the
# year_added chart; this figure is grouped by release_year.
layout = go.Layout(title="Content by original release year", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()
In which month, the content is added the most?
按月份统计新增的电视节目数量
1.go.Bar
# Bar chart: number of TV shows (d1) per value of "month_added".
col = 'month_added'

# Per-month counts as a [col, "count", "percent"] frame.
# rename_axis + reset_index(name=...) keeps the column names stable across
# pandas versions (the old rename of "index" breaks on pandas >= 2.0).
vc1 = d1[col].value_counts().rename_axis(col).reset_index(name="count")
# Vectorised share of total; the previous lambda re-ran sum() for every row.
vc1["percent"] = 100 * vc1["count"] / vc1["count"].sum()
vc1 = vc1.sort_values(col)

trace1 = go.Bar(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="#a678de"))
data = [trace1]
layout = go.Layout(title="In which month, the conent is added the most?", legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data, layout=layout)
fig.show()
Some of the oldest movies/TV shows on Netflix
# Order the whole catalogue by release year, oldest first (ascending is the default).
by_release = df.sort_values("release_year")

# Rows with a non-empty 'duration' value.
small = by_release[by_release['duration'] != ""]
# Show only title and release year for the first 15 rows.
small[['title', "release_year"]].head(15)

# Same listing for rows with a non-empty 'season_count' value.
small = by_release[by_release['season_count'] != ""]
small[['title', "release_year"]].head(15)
各国内容产出数量
from collections import Counter
def geoplot(ddf):
    """Count how many rows of ``ddf`` list each country.

    Each non-missing value of ``ddf['country']`` is split on ``", "`` so that
    a row naming several countries contributes one count to each of them.

    Args:
        ddf: DataFrame with a ``country`` column of comma-separated names.

    Returns:
        dict mapping country name to the number of rows that mention it.
        Previously the function fell off the end and returned ``None``, which
        left the caller's ``Counter(country_vals)`` empty.
    """
    country_with_code, country = {}, {}

    # Split entry by entry instead of join-then-split: the result is the same
    # for non-empty input, and an input with no countries no longer produces
    # a phantom "" country.
    shows_countries = Counter(
        name
        for entry in ddf['country'].dropna()
        for name in entry.split(", ")
    )

    for c, v in shows_countries.items():
        key = c.lower()
        code = ""
        if key in country_codes:
            code = country_codes[key]
        # NOTE(review): country_with_code is not read by the visible code;
        # presumably it feeds map-plotting code not shown here - confirm.
        country_with_code[code] = v
        country[c] = v

    return country
# Horizontal bar chart of the 25 most frequent entries returned by geoplot.
country_vals = geoplot(df)
tabs = Counter(country_vals).most_common(25)

# most_common() is ordered largest-first; both lists are built in reverse
# order (smallest of the 25 first), keeping labels and values aligned.
labels = [name for name, _total in reversed(tabs)]
values = [total for _name, total in reversed(tabs)]

trace1 = go.Bar(
    y=labels,
    x=values,
    orientation="h",
    name="",
    marker=dict(color="#a678de"),
)
data = [trace1]
layout = go.Layout(
    title="Countries with most content",
    height=700,
    legend=dict(x=0.1, y=1.1, orientation="h"),
)
fig = go.Figure(data, layout=layout)
fig.show()
1.df=df.dropna()#删除所有包含NaN的行。
2.[::-1]将前面的元素倒序。
3.", ".join(ddf['country'].dropna()).split(", "):join函数将括号内的元素用", "拼接成一个字符串,split再按括号内的分隔符把字符串分割成列表;split('.', 2)可指定最大分割次数。
未完待续
后续持续更新该篇