import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
1 问题一
想要了解电影数据的一些基本统计信息,例如评分的平均值、导演的数量等。
In [24]:
# Load the IMDB movie dataset from the local CSV file into a DataFrame.
movie = pd.read_csv("./data/IMDB-Movie-Data.csv")
In [25]:
# Preview the first rows of the table (head() returns 5 rows by default).
movie.head()
Out[25]:
Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi | A group of intergalactic criminals are forced … | James Gunn | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S… | 2014 | 121 | 8.1 | 757074 | 333.13 | 76.0 |
1 | 2 | Prometheus | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te… | Ridley Scott | Noomi Rapace, Logan Marshall-Green, Michael Fa… | 2012 | 124 | 7.0 | 485820 | 126.46 | 65.0 |
2 | 3 | Split | Horror,Thriller | Three girls are kidnapped by a man with a diag… | M. Night Shyamalan | James McAvoy, Anya Taylor-Joy, Haley Lu Richar… | 2016 | 117 | 7.3 | 157606 | 138.12 | 62.0 |
3 | 4 | Sing | Animation,Comedy,Family | In a city of humanoid animals, a hustling thea… | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma… | 2016 | 108 | 7.2 | 60545 | 270.32 | 59.0 |
4 | 5 | Suicide Squad | Action,Adventure,Fantasy | A secret government agency recruits some of th… | David Ayer | Will Smith, Jared Leto, Margot Robbie, Viola D… | 2016 | 123 | 6.2 | 393727 | 325.02 | 40.0 |
In [26]:
# Average of the Rating column over all movies.
movie["Rating"].mean()
Out[26]:
6.723200000000003
In [27]:
# Count the non-missing entries in the Director column.
movie["Director"].count()
Out[27]:
1000
In [28]:
# Number of distinct directors: length of the array of unique values.
len(movie["Director"].unique())
Out[28]:
644
2 问题二
对于这一组电影数据,如果我们想了解Rating、Runtime (Minutes)的分布情况,应该如何呈现数据?
In [29]:
# Quick look: histogram of the Rating column via pandas' plotting wrapper,
# using its default figure size and binning.
movie["Rating"].plot(kind ='hist')
Out[29]:
<AxesSubplot:ylabel='Frequency'>
In [30]:
# Improved version: draw the histogram on an explicit, larger canvas and
# place an x tick on every bin boundary.
# 1. Create the canvas.
plt.figure(figsize=(20, 8), dpi=100)
# 2. Draw the histogram with 20 bins.
rating = movie["Rating"]
plt.hist(rating.values, bins=20)
# 2.1 Add x-axis ticks: 21 evenly spaced points from the minimum to the
# maximum rating, which split that range into 20 equal pieces.
x_min, x_max = rating.min(), rating.max()
x1 = np.linspace(x_min, x_max, num=21)
plt.xticks(x1)
plt.grid()
plt.show()
- 电影时长
In [31]:
# Create the canvas.
plt.figure(figsize=(20, 8), dpi=100)
# Draw the histogram of movie runtimes with 20 bins.
runtime = movie["Runtime (Minutes)"]
plt.hist(runtime.values, bins=20)
# Add x-axis ticks: 21 evenly spaced points from the shortest to the
# longest runtime, which split that range into 20 equal pieces.
min_, max_ = runtime.min(), runtime.max()
x2 = np.linspace(min_, max_, num=21)
plt.xticks(x2)
plt.grid()
plt.show()
3 问题三:
对于这一组电影数据,如果我们希望统计电影不同种类(genre)的个数,应该如何处理数据?
In [32]:
# Look at the first rows again; the Genre column holds comma-separated
# genre names for each movie.
movie.head()
Out[32]:
Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi | A group of intergalactic criminals are forced … | James Gunn | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S… | 2014 | 121 | 8.1 | 757074 | 333.13 | 76.0 |
1 | 2 | Prometheus | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te… | Ridley Scott | Noomi Rapace, Logan Marshall-Green, Michael Fa… | 2012 | 124 | 7.0 | 485820 | 126.46 | 65.0 |
2 | 3 | Split | Horror,Thriller | Three girls are kidnapped by a man with a diag… | M. Night Shyamalan | James McAvoy, Anya Taylor-Joy, Haley Lu Richar… | 2016 | 117 | 7.3 | 157606 | 138.12 | 62.0 |
3 | 4 | Sing | Animation,Comedy,Family | In a city of humanoid animals, a hustling thea… | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma… | 2016 | 108 | 7.2 | 60545 | 270.32 | 59.0 |
4 | 5 | Suicide Squad | Action,Adventure,Fantasy | A secret government agency recruits some of th… | David Ayer | Will Smith, Jared Leto, Margot Robbie, Viola D… | 2016 | 123 | 6.2 | 393727 | 325.02 | 40.0 |
In [33]:
# Split each movie's comma-separated Genre string into a list of genre names.
m_g = [genre_text.split(",") for genre_text in movie["Genre"]]
In [34]:
# Show the genre lists of the first ten movies.
m_g[:10]
Out[34]:
[['Action', 'Adventure', 'Sci-Fi'],
['Adventure', 'Mystery', 'Sci-Fi'],
['Horror', 'Thriller'],
['Animation', 'Comedy', 'Family'],
['Action', 'Adventure', 'Fantasy'],
['Action', 'Adventure', 'Fantasy'],
['Comedy', 'Drama', 'Music'],
['Comedy'],
['Action', 'Adventure', 'Biography'],
['Adventure', 'Drama', 'Romance']]
In [35]:
# Flatten the per-movie genre lists and keep each distinct genre name once;
# np.unique returns the distinct values as a sorted array.
genre_un = np.unique([genre for genres in m_g for genre in genres])
In [36]:
# Display the array of distinct genre names.
genre_un
Out[36]:
array(['Action', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music',
'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport', 'Thriller',
'War', 'Western'], dtype='<U9')
In [37]:
# shape[0] is the size of the first axis. genre_un is a 1-D array, so this
# is the number of distinct genres (a 1-D array has no shape[1]).
genre_un.shape[0]
Out[37]:
20
In [38]:
# Number of rows in the movie table, i.e. the number of movies.
movie.shape[0]
Out[38]:
1000
In [39]:
# Build an all-zero table with one row per movie and one column per genre;
# the genre names are used as the column labels.
n_movies, n_genres = movie.shape[0], genre_un.shape[0]
data_gen = pd.DataFrame(np.zeros((n_movies, n_genres)), columns=genre_un)
In [40]:
# Preview the freshly created table: every cell is still 0.0.
data_gen.head()
Out[40]:
Action | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | History | Horror | Music | Musical | Mystery | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In [41]:
# For every movie, set the cells of its genres to 1 in its own row.
# df.loc[row_label, column_labels] selects by label: the row label is the
# movie's position (data_gen uses the default 0..n-1 index) and the column
# labels are that movie's list of genre names.
for i, genres in enumerate(m_g):
    data_gen.loc[i, genres] = 1
In [42]:
# Preview the table after filling: a 1.0 marks each genre a movie belongs to.
data_gen.head()
Out[42]:
Action | Adventure | Animation | Biography | Comedy | Crime | Drama | Family | Fantasy | History | Horror | Music | Musical | Mystery | Romance | Sci-Fi | Sport | Thriller | War | Western | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
In [43]:
# Count the movies in each genre (column sums of the 0/1 table) and list
# the genres from least to most frequent.
genre_counts = data_gen.sum()
genre_counts.sort_values()
Out[43]:
Musical 5.0
Western 7.0
War 13.0
Music 16.0
Sport 18.0
History 29.0
Animation 49.0
Family 51.0
Biography 81.0
Fantasy 101.0
Mystery 106.0
Horror 119.0
Sci-Fi 120.0
Romance 141.0
Crime 150.0
Thriller 195.0
Adventure 259.0
Comedy 279.0
Action 303.0
Drama 513.0
dtype: float64
In [44]:
# Bar chart of the number of movies per genre, most frequent genre first.
genre_counts_desc = data_gen.sum().sort_values(ascending=False)
genre_counts_desc.plot(kind="bar", figsize=(20, 5), fontsize=15)
Out[44]:
<AxesSubplot:>