import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as mpl
import pandas as pd
# Plot configuration: select the SimHei font for sans-serif text (the plot
# titles below contain Chinese characters) and turn off the Unicode minus
# glyph for axis tick labels.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the IMDB movie data set into a DataFrame and preview the first rows.
csv_path = "C:/Users/23608/Desktop/IMDB-Movie-Data.csv"
df = pd.read_csv(csv_path)
df.head()
Rank | Title | Genre | Description | Director | Actors | Year | Runtime (Minutes) | Rating | Votes | Revenue (Millions) | Metascore | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi | A group of intergalactic criminals are forced ... | James Gunn | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... | 2014 | 121 | 8.1 | 757074 | 333.13 | 76.0 |
1 | 2 | Prometheus | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te... | Ridley Scott | Noomi Rapace, Logan Marshall-Green, Michael Fa... | 2012 | 124 | 7.0 | 485820 | 126.46 | 65.0 |
2 | 3 | Split | Horror,Thriller | Three girls are kidnapped by a man with a diag... | M. Night Shyamalan | James McAvoy, Anya Taylor-Joy, Haley Lu Richar... | 2016 | 117 | 7.3 | 157606 | 138.12 | 62.0 |
3 | 4 | Sing | Animation,Comedy,Family | In a city of humanoid animals, a hustling thea... | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma... | 2016 | 108 | 7.2 | 60545 | 270.32 | 59.0 |
4 | 5 | Suicide Squad | Action,Adventure,Fantasy | A secret government agency recruits some of th... | David Ayer | Will Smith, Jared Leto, Margot Robbie, Viola D... | 2016 | 123 | 6.2 | 393727 | 325.02 | 40.0 |
# Extract the "Rating" column on its own as a Series.
cinema = df.loc[:, "Rating"]
cinema
0 8.1
1 7.0
2 7.3
3 7.2
4 6.2
...
995 6.2
996 5.5
997 6.2
998 5.6
999 5.3
Name: Rating, Length: 1000, dtype: float64
# Convert the rating column to a NumPy array.
movies = np.array(cinema)
# Average rating, rounded to two decimals. ndarray.mean() performs the
# reduction inside NumPy instead of walking the array element by element with
# the builtin sum() and dividing by the length by hand.
result = round(movies.mean(), 2)
result
# Pull out the two numeric columns to visualise and print a short preview.
tx1 = df["Rating"]
tx2 = df["Runtime (Minutes)"]
print("Rating",tx1.head())
print("Runtime (Minutes)",tx2.head())

# X-coordinates for both scatter plots: one position per row of the data set.
# NOTE(review): the horizontal axis is assumed to be the row position
# (0 .. number of rows - 1) - confirm this is the intended x variable.
row_positions = np.arange(len(df))

# Scatter plot of the ratings, in its own figure.
plt.subplots(1, 1, figsize=(10, 8), dpi=100)
plt.scatter(row_positions, tx1, label='Rating')
plt.title('Rating分布')
plt.legend()

# Scatter plot of the runtimes, in a second figure.
plt.subplots(1, 1, figsize=(10, 8), dpi=100)
plt.scatter(row_positions, tx2, label='Runtime (Minutes)')
plt.title('Runtime (Minutes)分布')
plt.legend()

plt.show()
# Each "Genre" cell is a comma-separated string such as
# "Action,Adventure,Sci-Fi". Flatten the column into a single list with one
# entry per (movie, genre) pair.
tx3 = df["Genre"]
temp = [genre for genres in tx3 for genre in genres.split(",")]
temp
# Keep the labels as a NumPy array; print how many labels there are in total
# and show the first five.
result = np.array(temp)
print(result.shape[0])
result[:5]
# Wrap the flat genre labels in a one-column frame so they can be grouped.
df2 = pd.DataFrame({"key": result})
# Number of occurrences of each genre label; the Series index holds the label.
df3 = df2.groupby(["key"]).size()
df3
# Labels and their counts, split apart for plotting.
x_axis = df3.index
y_axis = df3.values
x_axis
y_axis
# Pie chart of the per-genre counts, one wedge per genre label, drawn in its
# own figure.
pie_fig, pie_ax = plt.subplots(1, 1, figsize=(10, 8), dpi=100)
pie_ax.pie(y_axis, labels=x_axis)
pie_ax.set_title('不同类别的电影数量')
pie_ax.legend()