前言
原始数据可以通过我分享的资源获取
NBA–2017年数据表
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
获取数据
# Load the 2017 NBA players-with-salary table from the local CSV file.
data = pd.read_csv("./data/nba_2017_nba_players_with_salary.csv")
# Preview the first rows of the table.
data.head()
# (number of rows, number of columns) of the loaded table.
data.shape
(342, 38)
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
| Rk | AGE | MP | FG | FGA | FG% | 3P | 3PA | 3P% | 2P | ... | GP | MPG | ORPM | DRPM | RPM | WINS_RPM | PIE | PACE | W | SALARY_MILLIONS |
---|
count | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 320.000000 | 342.000000 | ... | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 | 342.000000 |
---|
mean | 217.269006 | 26.444444 | 21.572515 | 3.483626 | 7.725439 | 0.446096 | 0.865789 | 2.440058 | 0.307016 | 2.620175 | ... | 58.198830 | 21.572807 | -0.676023 | -0.005789 | -0.681813 | 2.861725 | 9.186842 | 98.341053 | 28.950292 | 7.294006 |
---|
std | 136.403138 | 4.295686 | 8.804018 | 2.200872 | 4.646933 | 0.078992 | 0.780010 | 2.021716 | 0.134691 | 1.828714 | ... | 22.282015 | 8.804121 | 2.063237 | 1.614293 | 2.522014 | 3.880914 | 3.585475 | 2.870091 | 14.603876 | 6.516326 |
---|
min | 1.000000 | 19.000000 | 2.200000 | 0.000000 | 0.800000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 2.000000 | 2.200000 | -4.430000 | -3.920000 | -6.600000 | -2.320000 | -1.600000 | 87.460000 | 0.000000 | 0.030000 |
---|
25% | 100.250000 | 23.000000 | 15.025000 | 1.800000 | 4.225000 | 0.402250 | 0.200000 | 0.800000 | 0.280250 | 1.200000 | ... | 43.500000 | 15.025000 | -2.147500 | -1.222500 | -2.422500 | 0.102500 | 7.100000 | 96.850000 | 19.000000 | 2.185000 |
---|
50% | 205.500000 | 26.000000 | 21.650000 | 3.000000 | 6.700000 | 0.442000 | 0.700000 | 2.200000 | 0.340500 | 2.200000 | ... | 66.000000 | 21.650000 | -0.990000 | -0.130000 | -1.170000 | 1.410000 | 8.700000 | 98.205000 | 29.000000 | 4.920000 |
---|
75% | 327.750000 | 29.000000 | 29.075000 | 4.700000 | 10.400000 | 0.481000 | 1.400000 | 3.600000 | 0.373500 | 3.700000 | ... | 76.000000 | 29.075000 | 0.257500 | 1.067500 | 0.865000 | 4.487500 | 10.900000 | 100.060000 | 39.000000 | 11.110000 |
---|
max | 482.000000 | 40.000000 | 37.800000 | 10.300000 | 24.000000 | 0.750000 | 4.100000 | 10.000000 | 1.000000 | 9.700000 | ... | 82.000000 | 37.800000 | 7.270000 | 6.020000 | 8.420000 | 20.430000 | 23.000000 | 109.870000 | 66.000000 | 30.960000 |
---|
8 rows × 35 columns
数据分析
数据相关性
# Per-player metrics kept for the correlation study (column order is preserved).
data_cor = data[[
    "RPM", "AGE", "SALARY_MILLIONS",
    "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF",
    "POINTS", "GP", "MPG", "ORPM", "DRPM",
]]
data_cor.head()
| RPM | AGE | SALARY_MILLIONS | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | POINTS | GP | MPG | ORPM | DRPM |
---|
0 | 6.27 | 28 | 26.50 | 1.7 | 9.0 | 10.7 | 10.4 | 1.6 | 0.4 | 5.4 | 2.3 | 31.6 | 81 | 34.6 | 6.74 | -0.47 |
---|
1 | 4.81 | 27 | 26.50 | 1.2 | 7.0 | 8.1 | 11.2 | 1.5 | 0.5 | 5.7 | 2.7 | 29.1 | 81 | 36.4 | 6.38 | -1.57 |
---|
2 | 1.83 | 27 | 6.59 | 0.6 | 2.1 | 2.7 | 5.9 | 0.9 | 0.2 | 2.8 | 2.2 | 28.9 | 76 | 33.8 | 5.72 | -3.89 |
---|
3 | 4.35 | 23 | 22.12 | 2.3 | 9.5 | 11.8 | 2.1 | 1.3 | 2.2 | 2.4 | 2.2 | 28.0 | 75 | 36.1 | 0.45 | 3.90 |
---|
4 | 4.20 | 26 | 16.96 | 2.1 | 8.9 | 11.0 | 4.6 | 1.4 | 1.3 | 3.7 | 3.9 | 27.0 | 72 | 34.2 | 3.56 | 0.64 |
---|
# Pairwise correlation matrix of the selected metrics (pandas' default method).
corr = data_cor.corr()
# Preview the first rows of the correlation matrix.
corr.head()
| RPM | AGE | SALARY_MILLIONS | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | POINTS | GP | MPG | ORPM | DRPM |
---|
RPM | 1.000000 | 0.175820 | 0.477542 | 0.388764 | 0.623515 | 0.587853 | 0.481971 | 0.599008 | 0.463097 | 0.492014 | 0.434226 | 0.604432 | 0.340810 | 0.549449 | 0.769822 | 0.578388 |
---|
AGE | 0.175820 | 1.000000 | 0.353312 | -0.015752 | 0.088859 | 0.062064 | 0.114908 | 0.069892 | -0.062917 | 0.030673 | 0.005512 | 0.031422 | 0.051863 | 0.099657 | 0.136177 | 0.100636 |
---|
SALARY_MILLIONS | 0.477542 | 0.353312 | 1.000000 | 0.264954 | 0.531569 | 0.482088 | 0.486159 | 0.446763 | 0.260288 | 0.536993 | 0.341512 | 0.635425 | 0.348093 | 0.594162 | 0.503682 | 0.102307 |
---|
ORB | 0.388764 | -0.015752 | 0.264954 | 1.000000 | 0.731345 | 0.861103 | -0.011632 | 0.169075 | 0.654265 | 0.274670 | 0.557957 | 0.284908 | 0.296975 | 0.342140 | 0.102113 | 0.476857 |
---|
DRB | 0.623515 | 0.088859 | 0.531569 | 0.731345 | 1.000000 | 0.976244 | 0.350786 | 0.485726 | 0.660733 | 0.598043 | 0.670708 | 0.648267 | 0.473376 | 0.684662 | 0.428433 | 0.426536 |
---|
# Render the correlation matrix as a heatmap on a wide 100-dpi figure.
plt.figure(figsize=(20, 8), dpi=100)
# annot=True writes each coefficient into its cell; square=True keeps cells square.
sns.heatmap(corr, square=True, linewidths=0.1, annot=True)

基本数据排名分析
# Players with the highest RPM, shown with their age.
data[["PLAYER", "RPM", "AGE"]].sort_values(by="RPM", ascending=False).head()
| PLAYER | RPM | AGE |
---|
6 | LeBron James | 8.42 | 32 |
---|
37 | Chris Paul | 7.92 | 31 |
---|
8 | Stephen Curry | 7.41 | 28 |
---|
120 | Draymond Green | 7.14 | 26 |
---|
7 | Kawhi Leonard | 7.08 | 25 |
---|
# Highest-paid players, shown with their RPM and age.
data[["PLAYER", "RPM", "AGE", "SALARY_MILLIONS"]].sort_values(
    by="SALARY_MILLIONS", ascending=False
).head()
| PLAYER | RPM | AGE | SALARY_MILLIONS |
---|
6 | LeBron James | 8.42 | 32 | 30.96 |
---|
25 | Mike Conley | 4.47 | 29 | 26.54 |
---|
67 | Al Horford | 1.82 | 30 | 26.54 |
---|
0 | Russell Westbrook | 6.27 | 28 | 26.50 |
---|
1 | James Harden | 4.81 | 27 | 26.50 |
---|
Seaborn常用的三个数据可视化方法
单变量
# Univariate distributions of salary, RPM and age, stacked in three panels.
# sns.distplot is deprecated (seaborn >= 0.11); a density-scaled histplot with
# a KDE overlay is the documented replacement for the same picture.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10))
for panel, (column, label) in enumerate(
    [("SALARY_MILLIONS", "salary"), ("RPM", "RPM"), ("AGE", "AGE")],
    start=1,
):
    plt.subplot(3, 1, panel)
    sns.histplot(data[column], kde=True, stat="density")
    # Same per-panel label text as before, naming the plotted variable.
    plt.ylabel(label)

双变量
# Bivariate view: hexbin joint distribution of age (x) against salary (y).
# x and y are passed by keyword because seaborn >= 0.12 rejects positional
# data vectors here; the keyword form also works on older releases.
sns.jointplot(x=data.AGE, y=data.SALARY_MILLIONS, kind="hex")

多变量
# Four metrics compared against each other in the multivariate plot.
multi_data = data[["RPM", "SALARY_MILLIONS", "AGE", "POINTS"]]
multi_data.head()
| RPM | SALARY_MILLIONS | AGE | POINTS |
---|
0 | 6.27 | 26.50 | 28 | 31.6 |
---|
1 | 4.81 | 26.50 | 27 | 29.1 |
---|
2 | 1.83 | 6.59 | 27 | 28.9 |
---|
3 | 4.35 | 22.12 | 23 | 28.0 |
---|
4 | 4.20 | 16.96 | 26 | 27.0 |
---|
# Grid of pairwise relationships between the four selected metrics.
sns.pairplot(multi_data)

衍生变量的一些可视化实践-以年龄为例
def age_cut(df, young_max=24, old_min=30):
    """Classify a player record into an age bracket.

    Args:
        df: Any object exposing a numeric ``AGE`` attribute, e.g. one row of
            the players DataFrame when called via ``DataFrame.apply(axis=1)``.
        young_max: Highest age (inclusive) still labelled ``"young"``.
        old_min: Lowest age (inclusive) labelled ``"old"``.

    Returns:
        ``"young"`` if ``AGE <= young_max``, ``"old"`` if ``AGE >= old_min``,
        otherwise ``"best"``.
    """
    age = df.AGE
    if age <= young_max:
        return "young"
    if age >= old_min:
        return "old"
    return "best"
# Label every player with an age bracket by applying age_cut to each row.
data["age_cut"] = data.apply(age_cut, axis=1)
data.head()
| Rk | PLAYER | POSITION | AGE | MP | FG | FGA | FG% | 3P | 3PA | ... | MPG | ORPM | DRPM | RPM | WINS_RPM | PIE | PACE | W | SALARY_MILLIONS | age_cut |
---|
0 | 1 | Russell Westbrook | PG | 28 | 34.6 | 10.2 | 24.0 | 0.425 | 2.5 | 7.2 | ... | 34.6 | 6.74 | -0.47 | 6.27 | 17.34 | 23.0 | 102.31 | 46 | 26.50 | best |
---|
1 | 2 | James Harden | PG | 27 | 36.4 | 8.3 | 18.9 | 0.440 | 3.2 | 9.3 | ... | 36.4 | 6.38 | -1.57 | 4.81 | 15.54 | 19.0 | 102.98 | 54 | 26.50 | best |
---|
2 | 3 | Isaiah Thomas | PG | 27 | 33.8 | 9.0 | 19.4 | 0.463 | 3.2 | 8.5 | ... | 33.8 | 5.72 | -3.89 | 1.83 | 8.19 | 16.1 | 99.84 | 51 | 6.59 | best |
---|
3 | 4 | Anthony Davis | C | 23 | 36.1 | 10.3 | 20.3 | 0.505 | 0.5 | 1.8 | ... | 36.1 | 0.45 | 3.90 | 4.35 | 12.81 | 19.2 | 100.19 | 31 | 22.12 | young |
---|
4 | 6 | DeMarcus Cousins | C | 26 | 34.2 | 9.0 | 19.9 | 0.452 | 1.8 | 5.0 | ... | 34.2 | 3.56 | 0.64 | 4.20 | 11.26 | 17.8 | 97.11 | 30 | 16.96 | best |
---|
5 rows × 39 columns
# Constant helper column: every row gets the value 1.
data["cut"] = 1
# Salaries of the first few players whose age bracket is "best".
data.loc[data["age_cut"] == "best", "SALARY_MILLIONS"].head()
0 26.50
1 26.50
2 6.59
4 16.96
5 24.33
Name: SALARY_MILLIONS, dtype: float64
# Scatter of salary (x) against RPM (y), one series per age bracket.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10), dpi=100)
plt.title("RPM and Salary")

x1 = data.loc[data.age_cut == "old"].SALARY_MILLIONS
y1 = data.loc[data.age_cut == "old"].RPM
plt.plot(x1, y1, "^", label="old")

x2 = data.loc[data.age_cut == "best"].SALARY_MILLIONS
y2 = data.loc[data.age_cut == "best"].RPM
plt.plot(x2, y2, "^", label="best")

x3 = data.loc[data.age_cut == "young"].SALARY_MILLIONS
y3 = data.loc[data.age_cut == "young"].RPM
plt.plot(x3, y3, ".", label="young")

# Axis labels and a legend: without them the three series are only
# distinguishable by colour and the reader cannot tell which bracket is which.
plt.xlabel("SALARY_MILLIONS")
plt.ylabel("RPM")
plt.legend(title="age_cut")

# Box-score metrics plus the age bracket, which colours the pairwise grid.
multi_data2 = data[["RPM", "POINTS", "TRB", "AST", "STL", "BLK", "age_cut"]]
sns.pairplot(multi_data2, hue="age_cut")

球队数据分析
球队薪资排行
# Highest salary within each age bracket.
# The string alias replaces np.max: recent pandas warns when NumPy reducers are
# passed to groupby.agg and asks for the string name to keep the same result.
data.groupby(by="age_cut").agg({"SALARY_MILLIONS": "max"})
| SALARY_MILLIONS |
---|
age_cut | |
---|
best | 26.54 |
---|
old | 30.96 |
---|
young | 22.12 |
---|
# Mean player salary per team, then the ten best-paying teams.
# "mean" replaces np.mean: recent pandas warns when NumPy reducers are passed
# to groupby.agg and asks for the string name to keep the same result.
data_team = data.groupby(by="TEAM").agg({"SALARY_MILLIONS": "mean"})
data_team.sort_values(by="SALARY_MILLIONS", ascending=False).head(10)
| SALARY_MILLIONS |
---|
TEAM | |
---|
CLE | 17.095000 |
---|
HOU | 13.432000 |
---|
GS | 12.701429 |
---|
ORL/TOR | 11.125000 |
---|
POR | 9.730000 |
---|
WSH | 9.628889 |
---|
ORL | 9.490000 |
---|
MIL/CHA | 9.425000 |
---|
SA | 9.347273 |
---|
NO/SAC | 8.970000 |
---|
# Per (team, age bracket): mean salary, mean RPM and number of players.
# String aliases replace np.mean / np.size: recent pandas warns when NumPy
# reducers are passed to groupby.agg and asks for the string names instead.
data_rpm = data.groupby(by=["TEAM", "age_cut"]).agg(
    {"SALARY_MILLIONS": "mean", "RPM": "mean", "PLAYER": "size"}
)
# NOTE(review): sort_values returns a new frame that is neither assigned nor
# the last expression here, so data_rpm itself stays in group order and the
# preview below is unsorted. Assign the result if a sorted table is intended.
data_rpm.sort_values(by=["PLAYER", "RPM"], ascending=False).head()
data_rpm.head()
| | SALARY_MILLIONS | RPM | PLAYER |
---|
TEAM | age_cut | | | |
---|
ATL | best | 4.678000 | -1.768000 | 5 |
---|
old | 12.775000 | 0.982500 | 4 |
---|
young | 1.926667 | -3.076667 | 3 |
---|
ATL/CLE | old | 5.040000 | -2.485000 | 2 |
---|
ATL/PHI/OKC | best | 8.400000 | 1.720000 | 1 |
---|
按照球队综合实力排名
# Team-level summary: mean salary, RPM, points, eFG%, minutes and age, plus
# the number of players per team. TEAM stays a regular column (as_index=False).
# String aliases replace np.mean / np.size: recent pandas warns when NumPy
# reducers are passed to groupby.agg and asks for the string names instead.
data_rpm2 = data.groupby(by=["TEAM"], as_index=False).agg(
    {
        "SALARY_MILLIONS": "mean",
        "RPM": "mean",
        "PLAYER": "size",
        "POINTS": "mean",
        "eFG%": "mean",
        "MPG": "mean",
        "AGE": "mean",
    }
)
data_rpm2.head()
| TEAM | SALARY_MILLIONS | RPM | PLAYER | POINTS | eFG% | MPG | AGE |
---|
0 | ATL | 6.689167 | -1.178333 | 12 | 7.416667 | 0.442667 | 18.541667 | 27.000000 |
---|
1 | ATL/CLE | 5.040000 | -2.485000 | 2 | 7.650000 | 0.582000 | 21.050000 | 35.500000 |
---|
2 | ATL/PHI/OKC | 8.400000 | 1.720000 | 1 | 13.100000 | 0.511000 | 26.100000 | 29.000000 |
---|
3 | BKN | 5.704545 | -1.224545 | 11 | 9.045455 | 0.487273 | 20.227273 | 27.636364 |
---|
4 | BKN/WSH | 4.910000 | -4.045000 | 2 | 8.150000 | 0.470000 | 17.350000 | 27.000000 |
---|
# Teams ranked by their RPM column, highest first; show the top rows.
data_rpm2.sort_values(by="RPM", ascending=False).head()
| TEAM | SALARY_MILLIONS | RPM | PLAYER | POINTS | eFG% | MPG | AGE |
---|
18 | GS | 12.701429 | 3.478571 | 7 | 14.528571 | 0.575143 | 26.700000 | 28.714286 |
---|
9 | CLE | 17.095000 | 2.566667 | 6 | 15.883333 | 0.555833 | 29.766667 | 28.000000 |
---|
2 | ATL/PHI/OKC | 8.400000 | 1.720000 | 1 | 13.100000 | 0.511000 | 26.100000 | 29.000000 |
---|
20 | HOU | 13.432000 | 1.582000 | 5 | 15.420000 | 0.534600 | 29.980000 | 27.200000 |
---|
44 | SA | 9.347273 | 0.901818 | 11 | 9.818182 | 0.524182 | 21.472727 | 29.545455 |
---|
利用箱线图和小提琴图进行数据分析
# Boolean mask preview: True where the player's TEAM is one of the ten listed teams.
data.TEAM.isin(['GS', 'CLE', 'SA', 'LAC',
'OKC', 'UTAH', 'CHA', 'TOR', 'NO', 'BOS']).head()
0 True
1 False
2 True
3 True
4 False
Name: TEAM, dtype: bool
# Box plots of salary, age and minutes per game for ten selected teams.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
# Keep only the players whose TEAM is in the selection.
data_team2 = data.loc[
    data["TEAM"].isin(
        ["GS", "CLE", "SA", "LAC", "OKC", "UTAH", "CHA", "TOR", "NO", "BOS"]
    )
]
# One stacked panel per metric, teams along the x axis.
for row, metric in enumerate(("SALARY_MILLIONS", "AGE", "MPG"), start=1):
    plt.subplot(3, 1, row)
    sns.boxplot(x="TEAM", y=metric, data=data_team2)

# Violin plots of three-point %, effective FG % and points for the teams
# held in data_team2, one stacked panel per metric.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
for row, metric in enumerate(("3P%", "eFG%", "POINTS"), start=1):
    plt.subplot(3, 1, row)
    sns.violinplot(x="TEAM", y=metric, data=data_team2)
