前言
原始数据可以通过我分享的资源获取:NBA 2017 年数据表。
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
获取数据
# Load the player/salary table from the local ./data directory into a DataFrame.
data = pd. read_csv( "./data/nba_2017_nba_players_with_salary.csv" )
# Notebook-style bare expression: preview the first rows.
data. head( )
# (rows, columns) of the loaded frame.
data. shape
(342, 38)
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data. describe( )
Rk AGE MP FG FGA FG% 3P 3PA 3P% 2P ... GP MPG ORPM DRPM RPM WINS_RPM PIE PACE W SALARY_MILLIONS count 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 320.000000 342.000000 ... 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 342.000000 mean 217.269006 26.444444 21.572515 3.483626 7.725439 0.446096 0.865789 2.440058 0.307016 2.620175 ... 58.198830 21.572807 -0.676023 -0.005789 -0.681813 2.861725 9.186842 98.341053 28.950292 7.294006 std 136.403138 4.295686 8.804018 2.200872 4.646933 0.078992 0.780010 2.021716 0.134691 1.828714 ... 22.282015 8.804121 2.063237 1.614293 2.522014 3.880914 3.585475 2.870091 14.603876 6.516326 min 1.000000 19.000000 2.200000 0.000000 0.800000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 2.000000 2.200000 -4.430000 -3.920000 -6.600000 -2.320000 -1.600000 87.460000 0.000000 0.030000 25% 100.250000 23.000000 15.025000 1.800000 4.225000 0.402250 0.200000 0.800000 0.280250 1.200000 ... 43.500000 15.025000 -2.147500 -1.222500 -2.422500 0.102500 7.100000 96.850000 19.000000 2.185000 50% 205.500000 26.000000 21.650000 3.000000 6.700000 0.442000 0.700000 2.200000 0.340500 2.200000 ... 66.000000 21.650000 -0.990000 -0.130000 -1.170000 1.410000 8.700000 98.205000 29.000000 4.920000 75% 327.750000 29.000000 29.075000 4.700000 10.400000 0.481000 1.400000 3.600000 0.373500 3.700000 ... 76.000000 29.075000 0.257500 1.067500 0.865000 4.487500 10.900000 100.060000 39.000000 11.110000 max 482.000000 40.000000 37.800000 10.300000 24.000000 0.750000 4.100000 10.000000 1.000000 9.700000 ... 82.000000 37.800000 7.270000 6.020000 8.420000 20.430000 23.000000 109.870000 66.000000 30.960000
8 rows × 35 columns
数据分析
数据相关性
# Columns selected for the correlation study.
corr_columns = [
    'RPM', 'AGE', 'SALARY_MILLIONS', 'ORB',
    'DRB', 'TRB', 'AST', 'STL',
    'BLK', 'TOV', 'PF',
    'POINTS', 'GP', 'MPG', 'ORPM', 'DRPM',
]
# All rows, restricted to the columns above.
data_cor = data.loc[:, corr_columns]
data_cor.head()
RPM AGE SALARY_MILLIONS ORB DRB TRB AST STL BLK TOV PF POINTS GP MPG ORPM DRPM 0 6.27 28 26.50 1.7 9.0 10.7 10.4 1.6 0.4 5.4 2.3 31.6 81 34.6 6.74 -0.47 1 4.81 27 26.50 1.2 7.0 8.1 11.2 1.5 0.5 5.7 2.7 29.1 81 36.4 6.38 -1.57 2 1.83 27 6.59 0.6 2.1 2.7 5.9 0.9 0.2 2.8 2.2 28.9 76 33.8 5.72 -3.89 3 4.35 23 22.12 2.3 9.5 11.8 2.1 1.3 2.2 2.4 2.2 28.0 75 36.1 0.45 3.90 4 4.20 26 16.96 2.1 8.9 11.0 4.6 1.4 1.3 3.7 3.9 27.0 72 34.2 3.56 0.64
# Pairwise correlation matrix of the selected columns (pandas' default method).
corr = data_cor. corr( )
# Preview the first rows of the matrix.
corr. head( )
RPM AGE SALARY_MILLIONS ORB DRB TRB AST STL BLK TOV PF POINTS GP MPG ORPM DRPM RPM 1.000000 0.175820 0.477542 0.388764 0.623515 0.587853 0.481971 0.599008 0.463097 0.492014 0.434226 0.604432 0.340810 0.549449 0.769822 0.578388 AGE 0.175820 1.000000 0.353312 -0.015752 0.088859 0.062064 0.114908 0.069892 -0.062917 0.030673 0.005512 0.031422 0.051863 0.099657 0.136177 0.100636 SALARY_MILLIONS 0.477542 0.353312 1.000000 0.264954 0.531569 0.482088 0.486159 0.446763 0.260288 0.536993 0.341512 0.635425 0.348093 0.594162 0.503682 0.102307 ORB 0.388764 -0.015752 0.264954 1.000000 0.731345 0.861103 -0.011632 0.169075 0.654265 0.274670 0.557957 0.284908 0.296975 0.342140 0.102113 0.476857 DRB 0.623515 0.088859 0.531569 0.731345 1.000000 0.976244 0.350786 0.485726 0.660733 0.598043 0.670708 0.648267 0.473376 0.684662 0.428433 0.426536
# Open a 20x8 figure at 100 dpi; the heatmap below draws into it.
plt. figure( figsize= ( 20 , 8 ) , dpi= 100 )
# Render the correlation matrix with square cells, thin cell separators and the value printed in each cell.
sns. heatmap( corr, square= True , linewidths= 0.1 , annot= True )
基本数据排名分析
# Five players with the highest RPM, shown with their age.
rpm_view = data.loc[:, ["PLAYER", "RPM", "AGE"]]
rpm_view.sort_values(by="RPM", ascending=False).head()
PLAYER RPM AGE 6 LeBron James 8.42 32 37 Chris Paul 7.92 31 8 Stephen Curry 7.41 28 120 Draymond Green 7.14 26 7 Kawhi Leonard 7.08 25
# Five highest-paid players, shown with their RPM and age.
salary_view = data.loc[:, ["PLAYER", "RPM", "AGE", "SALARY_MILLIONS"]]
salary_view.sort_values(by="SALARY_MILLIONS", ascending=False).head()
PLAYER RPM AGE SALARY_MILLIONS 6 LeBron James 8.42 32 30.96 25 Mike Conley 4.47 29 26.54 67 Al Horford 1.82 30 26.54 0 Russell Westbrook 6.27 28 26.50 1 James Harden 4.81 27 26.50
Seaborn常用的三个数据可视化方法
单变量
# Univariate distributions of salary, RPM and age, stacked as three rows.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot
# with a KDE overlay on a density scale is its documented replacement.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10))
for row, (column, label) in enumerate(
    (("SALARY_MILLIONS", "salary"), ("RPM", "RPM"), ("AGE", "AGE")), start=1
):
    plt.subplot(3, 1, row)
    sns.histplot(data[column], kde=True, stat="density", kde_kws={"cut": 3})
    plt.ylabel(label)
双变量
# Hex-binned joint distribution of age against salary.
# x and y are passed by keyword: recent seaborn releases reject them as
# positional arguments, while the keyword form works on old and new versions.
sns.jointplot(x=data.AGE, y=data.SALARY_MILLIONS, kind="hex")
多变量
# Four-column subset used for the multivariate view.
multi_columns = ['RPM', 'SALARY_MILLIONS', 'AGE', 'POINTS']
multi_data = data.loc[:, multi_columns]
multi_data.head()
RPM SALARY_MILLIONS AGE POINTS 0 6.27 26.50 28 31.6 1 4.81 26.50 27 29.1 2 1.83 6.59 27 28.9 3 4.35 22.12 23 28.0 4 4.20 16.96 26 27.0
# Grid of pairwise plots for every pair of columns in multi_data.
sns. pairplot( multi_data)
衍生变量的一些可视化实践——以年龄为例
def age_cut(df):
    """Classify a player record into an age bracket.

    Args:
        df: A single record exposing an ``AGE`` attribute, e.g. one
            DataFrame row when applied with ``axis=1``.

    Returns:
        ``"young"`` when ``AGE <= 24``, ``"old"`` when ``AGE >= 30``,
        otherwise ``"best"``.
    """
    if df.AGE <= 24:
        return "young"
    if df.AGE >= 30:
        return "old"
    return "best"
# Label every row with its age bracket. age_cut is passed directly; wrapping
# it in a lambda that only forwards its argument adds nothing.
data["age_cut"] = data.apply(age_cut, axis=1)
data.head()
Rk PLAYER POSITION AGE MP FG FGA FG% 3P 3PA ... MPG ORPM DRPM RPM WINS_RPM PIE PACE W SALARY_MILLIONS age_cut 0 1 Russell Westbrook PG 28 34.6 10.2 24.0 0.425 2.5 7.2 ... 34.6 6.74 -0.47 6.27 17.34 23.0 102.31 46 26.50 best 1 2 James Harden PG 27 36.4 8.3 18.9 0.440 3.2 9.3 ... 36.4 6.38 -1.57 4.81 15.54 19.0 102.98 54 26.50 best 2 3 Isaiah Thomas PG 27 33.8 9.0 19.4 0.463 3.2 8.5 ... 33.8 5.72 -3.89 1.83 8.19 16.1 99.84 51 6.59 best 3 4 Anthony Davis C 23 36.1 10.3 20.3 0.505 0.5 1.8 ... 36.1 0.45 3.90 4.35 12.81 19.2 100.19 31 22.12 young 4 6 DeMarcus Cousins C 26 34.2 9.0 19.9 0.452 1.8 5.0 ... 34.2 3.56 0.64 4.20 11.26 17.8 97.11 30 16.96 best
5 rows × 39 columns
# Add a constant column "cut" equal to 1 on every row.
# NOTE(review): "cut" is not read anywhere in the visible part of this file — confirm it is needed.
data[ "cut" ] = 1
# Salaries of the first five rows whose age bracket is "best".
data. loc[ data. age_cut == "best" ] . SALARY_MILLIONS. head( )
0 26.50
1 26.50
2 6.59
4 16.96
5 24.33
Name: SALARY_MILLIONS, dtype: float64
# RPM against salary, one series per age bracket.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10), dpi=100)
plt.title("RPM and Salary")
# (bracket, marker) pairs in the original drawing order, so each series keeps
# the same default colour. Each bracket is filtered once instead of twice.
for bracket, marker in (("old", "^"), ("best", "^"), ("young", ".")):
    group = data.loc[data.age_cut == bracket]
    plt.plot(group.SALARY_MILLIONS, group.RPM, marker, label=bracket)
# "old" and "best" share a marker shape, so a legend is needed to tell the
# series apart; the axes are labelled with the plotted columns.
plt.xlabel("SALARY_MILLIONS")
plt.ylabel("RPM")
plt.legend(title="age_cut")
# Performance columns plus the age bracket, which colours the pair plot.
pair_columns = ['RPM', 'POINTS', 'TRB', 'AST', 'STL', 'BLK', 'age_cut']
multi_data2 = data.loc[:, pair_columns]
sns.pairplot(multi_data2, hue="age_cut")
球队数据分析
球队薪资排行
# Highest salary within each age bracket. The aggregation is named by string
# ("max") because passing np.max to agg triggers a FutureWarning on recent pandas.
data.groupby(by="age_cut").agg({"SALARY_MILLIONS": "max"})
SALARY_MILLIONS age_cut best 26.54 old 30.96 young 22.12
# Mean salary per TEAM value. "mean" is given as a string because passing
# np.mean to agg triggers a FutureWarning on recent pandas.
data_team = data.groupby(by="TEAM").agg({"SALARY_MILLIONS": "mean"})
# Ten TEAM values with the highest mean salary.
data_team.sort_values(by="SALARY_MILLIONS", ascending=False).head(10)
SALARY_MILLIONS TEAM CLE 17.095000 HOU 13.432000 GS 12.701429 ORL/TOR 11.125000 POR 9.730000 WSH 9.628889 ORL 9.490000 MIL/CHA 9.425000 SA 9.347273 NO/SAC 8.970000
# Per (TEAM, age bracket): mean salary, mean RPM and number of rows.
# String aggregation names replace np.mean / np.size, which trigger a
# FutureWarning when passed to agg on recent pandas.
data_rpm = data.groupby(by=["TEAM", "age_cut"]).agg(
    {"SALARY_MILLIONS": "mean", "RPM": "mean", "PLAYER": "size"}
)
# NOTE: sort_values returns a new frame and this result is not assigned,
# so data_rpm itself keeps its group order.
data_rpm.sort_values(by=["PLAYER", "RPM"], ascending=False).head()
data_rpm.head()
SALARY_MILLIONS RPM PLAYER TEAM age_cut ATL best 4.678000 -1.768000 5 old 12.775000 0.982500 4 young 1.926667 -3.076667 3 ATL/CLE old 5.040000 -2.485000 2 ATL/PHI/OKC best 8.400000 1.720000 1
按照球队综合实力排名
# Per-TEAM summary: row count under PLAYER, mean of every other listed column.
# TEAM stays a regular column (as_index=False). String aggregation names
# replace np.mean / np.size, which trigger a FutureWarning when passed to agg
# on recent pandas.
data_rpm2 = data.groupby(by=['TEAM'], as_index=False).agg(
    {
        'SALARY_MILLIONS': "mean",
        'RPM': "mean",
        'PLAYER': "size",
        'POINTS': "mean",
        'eFG%': "mean",
        'MPG': "mean",
        'AGE': "mean",
    }
)
data_rpm2.head()
TEAM SALARY_MILLIONS RPM PLAYER POINTS eFG% MPG AGE 0 ATL 6.689167 -1.178333 12 7.416667 0.442667 18.541667 27.000000 1 ATL/CLE 5.040000 -2.485000 2 7.650000 0.582000 21.050000 35.500000 2 ATL/PHI/OKC 8.400000 1.720000 1 13.100000 0.511000 26.100000 29.000000 3 BKN 5.704545 -1.224545 11 9.045455 0.487273 20.227273 27.636364 4 BKN/WSH 4.910000 -4.045000 2 8.150000 0.470000 17.350000 27.000000
# Five TEAM groups with the highest RPM value in data_rpm2.
data_rpm2. sort_values( by= "RPM" , ascending= False ) . head( )
TEAM SALARY_MILLIONS RPM PLAYER POINTS eFG% MPG AGE 18 GS 12.701429 3.478571 7 14.528571 0.575143 26.700000 28.714286 9 CLE 17.095000 2.566667 6 15.883333 0.555833 29.766667 28.000000 2 ATL/PHI/OKC 8.400000 1.720000 1 13.100000 0.511000 26.100000 29.000000 20 HOU 13.432000 1.582000 5 15.420000 0.534600 29.980000 27.200000 44 SA 9.347273 0.901818 11 9.818182 0.524182 21.472727 29.545455
利用箱线图和小提琴图进行数据分析
# Boolean mask, True where TEAM is one of the ten listed codes (first five rows shown).
data. TEAM. isin( [ 'GS' , 'CLE' , 'SA' , 'LAC' ,
'OKC' , 'UTAH' , 'CHA' , 'TOR' , 'NO' , 'BOS' ] ) . head( )
0 True
1 False
2 True
3 True
4 False
Name: TEAM, dtype: bool
# Rows belonging to the ten selected team codes.
selected_teams = ['GS', 'CLE', 'SA', 'LAC', 'OKC', 'UTAH', 'CHA', 'TOR', 'NO', 'BOS']
team_mask = data.TEAM.isin(selected_teams)
data_team2 = data[team_mask]

# Per-team box plots of three columns, stacked as rows in one 20x10 figure.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
for row, metric in enumerate(("SALARY_MILLIONS", "AGE", "MPG"), start=1):
    plt.subplot(3, 1, row)
    sns.boxplot(x="TEAM", y=metric, data=data_team2)
# Per-team violin plots of three columns, stacked as rows in one 20x10 figure.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
for row, metric in enumerate(("3P%", "eFG%", "POINTS"), start=1):
    plt.subplot(3, 1, row)
    sns.violinplot(x="TEAM", y=metric, data=data_team2)