文章目录
1、seaborn绘制单变量分布
import pandas as pd
import seaborn as sns,numpy as np
# Fix the seed so the 100 standard-normal samples are reproducible.
np.random.seed(0)
x = np.random.randn(100)
# distplot is deprecated in seaborn; histplot with a KDE overlay on a density
# axis is the documented replacement for its histogram-plus-curve output.
ax = sns.histplot(x, bins=10, kde=True, stat="density")
2、绘制双变量分布图形
import pandas as pd
import seaborn as sns,numpy as np
# Two columns of 500 samples each, drawn independently with randn.
df = pd.DataFrame({"x": np.random.randn(500), "y": np.random.randn(500)})
df.head()
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the old positional form raises a TypeError.
sns.jointplot(x="x", y="y", data=df)
sns.jointplot(x="x", y="y", data=df, kind="hex")
# Kernel density estimate version of the same joint plot.
sns.jointplot(x="x", y="y", data=df, kind="kde")
# Pairwise relationships across the columns of the iris example dataset.
dataset = sns.load_dataset("iris")
dataset.head()
sns.pairplot(dataset)
3、类别散点图
import seaborn as sns,numpy as np
# The "tips" example dataset is reused by the categorical plots below.
data = sns.load_dataset("tips")
data.head()
sns.stripplot(x="day", y="total_bill", data=data, hue="time")  #1
# jitter=True is spelled out explicitly here; it is also stripplot's default.
sns.stripplot(x="day", y="total_bill", data=data, hue="time", jitter=True)  #2
# x and y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
sns.swarmplot(x="day", y="total_bill", data=data)  #3
#1
#2
#3
4、类别内的数据分布
4.1、箱型图
# Box plots of total_bill per day, split by time.
# x and y are keywords because seaborn >= 0.12 accepts only `data` positionally.
sns.boxplot(x="day", y="total_bill", data=data, hue="time")
# Same plot with an explicit green/red palette and saturation 0.9.
sns.boxplot(
    x="day",
    y="total_bill",
    data=data,
    hue="time",
    palette=["g", "r"],
    saturation=0.9,
)
4.2、提琴图
# Violin plot of total_bill per day, split by time.
# x and y are keywords because seaborn >= 0.12 accepts only `data` positionally.
sns.violinplot(x="day", y="total_bill", data=data, hue="time")
5、类别内统计估计
5.1、绘制条形图
# Bar plot of total_bill per day. The original line had lost its
# `sns.barplot(...)` call wrapper and was not valid Python.
sns.barplot(x="day", y="total_bill", data=data)
5.2、绘制点图
# Point plot of total_bill for each day of the tips data.
sns.pointplot(
    data=data,
    x="day",
    y="total_bill",
)
6、NBA数据获取和相关性基本分析
6.1、基础数据指标
6.2、指标相关性分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the 2017 NBA players-with-salary table from the local CSV file.
data = pd.read_csv(
    "D:\\data\\seaborn-data-master\\nba_2017_nba_players_with_salary.csv"
)
data.head()
data.shape
data.describe()

# Correlation analysis: keep only the columns of interest.
data_cor = data[
    [
        'RPM', 'AGE', 'SALARY_MILLIONS', 'ORB', 'DRB', 'TRB',
        'AST', 'STL', 'BLK', 'TOV', 'PF', 'POINTS', 'GP', 'MPG', 'ORPM', 'DRPM',
    ]
]
data_cor.head()

# Pairwise correlation matrix of the selected columns.
corr = data_cor.corr()
corr.head()

# Annotated heatmap of the correlation matrix with square cells.
plt.figure(dpi=100, figsize=(20, 8))
sns.heatmap(corr, annot=True, linewidths=0.1, square=True)
6.3、基本分析
基本数据按照效率值排名分析
# Player, RPM and age columns, ordered by RPM from highest to lowest.
data[["PLAYER", "RPM", "AGE"]].sort_values("RPM", ascending=False)
按照球员薪资排名分析
# Rank players by salary, highest first. The original sorted by RPM, which
# duplicated the previous ranking instead of ranking by SALARY_MILLIONS.
data.loc[:, ["PLAYER", "RPM", "AGE", "SALARY_MILLIONS"]].sort_values(
    by="SALARY_MILLIONS", ascending=False
)
6.4、可视化
6.4.1、单变量
# Distribution of player salary, efficiency value (RPM) and age,
# drawn as three stacked panels in one figure.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10))
# (column to plot, y-axis label) for each panel, top to bottom.
panels = [("SALARY_MILLIONS", "salary"), ("RPM", "RPM"), ("AGE", "AGE")]
for position, (column, label) in enumerate(panels, start=1):
    plt.subplot(3, 1, position)
    # histplot + KDE on a density axis replaces the deprecated distplot.
    sns.histplot(data[column], kde=True, stat="density")
    plt.ylabel(label)
# 输出:Text(0,0.5,'AGE')
6.4.2、双变量
# Hexbin joint plot of age against salary. x and y are given by keyword
# because seaborn >= 0.12 accepts only `data` as a positional argument.
sns.jointplot(x="AGE", y="SALARY_MILLIONS", data=data, kind="hex")
6.4.3、多变量
# Pairwise plots across RPM, salary, age and points.
muti_data = data[["RPM", "SALARY_MILLIONS", "AGE", "POINTS"]]
muti_data.head()
sns.pairplot(muti_data)
6.5、衍生变量可视化
def age_cut(df, young_max=24, old_min=30):
    """Classify one player row into an age bracket.

    Args:
        df: Row-like object exposing an ``AGE`` attribute, such as the
            Series handed over by ``DataFrame.apply(..., axis=1)``.
        young_max: Oldest age still labelled ``"young"`` (inclusive).
        old_min: Youngest age labelled ``"old"`` (inclusive).

    Returns:
        ``"young"`` when ``AGE <= young_max``, ``"old"`` when
        ``AGE >= old_min``, and ``"best"`` for every age in between.
    """
    age = df.AGE
    if age <= young_max:
        return "young"
    if age >= old_min:
        return "old"
    return "best"
# Label every player row with its age bracket; age_cut is passed directly
# because wrapping it in a lambda added nothing.
data["age_cut"] = data.apply(age_cut, axis=1)
data.head()
# Constant column of ones, kept to make counting convenient
# (the original assigned it twice).
data["cut"] = 1
# Peek at the salaries of the players in the "best" bracket.
data.loc[data.age_cut == "best"].SALARY_MILLIONS.head()
# Salary (x) against RPM (y), one marker series per age bracket.
sns.set_style("darkgrid")
plt.figure(figsize=(10, 10), dpi=100)
plt.title("RPM and Salary")
# Brackets are drawn in the original order (old, best, young) so each series
# keeps the same position in matplotlib's colour cycle.
for bracket in ("old", "best", "young"):
    group = data.loc[data.age_cut == bracket]
    plt.plot(group.SALARY_MILLIONS, group.RPM, "^", label=bracket)
# Axis labels and a legend make the three series distinguishable.
plt.xlabel("SALARY_MILLIONS")
plt.ylabel("RPM")
plt.legend()
# Pairwise plots of the selected columns, coloured by age bracket.
muti_data2 = data[["RPM", "POINTS", "TRB", "AST", "STL", "BLK", "age_cut"]]
sns.pairplot(muti_data2, hue="age_cut")
6.6、球队分析
6.6.1、按照球队分组,平均薪水降序排列
# Highest salary within each age bracket.
# Aggregations are named by string: passing numpy callables such as np.max
# to agg() triggers a FutureWarning in recent pandas releases.
data.groupby(by="age_cut").agg({"SALARY_MILLIONS": "max"})
# Mean salary per team, then the ten teams with the highest mean.
data_team = data.groupby(by="TEAM").agg({"SALARY_MILLIONS": "mean"})
data_team.sort_values(by="SALARY_MILLIONS", ascending=False).head(10)
6.6.2、按照球队年龄结构
分球队、分年龄段,上榜球员降序排列,如果上榜球员数相同,则按效率降序排列
# Per team and age bracket: mean salary, mean RPM and number of listed players.
# String aggregation names avoid the FutureWarning pandas raises for numpy
# callables; "size" counts the rows in each group, as np.size did.
data_rpm = data.groupby(by=["TEAM", "age_cut"]).agg(
    {"SALARY_MILLIONS": "mean", "RPM": "mean", "PLAYER": "size"}
)
data_rpm.head()
# Most listed players first; ties are broken by higher mean RPM.
data_rpm.sort_values(by=["PLAYER", "RPM"], ascending=False).head(10)
# Rank teams by overall strength: per-team means of several columns plus the
# number of listed players, with TEAM kept as a regular column.
# String aggregation names avoid the FutureWarning pandas raises for numpy
# callables; "size" counts the rows in each group, as np.size did.
data_rpm2 = data.groupby(by=["TEAM"], as_index=False).agg(
    {
        "SALARY_MILLIONS": "mean",
        "RPM": "mean",
        "PLAYER": "size",
        "POINTS": "mean",
        "FG": "mean",
        "MPG": "mean",
        "AGE": "mean",
    }
)
# Teams with the highest mean RPM first.
data_rpm2.sort_values(by="RPM", ascending=False).head()
# Compare ten selected teams with box plots (violin plots are drawn separately).
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
# NOTE(review): the original list contained 'CS', which is not an NBA team
# abbreviation; 'GS' (Golden State) is assumed — confirm against the TEAM column.
data_team2 = data[
    data.TEAM.isin(
        ["GS", "CLE", "SA", "LAC", "OKC", "UTAH", "CHA", "TOR", "NO", "BOS"]
    )
]
# One stacked panel per metric: salary, age, and the MPG column.
for position, metric in enumerate(("SALARY_MILLIONS", "AGE", "MPG"), start=1):
    plt.subplot(3, 1, position)
    sns.boxplot(x="TEAM", y=metric, data=data_team2)
# Violin plots for the same ten selected teams.
sns.set_style("whitegrid")
plt.figure(figsize=(20, 10))
# NOTE(review): the original list contained 'CS', which is not an NBA team
# abbreviation; 'GS' (Golden State) is assumed — confirm against the TEAM column.
data_team2 = data[
    data.TEAM.isin(
        ["GS", "CLE", "SA", "LAC", "OKC", "UTAH", "CHA", "TOR", "NO", "BOS"]
    )
]
# One stacked panel per column: "3P%", "eFG%" and "POINTS".
for position, metric in enumerate(("3P%", "eFG%", "POINTS"), start=1):
    plt.subplot(3, 1, position)
    sns.violinplot(x="TEAM", y=metric, data=data_team2)