import pandas as pd
import numpy as np
import scipy.stats as ss
import matplotlib.pyplot as plt
import seaborn as sns
# Load the HR data set from the local CSV file.
df = pd.read_csv("./data/HR.csv")
# Map each department name to the array of positional row indices of its group.
dp_indices = df.groupby("department").indices
# Pull the "left" column values for the two departments being compared.
sales_values, technical_values = (
    df["left"].iloc[dp_indices[name]].values for name in ("sales", "technical")
)
# Independent two-sample t-test between the two departments; print only the p-value.
print(ss.ttest_ind(sales_values, technical_values).pvalue)
# Pairwise independent t-tests on the "left" column between every pair of
# departments, shown as a heatmap with department names on both axes.
dp_keys = list(dp_indices.keys())
# Slice each department's sample once instead of re-slicing inside the loops.
dp_samples = [df["left"].iloc[dp_indices[key]].values for key in dp_keys]
dp_t_mat = np.zeros([len(dp_keys), len(dp_keys)])
for i, sample_i in enumerate(dp_samples):
    for j, sample_j in enumerate(dp_samples):
        # Index 1 of the t-test result is the p-value, so despite its name
        # dp_t_mat holds p-values, not t statistics.
        p_value = ss.ttest_ind(sample_i, sample_j)[1]
        dp_t_mat[i, j] = p_value
sns.heatmap(dp_t_mat, xticklabels=dp_keys, yticklabels=dp_keys)
plt.show()
![img_46b64192502fc4005c397749948cf270.png](https://i-blog.csdnimg.cn/blog_migrate/598ec2a124a01305ae509f15fc06a1f7.png)
颜色越深的地方p值越接近于0,也就代表颜色越深的地方,二者的离职率是有显著差异的,而颜色淡的地方代表离职率没有显著差异
# Mean of "left" for each (promotion_last_5years, salary) row group, with one
# column per Work_accident value.
# aggfunc is passed as the string "mean": pandas 2.x emits a FutureWarning when
# given np.mean here, while the string selects the same built-in mean.
piv_tb = pd.pivot_table(
    df,
    values="left",
    index=["promotion_last_5years", "salary"],
    columns=["Work_accident"],
    aggfunc="mean",
)
print(piv_tb)
![img_4016b545f48de7108200a0174cd24156.png](https://i-blog.csdnimg.cn/blog_migrate/07588d0e35f6f52f8c4de97b07e2c7bf.png)
# Draw the pivot table as a heatmap with the colour scale pinned to [0, 1],
# then display the figure.
sns.heatmap(data=piv_tb, vmin=0, vmax=1)
plt.show()
![img_4b684e9b2a237b4cc569fb41cce59666.png](https://i-blog.csdnimg.cn/blog_migrate/1682f22ad1631c481bc5b7672c46ccc8.png)