分析各个变量对理赔的影响:
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def anova(frame, qualitative, target='Y'):
    """Rank features by one-way ANOVA p-value against a target column.

    For every feature, the rows of ``frame`` are grouped by the feature's
    distinct values and ``scipy.stats.f_oneway`` is run on the per-group
    ``target`` values.

    Args:
        frame: DataFrame holding the feature columns and ``target``.
        qualitative: Iterable of column names to test; each distinct value
            in a column is treated as one group.
        target: Name of the response column. Defaults to ``'Y'``.

    Returns:
        DataFrame with columns ``feature`` and ``pval``, sorted by ascending
        ``pval``. ``pval`` is NaN for a feature with fewer than two
        non-missing groups, because the F-test is undefined there.
    """
    features = list(qualitative)
    pvals = []
    for col in features:
        # groupby leaves out rows whose key is missing; an ``== NaN`` mask
        # would instead yield an empty sample and a NaN p-value.
        # observed=True keeps unused categorical levels from adding empty
        # groups.
        grouped = frame.groupby(col, sort=False, observed=True)[target]
        samples = [values.values for _, values in grouped]
        if len(samples) < 2:
            # f_oneway needs at least two groups; report "undefined"
            # instead of aborting the whole ranking.
            pvals.append(float('nan'))
        else:
            pvals.append(stats.f_oneway(*samples)[1])
    anv = pd.DataFrame({'feature': features, 'pval': pvals})
    return anv.sort_values('pval')
# Load the training sample and assign fixed column names.
path_train = 'D:/compete/PINGAN-2018-train_demo.csv'
train_data = pd.read_csv(path_train)
train_data.columns = [
    "TERMINALNO", "TIME", "TRIP_ID", "LONGITUDE", "LATITUDE",
    "DIRECTION", "HEIGHT", "SPEED", "CALLSTATE", "Y",
]

# Separate the target column 'Y' from the predictor columns.
train_y = train_data['Y']
train_x = train_data.drop(columns='Y')

# Partition predictor columns by whether their dtype is 'object'.
quantity = [name for name, dtype in train_x.dtypes.items() if dtype != 'object']
quality = [name for name, dtype in train_x.dtypes.items() if dtype == 'object']

# ANOVA p-value of every non-object column against 'Y'.
a = anova(train_data, quantity)
print(a['pval'].values)

# log(1/p): the smaller the p-value, the taller the bar.
a['disparity'] = np.log(1. / a['pval'].values)

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(data=a, x='feature', y='disparity')
x = plt.xticks(rotation=90)  # rotate feature labels so they do not overlap
plt.show()
分析打电话的状态对理赔的影响:
发现了一个很好用的数据可视化工具 seaborn:它是在 matplotlib 基础上做的更高级封装,作图更美观。