trainData  # data overview
# The last column is the dependent variable (target).
def check_var(df, col, target='y1_is_purchase'):  # overview of one variable
    """Print a quick profile of ``df[col]`` and the mean of ``target`` per group.

    A numeric column with more than 10 distinct values (NaN counts as a
    value) is treated as continuous: its ``describe()`` summary is printed,
    the column is binned on its min / 25% / 50% / 75% values, and the mean of
    ``target`` within each bin is printed. Any other column is treated as
    discrete: its value counts and the mean of ``target`` per distinct value
    are printed, both ordered by value.

    Args:
        df: DataFrame holding both ``col`` and ``target``. It is not modified.
        col: Name of the column to profile.
        target: Name of the dependent-variable column to average. Defaults to
            ``'y1_is_purchase'``; pass the target name of the current frame
            instead of editing this function.

    Returns:
        None. All results are printed.
    """
    series = df[col]
    # Only numeric columns can be binned on quantiles; a high-cardinality
    # text column has no min/25%/50%/75% entries in describe().
    is_continuous = (
        pd.api.types.is_numeric_dtype(series)
        and series.nunique(dropna=False) > 10
    )

    if is_continuous:
        summary = series.describe()
        print(summary)
        # Select the quartile edges by label rather than by position, and
        # collapse duplicated edges (e.g. 50% == 75%) so the bins stay
        # strictly increasing.
        edges = np.unique(
            summary.loc[['min', '25%', '50%', '75%']].to_numpy(dtype=float)
        )
        # Bins are left-closed (right=False); an open-ended last bin keeps
        # the maximum value inside the final interval.
        edges = np.append(edges, np.inf)
        # Group on a detached Series instead of writing a scratch column into
        # the caller's frame. The name 'test' keeps the printed index label.
        binned = pd.cut(series, bins=edges, right=False).rename('test')
        # observed=False keeps empty bins in the output.
        print(df.groupby(binned, observed=False)[target].mean())
    else:
        print(series.value_counts().sort_index())
        print(df.groupby(col)[target].mean().sort_index())
if __name__ == '__main__':
    # Example run: profile the 'si_tp' column of the training data.
    check_var(trainData, 'si_tp')
>>>count 6.842830e+05
mean 8.554260e+05
std 4.836979e+05
min 0.000000e+00
25% 5.000000e+05
50% 1.000000e+06
75% 1.000000e+06
max 1.000000e+07
Name: si_tp, dtype: float64
test
[0.0, 500000.0) 0.325231
[500000.0, 1000000.0) 0.635996
[1000000.0, inf) 0.698135
Name: y1_is_purchase, dtype: float64