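"""Unordered high-cardinality categorical features (e.g. city, province).

We target-encode them. To reduce overfitting, the encoded values are computed
with a 5-fold cross-validation scheme (see the figure below).
"""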
def kfold_stats_feature(train, test, feats, k, target='isDefault', seed=6666):
    """Add out-of-fold target-mean encodings for the given categorical features.

    For every feature in ``feats`` a float column named
    ``<feat>_<target>_kfold_mean`` is added to both frames:

    * ``train``: each row receives the mean of ``target`` for its category,
      computed only from the rows of the *other* folds.
    * ``test``: each row receives the mean of ``target`` for its category,
      computed from all rows of ``train``.

    Categories that have no mean available (not present in the rows used for
    fitting, or missing values) fall back to the mean of ``target`` over all
    of ``train``.

    Args:
        train: DataFrame containing the columns in ``feats`` and ``target``.
        test: DataFrame containing the columns in ``feats``.
        feats: Iterable of column names to encode.
        k: Number of folds passed to ``StratifiedKFold`` as ``n_splits``.
        target: Name of the target column in ``train``.
        seed: ``random_state`` passed to ``StratifiedKFold``. Ideally the same
            split configuration is used by the model's own cross-validation.

    Returns:
        The tuple ``(train, test)``. Both frames are modified in place; they
        are returned for convenience.
    """
    import numpy as np

    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    # ``split`` yields *positional* indices. Materialise them once so every
    # feature is encoded with the same partition, and always address rows
    # positionally (``iloc`` / ndarray indexing) so that frames without a
    # default RangeIndex are handled correctly.
    splits = list(folds.split(train, train[target]))

    # Fallback for categories without a fitted mean; it does not depend on the
    # fold or the feature, so compute it once.
    global_mean = train[target].mean()

    for feat in feats:
        colname = feat + '_' + target + '_kfold_mean'

        # Build the encoded column in a scratch array instead of writing a
        # temporary 'fold' helper column into the caller's frame.
        encoded = np.full(len(train), np.nan, dtype=float)
        for trn_idx, val_idx in splits:
            fold_means = train.iloc[trn_idx].groupby(feat)[target].mean()
            held_out = train[feat].iloc[val_idx]
            encoded[val_idx] = held_out.map(fold_means).astype(float).to_numpy()
        encoded[np.isnan(encoded)] = global_mean
        train[colname] = encoded

        # The test set is encoded with statistics from the full training set.
        full_means = train.groupby(feat)[target].mean()
        test[colname] = test[feat].map(full_means).astype(float).fillna(global_mean)

    return train, test
# Categorical columns that receive the K-fold target-mean encoding.
target_encode_cols = [
    'postCode',
    'regionCode',
    'homeOwnership',
    'employmentTitle',
    'title',
]
kflod_num = 5  # 5-fold cross-validation
train, test = kfold_stats_feature(
    train=train,
    test=test,
    feats=target_encode_cols,
    k=kflod_num,
)