摘要
Stacking 是模型融合的一种方法,流派很多,我所知道的包括:
(然而我分不清Blend和Stack的区别, 手动滑稽)
- 使用有放回无重复的取样,训练多个子模型,通过子模型生成新的训练集和测试集,再进行下一波训练
- 使用多个弱模型的输出作为下一层模型的输入
- Kaggle中的这个方法
Kaggle原文链接
示例代码
说明:这段代码用于Kaggle Toxic 分类比赛:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
import pandas as pd
import numpy as np
from functools import reduce
# Candidate submission files to blend. Each is a prior ensemble/fusion
# result for the Jigsaw toxic-comment competition; the bracketed tags in
# the filenames record which base models went in and the public LB score.
files = [
    './submission_NewData/[base_22_1]ronghe_v4_0.9865.csv',
    './submission_NewData/[base_19]blend_it_all_[0.9868].csv',
    './submission_NewData/[base_20]blend0.9870_[0.9870].csv',
    './submission_NewData/DL_fusion/[fusion_21_3]_[Base_(6+10_2+13+14)]+[Fusion_(4_3+13+22_1+22_2)]submission_fusion_[13411111]_[0.9866].csv',
    './submission_NewData/DL_fusion/[9873]_submission_fusion_[Weighted].csv',
    './submission_NewData/DL_fusion/[9872_top]submission_fusion_[Weighted].csv',
    './submission_NewData/DL_fusion/[fusion_0.9871]_[Base(6+10_2+13+14+15+17_1+17_2)_DL(14_2+16_2+17+18)_DL_Fusion(4_3+13+22_1+22_2+23)]_[11.57.563.57311.51.51.511112].csv',
]

# Load every submission once, preserving the order of `files` — downstream
# code indexes into this list positionally (e.g. dataframes[0] for ids).
dataframes = []
for path in files:
    dataframes.append(pd.read_csv(path))

# The six target columns of the toxic-comment classification task.
label_cols = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
]
def get_MinMax_Mean_Stacking(label, concat_sub, n_files=None, lo=None, hi=None):
    """MinMax + Mean stacking for one label column.

    For each row: if every one of the first *n_files* base predictions is
    above *lo* (all models confident positive) take the row max; if every
    prediction is below *hi* (all confident negative) take the row min;
    otherwise fall back to the row mean. Writes the result into
    ``concat_sub[label]`` and returns the (mutated) frame.

    The new keyword parameters default to ``None`` and fall back to the
    module-level globals ``file_number`` / ``cutoff_lo`` / ``cutoff_hi``,
    so the original two-argument call sites keep working unchanged.

    NOTE(review): in this script ``cutoff_lo`` is 0.8 and ``cutoff_hi`` is
    0.2 — the names are swapped relative to their values, but the
    comparisons (``> lo`` for "all high", ``< hi`` for "all low") still
    implement the intended rule.

    Expects ``concat_sub`` to already contain the ``label + '_max'``,
    ``label + '_min'`` and ``label + '_mean'`` columns.
    """
    n = file_number if n_files is None else n_files
    lo = cutoff_lo if lo is None else lo
    hi = cutoff_hi if hi is None else hi
    base = concat_sub.iloc[:, 0:n]
    concat_sub[label] = np.where(
        np.all(base > lo, axis=1),
        concat_sub[label + '_max'],
        np.where(np.all(base < hi, axis=1),
                 concat_sub[label + '_min'],
                 concat_sub[label + '_mean']))
    return concat_sub
def get_MinMax_Median_Stacking(label, concat_sub, n_files=None, lo=None, hi=None):
    """MinMax + Median stacking for one label column.

    Same rule as :func:`get_MinMax_Mean_Stacking` but the undecided rows
    (not unanimously high or unanimously low) fall back to the row
    median instead of the mean. Writes into ``concat_sub[label]`` and
    returns the (mutated) frame.

    ``n_files`` / ``lo`` / ``hi`` default to the module-level globals
    ``file_number`` / ``cutoff_lo`` / ``cutoff_hi`` so the original
    two-argument call sites keep working unchanged.

    Expects ``concat_sub`` to already contain the ``label + '_max'``,
    ``label + '_min'`` and ``label + '_median'`` columns.
    """
    n = file_number if n_files is None else n_files
    lo = cutoff_lo if lo is None else lo
    hi = cutoff_hi if hi is None else hi
    base = concat_sub.iloc[:, 0:n]
    concat_sub[label] = np.where(
        np.all(base > lo, axis=1),
        concat_sub[label + '_max'],
        np.where(np.all(base < hi, axis=1),
                 concat_sub[label + '_min'],
                 concat_sub[label + '_median']))
    return concat_sub
def get_MinMax_BestBase_Stacking(label, concat_sub, sub_base=None,
                                 n_files=None, lo=None, hi=None):
    """MinMax + best-single-submission stacking for one label column.

    Same rule as :func:`get_MinMax_Mean_Stacking` but the undecided rows
    fall back to the prediction of the best standalone submission
    (*sub_base*). Writes into ``concat_sub[label]`` (and caches the base
    column as ``label + '_base'``) and returns the (mutated) frame.

    ``sub_base`` may be passed as an already-loaded DataFrame containing
    a *label* column; when ``None`` (the original behavior) the
    hard-coded best-scoring submission CSV is read from disk.
    ``n_files`` / ``lo`` / ``hi`` default to the module-level globals
    ``file_number`` / ``cutoff_lo`` / ``cutoff_hi`` so the original
    two-argument call sites keep working unchanged.
    """
    if sub_base is None:
        # Original hard-coded best base submission (public LB 0.9874).
        sub_base = pd.read_csv('./submission_NewData/DL_fusion/[9874]_submission_fusion_[Weighted].csv')
    n = file_number if n_files is None else n_files
    lo = cutoff_lo if lo is None else lo
    hi = cutoff_hi if hi is None else hi
    concat_sub[label + '_base'] = sub_base[label]
    base = concat_sub.iloc[:, 0:n]
    concat_sub[label] = np.where(
        np.all(base > lo, axis=1),
        concat_sub[label + '_max'],
        np.where(np.all(base < hi, axis=1),
                 concat_sub[label + '_min'],
                 concat_sub[label + '_base']))
    return concat_sub
# --- Main stacking pass -----------------------------------------------------
# For each label, line up that label's column from every submission,
# compute per-row max/min/mean/median, apply the MinMax+Mean rule, and
# collect the blended column.
result = []
file_number = len(files)
# NOTE(review): these names are swapped relative to their values
# (cutoff_lo > cutoff_hi) — the stacking functions compare `> cutoff_lo`
# and `< cutoff_hi`, so the rule itself behaves as intended.
cutoff_lo = 0.8
cutoff_hi = 0.2

for label in label_cols:
    # One column per submission, side by side: label0, label1, ...
    per_file = [frame[label] for frame in dataframes]
    concat_sub = pd.concat(per_file, axis=1)
    concat_sub.columns = [label + str(i) for i in range(len(concat_sub.columns))]

    # Row-wise summary statistics over the base predictions.
    base_cols = concat_sub.iloc[:, 0:file_number]
    concat_sub[label + '_max'] = base_cols.max(axis=1)
    concat_sub[label + '_min'] = base_cols.min(axis=1)
    concat_sub[label + '_mean'] = base_cols.mean(axis=1)
    concat_sub[label + '_median'] = base_cols.median(axis=1)

    # Only the mean-fallback variant is used for the final submission;
    # the median/best-base variants above are alternatives to swap in.
    concat_sub = get_MinMax_Mean_Stacking(label, concat_sub)
    print(concat_sub.head())
    result.append(concat_sub[label])

# Reassemble the submission: ids from the first file + one blended
# column per label, then write it out.
ids = dataframes[0]['id']
concat_res = pd.concat([ids] + result, axis=1)
concat_res.columns = ['id'] + label_cols
print(concat_res.head())
concat_res.to_csv('./submission_NewData/DL_fusion/submission_fusion_[Stacking].csv', index=False)