模型融合
方法1: 模型平均
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor that averages the predictions of several base models.

    Parameters
    ----------
    models : sequence of estimators
        Unfitted estimator templates. They are cloned in ``fit``, so the
        objects passed in here are never fitted themselves.

    Attributes
    ----------
    clone_models : list
        The fitted clones, in the same order as ``models``. Set by ``fit``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)`` and return self."""
        # Clone before fitting so the caller's estimators stay untouched.
        self.clone_models = list(map(clone, self.models))
        for estimator in self.clone_models:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Return the unweighted mean of the base-model predictions for ``X``."""
        # One column per fitted model, then average across the columns.
        # Assumes each predict() yields one value per sample -- TODO confirm.
        per_model = [estimator.predict(X) for estimator in self.clone_models]
        return np.column_stack(per_model).mean(axis=1)
测试案例:
# Cross-validate the plain averaging ensemble and report mean (std) of the score.
averaged_models = AveragingModels(models=[ENet, GBoost, KRR, lasso])
score = nmse_cv(averaged_models)
print(
    'Averaged base models score: {:.4f} ({:.4f}) \n'.format(
        score.mean(), score.std()
    )
)
方法2: 模型叠加
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Stacked regressor: a meta-model trained on out-of-fold base predictions.

    Parameters
    ----------
    base_models : sequence of estimators
        Unfitted first-level estimator templates; cloned once per fold.
    meta_model : estimator
        Unfitted second-level estimator template; cloned in ``fit``.
    n_folds : int, default=5
        Number of K-fold splits used to build the out-of-fold predictions.
    random_state : int, RandomState instance or None, default=156
        Passed to ``KFold`` (which shuffles). The default keeps the value
        that used to be hard-coded, so existing callers get the same folds.

    Attributes
    ----------
    clone_base_models : list of list
        ``clone_base_models[i]`` holds the per-fold fitted clones of
        ``base_models[i]``. Set by ``fit``.
    clone_meta_model : estimator
        The fitted clone of ``meta_model``. Set by ``fit``.
    """

    def __init__(self, base_models, meta_model, n_folds=5, random_state=156):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        self.random_state = random_state

    @staticmethod
    def _take_rows(data, indices):
        """Select rows of ``data`` by integer position."""
        # pandas treats ``data[int_array]`` as a column/label lookup, so
        # positional selection has to go through ``.iloc`` when it exists.
        if hasattr(data, "iloc"):
            return data.iloc[indices]
        return data[indices]

    def fit(self, X, y):
        """Fit per-fold base-model clones, then the meta-model; return self."""
        self.clone_base_models = [[] for _ in self.base_models]
        self.clone_meta_model = clone(self.meta_model)
        kfold = KFold(
            n_splits=self.n_folds, shuffle=True, random_state=self.random_state
        )
        # Compute the folds once so every base model is trained and scored on
        # the same partitions, whatever kind of random_state is supplied.
        folds = list(kfold.split(X, y))
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))

        # Each base model predicts only the rows it did not see during
        # training; those predictions become one feature column for the
        # meta-model.
        for i, model in enumerate(self.base_models):
            for train_index, test_index in folds:
                instance = clone(model)
                self.clone_base_models[i].append(instance)
                instance.fit(
                    self._take_rows(X, train_index),
                    self._take_rows(y, train_index),
                )
                y_pred = instance.predict(self._take_rows(X, test_index))
                out_of_fold_predictions[test_index, i] = y_pred

        # Train the meta-model on the out-of-fold predictions against the
        # original training targets.
        self.clone_meta_model.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        """Predict with the meta-model on fold-averaged base predictions."""
        # For each base model, average the predictions of its per-fold
        # clones; stacking those averages gives one meta-feature per model.
        meta_features = np.column_stack([
            np.column_stack(
                [model.predict(X) for model in fold_models]
            ).mean(axis=1)
            for fold_models in self.clone_base_models
        ])
        return self.clone_meta_model.predict(meta_features)
测试案例:
# Cross-validate the stacked ensemble and report mean (std) of the score.
stacked_averaged_models = StackingAveragedModels(
    base_models=(ENet, GBoost, KRR),
    meta_model=lasso,
)
score = nmse_cv(stacked_averaged_models)
print(
    'Stacking Averaged models score: {:.4f} ({:.4f})'.format(
        score.mean(), score.std()
    )
)
# Result: the stacked model scores lower than the averaged model,
# i.e. the stacked model performs better.
测试项目链接: