零基础入门数据挖掘-Task5 模型融合


1. 学习内容

2. 为什么要进行模型融合

3. 回归或分类的概率融合

3.1 简单平均加权

3.2 Stacking

4. 分类模型的融合

4.1 导入相关模块

4.2 投票法

4.3 Stacking

5. 参考文献

import numpy as np
import pandas as pd
from sklearn import metrics
import warnings


# Toy sample data: test_pre{i} holds the predictions of the i-th model.
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

# y_test_true holds the ground-truth values the predictions are scored against.
y_test_true = [1, 3, 2, 6]

# Weighted average of three models' predictions.
def Weighted_method(test_pre1, test_pre2, test_pre3, w = (1/3, 1/3, 1/3)):
    """Blend three prediction vectors using one weight per model.

    Args:
        test_pre1, test_pre2, test_pre3: Predictions of the three models;
            each is wrapped in ``pd.Series`` before being combined.
        w: Indexable of three weights, applied to the models in order.
            Defaults to equal weights. The default is an immutable tuple so
            it cannot be modified between calls. The weights are used as
            given; they are not normalized.

    Returns:
        pd.Series: ``w[0]*test_pre1 + w[1]*test_pre2 + w[2]*test_pre3``.
    """
    Weighted_result = (w[0] * pd.Series(test_pre1)
                       + w[1] * pd.Series(test_pre2)
                       + w[2] * pd.Series(test_pre3))
    return Weighted_result

# Mean absolute error (MAE) of each individual model's predictions.
print('Pred1 MAE:',metrics.mean_absolute_error(y_test_true, test_pre1))
print('Pred2 MAE:',metrics.mean_absolute_error(y_test_true, test_pre2))
print('Pred3 MAE:',metrics.mean_absolute_error(y_test_true, test_pre3))
## MAE of the weighted blend of the three models.
w = [0.3,0.4,0.3] # per-model blending weights, in model order
Weighted_pre = Weighted_method(test_pre1, test_pre2, test_pre3, w)
print('Weighted_pre MAE:', metrics.mean_absolute_error(y_test_true, Weighted_pre))
Pred1 MAE: 0.1750000000000001
Pred2 MAE: 0.07499999999999993
Pred3 MAE: 0.10000000000000009
Weighted_pre MAE: 0.05750000000000027
# Unweighted (arithmetic mean) blend of three models' predictions.
def Mean_method(test_pre1, test_pre2, test_pre3):
    """Return the element-wise mean of three prediction vectors.

    Each input is wrapped in ``pd.Series``; the three series become the
    columns of one frame and the mean is taken across each row.

    Returns:
        pd.Series: one averaged prediction per row.
    """
    columns = [pd.Series(preds) for preds in (test_pre1, test_pre2, test_pre3)]
    side_by_side = pd.concat(columns, axis = 1)
    return side_by_side.mean(axis = 1)

# Blend the three models by simple averaging and report the MAE of the blend.
Mean_pre = Mean_method(test_pre1, test_pre2, test_pre3)
print('Mean_pre MAE:', metrics.mean_absolute_error(y_test_true, Mean_pre))
Mean_pre MAE: 0.06666666666666693
# Median blend of three models' predictions.
def Median_method(test_pre1, test_pre2, test_pre3):
    """Return the element-wise median of three prediction vectors.

    Each input is wrapped in ``pd.Series``; the three series become the
    columns of one frame and the median is taken across each row.

    Returns:
        pd.Series: one median prediction per row.
    """
    columns = [pd.Series(preds) for preds in (test_pre1, test_pre2, test_pre3)]
    side_by_side = pd.concat(columns, axis = 1)
    return side_by_side.median(axis = 1)

# Blend the three models by taking the median and report the MAE of the blend.
Median_pre = Median_method(test_pre1, test_pre2, test_pre3)
print('Median_pre MAE:', metrics.mean_absolute_error(y_test_true, Median_pre))
Median_pre MAE: 0.07500000000000007

3.2 Stacking




from sklearn import linear_model

def Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                    test_pre1, test_pre2, test_pre3,
                    model_L2 = None):
    """Stack three base models' predictions with a second-level model.

    The three ``train_reg*`` vectors become the three feature columns used to
    fit ``model_L2`` against ``y_train_true``; the three ``test_pre*`` vectors
    are arranged the same way and passed to ``model_L2.predict``.

    Args:
        train_reg1, train_reg2, train_reg3: Base-model predictions used as
            training features for the second-level model.
        y_train_true: Targets the second-level model is fitted against.
        test_pre1, test_pre2, test_pre3: Base-model predictions to be
            combined by the fitted second-level model.
        model_L2: Any object exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place. When omitted, a fresh
            ``linear_model.LinearRegression()`` is created for this call, so
            separate calls never share (and refit) one default estimator.

    Returns:
        Whatever ``model_L2.predict`` returns for the stacked test features.
    """
    if model_L2 is None:
        # Build the default per call instead of once at definition time.
        model_L2 = linear_model.LinearRegression()

    # One column per base model, one row per sample.
    train_features = pd.concat([pd.Series(train_reg1), pd.Series(train_reg2),
                                pd.Series(train_reg3)], axis = 1).values
    test_features = pd.concat([pd.Series(test_pre1), pd.Series(test_pre2),
                               pd.Series(test_pre3)], axis = 1).values

    model_L2.fit(train_features, y_train_true)
    Stacking_result = model_L2.predict(test_features)
    return Stacking_result

# Toy sample data: train_reg{i} holds the i-th base model's predictions that
# are used as training features for the second-level model.
train_reg1 = [3.2, 8.2, 9.1, 5.2]
train_reg2 = [2.9, 8.1, 9.0, 4.9]
train_reg3 = [3.1, 7.9, 9.2, 5.0]
# y_train_true holds the targets the second-level model is fitted against.
y_train_true = [3, 8, 9, 5] 

# test_pre{i} holds the i-th base model's predictions to be combined.
test_pre1 = [1.2, 3.2, 2.1, 6.2]
test_pre2 = [0.9, 3.1, 2.0, 5.9]
test_pre3 = [1.1, 2.9, 2.2, 6.0]

# y_test_true holds the ground-truth values the stacked output is scored against.
y_test_true = [1, 3, 2, 6]

# Fit a linear second-level model on the training predictions and report the
# MAE of its output for the test predictions.
model_L2 = linear_model.LinearRegression()
Stacking_pre = Stacking_method(train_reg1, train_reg2, train_reg3, y_train_true,
                               test_pre1, test_pre2, test_pre3, model_L2)
print('Stacking_pre MAE:', metrics.mean_absolute_error(y_test_true, Stacking_pre))
Stacking_pre MAE: 0.042134831460675204

4. 分类模型的融合

4.1 导入相关模块

from sklearn.datasets import make_blobs
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

4.2 投票法


# Hard voting: every model casts one equal vote (no per-model importance) and
# the class that receives the most votes is the final prediction.
iris = datasets.load_iris()

x = iris.data
y = iris.target
# NOTE: this hold-out split is not used below; scoring is done with 5-fold
# cross-validation on the full (x, y).
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

# Iris has three classes, so declare a multi-class objective rather than
# 'binary:logistic'.
clf1 = XGBClassifier(learning_rate = 0.1, n_estimators = 150, max_depth = 3,
                     min_child_weight = 2, subsample = 0.7,
                     colsample_bytree = 0.6, objective = 'multi:softprob')
# min_samples_leaf was 63. Each 5-fold training split has 120 samples, and a
# split needs at least 63 samples on BOTH sides (>= 126), so no tree could
# ever split and the forest scored at chance level (0.33). Use a small leaf
# size so the depth-1 trees can actually split.
clf2 = RandomForestClassifier(n_estimators = 50, max_depth = 1,
                              min_samples_split = 4,
                              min_samples_leaf = 2, oob_score = True)
clf3 = SVC(C = 0.1)

# Hard-voting ensemble of the three classifiers.
eclf = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2),
                                      ('svc', clf3)], voting = 'hard')
# Report 5-fold cross-validated accuracy for each model and for the ensemble.
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.95 (+/- 0.03) [Ensemble]
# Soft voting: instead of counting class votes, the ensemble averages the
# class probabilities predicted by each model; `weights` lets some models
# count more than others in that average.
x = iris.data
y = iris.target
# NOTE: scoring below uses 5-fold cross-validation on the full (x, y); the
# hold-out split is only used for the standalone clf1.fit() call.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

# Iris has three classes, so declare a multi-class objective rather than
# 'binary:logistic'.
clf1 = XGBClassifier(learning_rate = 0.1, n_estimators = 150, max_depth = 3,
                     min_child_weight = 2, subsample = 0.8,
                     colsample_bytree = 0.8, objective = 'multi:softprob')
# min_samples_leaf was 63. Each 5-fold training split has 120 samples, and a
# split needs at least 63 samples on BOTH sides (>= 126), so no tree could
# ever split and the forest scored at chance level (0.33). Use a small leaf
# size so the depth-1 trees can actually split.
clf2 = RandomForestClassifier(n_estimators = 50, max_depth = 1,
                              min_samples_split = 4,
                              min_samples_leaf = 2, oob_score = True)
# probability = True makes SVC expose predict_proba, which soft voting needs.
clf3 = SVC(C = 0.1, probability = True)

# Soft-voting ensemble; XGBoost's probabilities get twice the weight.
eclf = VotingClassifier(estimators = [('xgb', clf1), ('rf', clf2),
                                      ('svc', clf3)], voting = 'soft',
                        weights = [2, 1, 1])
# Kept from the original cell: leaves clf1 fitted on the hold-out training
# split. It does not influence the scores below, because cross_val_score
# fits its own clones of each estimator.
clf1.fit(x_train, y_train)

# Report 5-fold cross-validated accuracy for each model and for the ensemble.
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['XGBBoosting', 'Random Forest', 'SVM', 'Ensemble']):
    scores = cross_val_score(clf, x, y, cv = 5, scoring = 'accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
Accuracy: 0.96 (+/- 0.02) [XGBBoosting]
Accuracy: 0.33 (+/- 0.00) [Random Forest]
Accuracy: 0.95 (+/- 0.03) [SVM]
Accuracy: 0.96 (+/- 0.02) [Ensemble]

4.3 Stacking

# 5-Fold Stacking
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
import pandas as pd

# Keep only the first 100 rows. Iris is ordered by class with 50 rows each, so
# this leaves a two-class problem, which predict_proba(...)[:, 1] and ROC AUC
# below rely on.
data_0 = iris.data
data = data_0[:100,:]

target_0 = iris.target
target = target_0[:100]

# Base (first-level) models whose out-of-fold predictions become the
# features of the second-level model.
clfs = [LogisticRegression(solver = 'lbfgs'),
        RandomForestClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'gini'),
        ExtraTreesClassifier(n_estimators = 5, n_jobs = -1, criterion = 'entropy'),
        GradientBoostingClassifier(learning_rate = 0.05, subsample = 0.5,
                                   max_depth = 6, n_estimators = 5)]
# Hold out part of the data as the evaluation set.
X, X_predict, y, y_predict = train_test_split(data, target, test_size = 0.3,
                                              random_state = 2020)

# One meta-feature column per base model.
dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_predict.shape[0], len(clfs)))

# 5-fold stacking.
n_splits = 5
# StratifiedKFold.split() returns a one-shot generator. Materialize the folds
# once so that EVERY base model iterates over them; iterating the bare
# generator exhausted it on the first model, leaving all other models
# untrained and their meta-feature columns all zeros (AUC 0.5).
skf = list(StratifiedKFold(n_splits).split(X, y))

for j, clf in enumerate(clfs):
    # Train each base model in turn.
    # Per-fold predictions of this model on the hold-out set.
    dataset_blend_test_j = np.zeros((X_predict.shape[0], n_splits))
    for i, (train, test) in enumerate(skf):
        # Fit on all folds except the i-th and predict the i-th fold; those
        # out-of-fold probabilities are the new feature for the rows in fold i.
        X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the hold-out set, the new feature is the mean of the per-fold
    # models' predicted probabilities.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    print("val auc Score: %f" % roc_auc_score(y_predict, dataset_blend_test[:, j]))

# Second-level model: logistic regression on the stacked meta-features.
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Val auc Score of Stacking: %f" % (roc_auc_score(y_predict, y_submission)))
val auc Score: 1.000000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
val auc Score: 0.500000
Val auc Score of Stacking: 1.000000


5. 参考文献

