Python - 模型集成（挂袋法、权重提升法和梯度提升法）

最新推荐文章于 2024-01-26 22:39:36 发布

Alvin Ai

最新推荐文章于 2024-01-26 22:39:36 发布

阅读量1.8k

点赞数

分类专栏：机器学习

机器学习专栏收录该内容

11 篇文章 3 订阅

订阅专栏

当我们想在数据集上构建许多个模型，便可考虑使用集成的方法：
1. 挂袋法：并行进行，挂袋法集成中的每一个模型只使用训练集的一部分，它们的思路是减少对数据产生过度拟合，但前提是每个模型的差别不能太大，挂袋法对如线性回归之类的线性预测器无效。对于一些很稳定的模型，挂袋法的效果不明显，它适合那些对很小的改变也十分敏感的分类器，例如决策树，它很不稳定，未剪枝决策树就十分适合挂袋法。而KNN分类器则是一种很稳定的模型，不过我们可以使用随机子空间方法，为最近邻方法引入不稳定性。

2. 赋权重提升法：顺序进行，产生一个逐步复杂的模型序列，它按顺序基于前一个模型的错误训练新的模型，每次训练得到的模型被赋予一个权重，这个权重依据模型在给定数据上的效果而定。最终的预测值产生时，这些权重值就是每个特定模型对于最终输出结果的影响力的判据。整体来说就是把错误率低的分类器赋予更大的权重。
3. 梯度提升法：赋权重提升法是通过赋予错误分类的实例更大的权重，使得下一个模型更可能选中这些错误分类的实例再次训练，而这也存在不足之处。梯度提升法则采用梯度而不是权重来鉴别缺陷，基于调整残差（真实值y与预测值y'之差）来完善上一个模型的缺陷。

挂袋法+随机子空间法：

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 17:31:14 2018

@author: Alvin AI
"""

from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

def get_data():
    """Generate the synthetic classification dataset for the bagging demo.

    Builds 500 samples with 30 features through ``make_classification``.
    The four feature counts (total, redundant, informative, repeated) are
    printed on one space-separated line before the data is generated.

    Returns:
        tuple: ``(x, y)`` exactly as returned by ``make_classification``.
    """
    no_features = 30
    # Redundant features: linear combinations of the informative ones, which
    # introduces correlation between features.
    redundant_features = int(0.1 * no_features)
    # 60% of all features carry real signal for the classification task.
    informative_features = int(0.6 * no_features)
    # Repeated features: duplicates drawn at random from the informative and
    # redundant features.
    repeated_features = int(0.1 * no_features)
    # A single pre-formatted argument keeps the output identical whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("%d %d %d %d" % (no_features, redundant_features,
                           informative_features, repeated_features))
    # flip_y=0.03 randomly reassigns the class of 3% of the samples, i.e.
    # it injects a little label noise.
    x, y = make_classification(
        n_samples=500,
        n_features=no_features,
        flip_y=0.03,
        n_informative=informative_features,
        n_redundant=redundant_features,
        n_repeated=repeated_features,
        random_state=7,
    )
    return x, y

def build_single_model(x, y):
    """Fit one default ``KNeighborsClassifier`` on ``(x, y)`` and return it.

    This is the baseline the bagged ensemble is compared against.
    """
    knn = KNeighborsClassifier()
    knn.fit(x, y)
    return knn

def build_bagging_model(x, y):
    """Fit a bagged ensemble of 100 KNN classifiers with random subspaces.

    Settings passed to ``BaggingClassifier``:

    * ``n_estimators=100``: number of KNN estimators in the ensemble.
    * ``max_samples=1.0``: a float is a fraction of the training set, so
      every estimator draws a bootstrap sample as large as the full
      training set (not a single instance). Each bootstrap sample must
      still hold at least as many rows as the KNN's ``n_neighbors``.
    * ``max_features=0.7``: every estimator sees 70% of the attributes;
      this is the random subspace part of the method.
    * ``bootstrap=True`` / ``bootstrap_features=True``: rows and columns
      are both drawn with replacement, so each estimator is trained on a
      different resampled dataset.

    Returns:
        The fitted ``BaggingClassifier``.
    """
    ensemble = BaggingClassifier(
        KNeighborsClassifier(),
        n_estimators=100,
        random_state=9,
        max_samples=1.0,
        max_features=0.7,
        bootstrap=True,
        bootstrap_features=True,
    )
    ensemble.fit(x, y)
    return ensemble


def view_model(model):
    """Print the feature subsets sampled by the first ten estimators.

    Args:
        model: an object exposing ``estimators_features_``, a sequence with
            one feature-index collection per estimator (as a fitted
            ``BaggingClassifier`` provides).
    """
    print("\n sampled attributes in top 10 estimators\n")
    for position, feature_set in enumerate(model.estimators_features_[:10], 1):
        # One pre-formatted string keeps the output identical on Python 2
        # and Python 3 (a two-argument print(...) would print a tuple on 2).
        print("estimator %d %s" % (position, feature_set))
    
# Script entry point: compare a single KNN against the bagged KNN ensemble
# on the training split and on the dev split.
if __name__ == "__main__":
    x, y = get_data()
    # 70% train; the held-out 30% is split again into dev (70%) and
    # test (30%). The test split is not evaluated in this script.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    model = build_single_model(x_train, y_train)
    predicted_y = model.predict(x_train)
    print("\n single model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))

    bagging = build_bagging_model(x_train, y_train)
    predicted_y = bagging.predict(x_train)
    print("\n bagging model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))
    view_model(bagging)

    predicted_y = model.predict(x_dev)
    print("\n single model accuracy on dev data\n")
    print(classification_report(y_dev, predicted_y))

    print("\n bagging model accuracy on dev data\n")
    predicted_y = bagging.predict(x_dev)
    print(classification_report(y_dev, predicted_y))

赋权重提升法：

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 09:13:56 2018

@author: Alvin AI
"""

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,zero_one_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
#import itertools

def get_data():
    """Generate the synthetic classification dataset for the boosting demo.

    Builds 500 samples with 30 features through ``make_classification``
    (10% redundant, 60% informative, 10% repeated, ``flip_y=0.03``). No
    ``n_classes`` is passed, so the library default applies (two classes
    per the scikit-learn documentation). The four feature counts are
    printed on one space-separated line before the data is generated.

    Returns:
        tuple: ``(x, y)`` exactly as returned by ``make_classification``.
    """
    no_features = 30
    redundant_features = int(0.1 * no_features)    # redundant features
    informative_features = int(0.6 * no_features)  # informative features
    repeated_features = int(0.1 * no_features)     # repeated features
    # A single pre-formatted argument keeps the output identical whether
    # ``print`` is the Python 2 statement or the Python 3 function.
    print("%d %d %d %d" % (no_features, redundant_features,
                           informative_features, repeated_features))
    x, y = make_classification(
        n_samples=500,
        n_features=no_features,
        flip_y=0.03,
        n_informative=informative_features,
        n_redundant=redundant_features,
        n_repeated=repeated_features,
        random_state=7,
    )
    return x, y

def build_single_model(x, y):
    """Fit one default ``DecisionTreeClassifier`` on ``(x, y)`` and return it.

    This fully grown tree is the baseline the boosted ensemble is compared
    against.
    """
    tree = DecisionTreeClassifier()
    tree.fit(x, y)
    return tree

def build_boosting_model(x, y, no_estimators=20):
    """Fit an AdaBoost ensemble of decision stumps using the SAMME variant.

    Args:
        x: training features.
        y: training labels.
        no_estimators: number of stumps to boost (``n_estimators``).

    Returns:
        The fitted ``AdaBoostClassifier``.

    Notes:
        * ``max_depth=1`` keeps every tree a stump: one split node and two
          leaves, instead of a fully grown tree.
        * ``min_samples_leaf=1`` is the minimum number of samples required
          at a leaf.
        * SAMME stands for "Stagewise Additive Modeling using a Multi-class
          Exponential loss function". It puts more weight on misclassified
          records; with K = 2 classes it reduces to the original AdaBoost.
    """
    stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    booster = AdaBoostClassifier(
        stump,
        random_state=9,
        n_estimators=no_estimators,
        algorithm="SAMME",
    )
    booster.fit(x, y)
    return booster

def view_model(model):
    """Print each estimator's weight and error and plot one against the other.

    Args:
        model: a fitted boosting model exposing ``estimator_weights_`` and
            ``estimator_errors_``.

    The plot is drawn on figure 1 but not shown here; it appears when
    ``plt.show()`` is called later.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("\n estimator weights and error\n")
    weight_error_pairs = zip(model.estimator_weights_, model.estimator_errors_)
    for position, (weight, error) in enumerate(weight_error_pairs, 1):
        print("estimator %d weight = %0.4f error = %0.4f"
              % (position, weight, error))

    plt.figure(1)
    plt.title("model weight vs error")
    plt.xlabel("weight")
    plt.ylabel("error")
    plt.plot(model.estimator_weights_, model.estimator_errors_)
        
def number_estimators_vs_err_rate(x, y, x_dev, y_dev):
    """Plot the misclassification rate against the number of estimators.

    Trains one boosting model for each ensemble size in 20, 30, ..., 110
    and plots the train and dev misclassification rates on figure 2, then
    shows all pending figures. The ensemble size with the lowest dev error
    is a good choice for the final model.

    Args:
        x: training features.
        y: training labels.
        x_dev: dev-set features.
        y_dev: dev-set labels.
    """
    # Materialized as a list so it plots the same way on Python 2 and 3.
    no_estimators = list(range(20, 120, 10))
    misclassy_rate = []
    misclassy_rate_dev = []

    for no_estimator in no_estimators:
        boosting = build_boosting_model(x, y, no_estimators=no_estimator)
        # zero_one_loss is the fraction of misclassified samples.
        misclassy_rate.append(zero_one_loss(y, boosting.predict(x)))
        misclassy_rate_dev.append(
            zero_one_loss(y_dev, boosting.predict(x_dev)))

    plt.figure(2)
    plt.title("no estimators vs mis-classification rate")
    plt.xlabel("no of extimators")
    plt.ylabel("mis-classification rate")
    plt.plot(no_estimators, misclassy_rate, label='Train')
    plt.plot(no_estimators, misclassy_rate_dev, label='Dev')
    # Without a legend the 'Train'/'Dev' labels are never rendered and the
    # two curves cannot be told apart.
    plt.legend()

    plt.show()
    
def _print_report(title, true_y, predicted_y):
    """Print a title, the classification report and the error percentage."""
    # Single-argument print(...) calls behave the same on Python 2 and 3;
    # "%%" renders the literal percent sign after the number.
    print(title)
    print(classification_report(true_y, predicted_y))
    print("fraction of misclassfication = %0.2f %%"
          % (zero_one_loss(true_y, predicted_y) * 100))


# Script entry point: compare a single decision tree against the boosted
# stumps on the training and dev splits, then sweep the ensemble size.
if __name__ == "__main__":
    x, y = get_data()

    # Split into train (70%), then dev/test out of the held-out 30%.
    # The test split is not evaluated in this script.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Baseline: one fully grown decision tree.
    model = build_single_model(x_train, y_train)
    _print_report("\nsingle model accuracy on training data\n",
                  y_train, model.predict(x_train))

    # Ensemble: 85 boosted stumps.
    boosting = build_boosting_model(x_train, y_train, no_estimators=85)
    _print_report("\n boosting model accuracy on training data\n",
                  y_train, boosting.predict(x_train))
    view_model(boosting)

    # Same comparison on the dev split.
    _print_report("\n single model accuracy on dev data\n",
                  y_dev, model.predict(x_dev))
    _print_report("\n boosting model accuracy on dev data\n",
                  y_dev, boosting.predict(x_dev))

    number_estimators_vs_err_rate(x_train, y_train, x_dev, y_dev)

梯度提升法：

# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 14:24:07 2018

@author: Alvin AI
"""

from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt

def get_data():
    """Load the Boston housing data and return ``(features, target)``.

    NOTE(review): ``load_boston`` is no longer shipped by recent
    scikit-learn releases -- confirm the installed version provides it.
    """
    boston = load_boston()
    return boston['data'], boston['target']

def build_model(x, y, n_estimators=500):
    """Fit a ``GradientBoostingRegressor`` on ``(x, y)`` and return it.

    Args:
        x: training features.
        y: training targets.
        n_estimators: number of boosting stages (trees) to fit.

    Notes:
        * ``verbose=10``: a value greater than 1 prints progress for every
          tree as it is built.
        * ``subsample=0.7``: fraction of the training data used to fit
          each tree.
        * ``learning_rate=0.15``: controls the contribution of each tree.
    """
    regressor = GradientBoostingRegressor(
        n_estimators=n_estimators,
        verbose=10,
        subsample=0.7,
        learning_rate=0.15,
        max_depth=3,
        random_state=77,
    )
    regressor.fit(x, y)
    return regressor

#verbose显示的结果中的对应解释：
#train loss:包内样本的偏差（损失）
#OOB improve:包外样本，相比于上一次迭代，有多少改善

def view_model(model):
    """Print and plot the per-stage training scores, then list importances.

    Args:
        model: a fitted gradient boosting model exposing ``train_score_``,
            ``estimators_`` and ``feature_importances_``.

    The training-score plot is shown (blocking on ``plt.show()``) before
    the feature importances are printed.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("\n training scores")
    print("==================")
    for position, score in enumerate(model.train_score_, 1):
        print("\t estimator %d  score %0.3f" % (position, score))

    plt.cla()
    plt.figure(1)
    stage_numbers = range(1, model.estimators_.shape[0] + 1)
    plt.plot(stage_numbers, model.train_score_)
    plt.xlabel("model sequence")
    plt.ylabel("model score")
    plt.show()

    print("\n feature importance")
    print("=====================")
    for position, importance in enumerate(model.feature_importances_, 1):
        print("\t feature %d importance %0.3f" % (position, importance))

def model_worth(true_y, predicted_y):
    """Print the mean squared error between ``true_y`` and ``predicted_y``."""
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("\t mean squared error = %0.2f"
          % (mean_squared_error(true_y, predicted_y)))
           
if __name__ == "__main__":
    # Load the data.
    x, y = get_data()
    # Split into train (70%), then dev/test out of the held-out 30%.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Prepare degree-2, interaction-only polynomial features. The
    # transformer is fitted on the training split only and then reused
    # for the dev and test splits.
    poly_features = PolynomialFeatures(2, interaction_only=True)
    poly_features.fit(x_train)
    x_train_poly = poly_features.transform(x_train)
    x_dev_poly = poly_features.transform(x_dev)

    # Fit on the polynomial features and report the training error.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    model_poly = build_model(x_train_poly, y_train)
    predicted_y = model_poly.predict(x_train_poly)
    print("\n model performance in training set (polynomialfeatures)\n")
    model_worth(y_train, predicted_y)  # mean squared error
    view_model(model_poly)

    # Apply the model to the dev set. The original heading said
    # "training set" here, mislabelling the dev-set result.
    predicted_y = model_poly.predict(x_dev_poly)
    print("\n model performance in dev set (polynomialfeatures)\n")
    model_worth(y_dev, predicted_y)  # mean squared error

    # Apply the model to the test set.
    x_test_poly = poly_features.transform(x_test)
    predicted_y = model_poly.predict(x_test_poly)
    print("\n model performance in test set (polynomialfeatures)\n")
    model_worth(y_test, predicted_y)  # mean squared error