当我们想在数据集上构建许多个模型,便可考虑使用集成的方法:
1. 挂袋法:并行进行,挂袋法集成中的每一个模型只使用训练集的一部分,它们的思路是减少对数据产生过度拟合,但前提是每个模型的差别不能太大,挂袋法对如线性回归之类的线性预测器无效。对于一些很稳定的模型,挂袋法的效果不明显,它适合那些对很小的改变也十分敏感的分类器,例如决策树,它很不稳定,未剪枝决策树就十分适合挂袋法。而KNN分类器则是一种很稳定的模型,不过我们可以使用随机子空间方法,为最近邻方法引入不稳定性。
2. 赋权重提升法:顺序进行,产生一个逐步复杂的模型序列,它按顺序基于前一个模型的错误训练新的模型,每次训练得到的模型被赋予一个权重,这个权重依据模型在给定数据上的效果而定。最终的预测值产生时,这些权重值就是每个特定模型对于最终输出结果的影响力的判据。整体来说就是给错误率低的分类器赋予更大的权重。
3. 梯度提升法:由于赋权重提升法是给错误分类的实例赋予更大的权重,使得下一个模型更可能选中这些错误分类的实例再次训练,而这也存在不足之处。梯度提升法则采用梯度而不是权重来鉴别缺陷,通过拟合残差(真实值y与预测值y'之差)来完善上一个模型的缺陷。
挂袋法+随机子空间法:
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 09 17:31:14 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
# Generate the data set
def get_data():
    """Create the synthetic classification data set used by this script.

    Builds 500 samples with 30 features via ``make_classification`` and
    prints the total / redundant / informative / repeated feature counts
    on one space-separated line.

    Returns:
        The ``(x, y)`` pair returned by ``make_classification``.
    """
    no_features = 30
    # Redundant features are linear combinations of the informative ones,
    # which introduces correlation between features.
    redundant_features = int(0.1 * no_features)
    # 60% of all features carry real information for the classification.
    informative_features = int(0.6 * no_features)
    # Repeated features are copies drawn at random from the informative and
    # redundant features.
    repeated_features = int(0.1 * no_features)
    # Single-argument print(): same output on Python 2 and Python 3.
    print("%d %d %d %d" % (no_features, redundant_features,
                           informative_features, repeated_features))
    # flip_y=0.03 randomly reassigns the class of 3% of the samples,
    # i.e. it injects a little label noise.
    x, y = make_classification(n_samples=500, n_features=no_features,
                               flip_y=0.03,
                               n_informative=informative_features,
                               n_redundant=redundant_features,
                               n_repeated=repeated_features,
                               random_state=7)
    return x, y
# Baseline: one plain KNN classifier
def build_single_model(x, y):
    """Fit a ``KNeighborsClassifier`` with default settings on ``(x, y)``.

    Returns:
        The fitted classifier.
    """
    knn = KNeighborsClassifier()
    knn.fit(x, y)
    return knn
# Bagging combined with the random subspace method
def build_bagging_model(x, y):
    """Fit a bagging ensemble of 100 KNN classifiers on ``(x, y)``.

    Settings passed to ``BaggingClassifier``:

    * ``n_estimators=100`` - number of KNN estimators in the ensemble.
    * ``max_samples=1.0`` with ``bootstrap=True`` - the sample-bootstrap
      size for each estimator; as a float this is a fraction of the
      training set (so 1.0 means "as many as there are training rows").
      NOTE(review): each bag presumably needs at least ``n_neighbors``
      rows for KNN to work - confirm if max_samples is lowered.
    * ``max_features=0.7`` with ``bootstrap_features=True`` - each
      estimator only sees a bootstrapped 70% of the attributes; this is
      the random subspace part.

    Bootstrapping here means: draw m different data sets from the training
    data and build one model on each of them.

    Returns:
        The fitted ``BaggingClassifier``.
    """
    base_learner = KNeighborsClassifier()
    ensemble = BaggingClassifier(
        base_learner,
        n_estimators=100,
        random_state=9,
        max_samples=1.0,
        max_features=0.7,
        bootstrap=True,
        bootstrap_features=True
    )
    ensemble.fit(x, y)
    return ensemble
def view_model(model):
    """Print the attribute indices sampled by the first 10 estimators.

    Args:
        model: object exposing ``estimators_features_``, a sliceable
            sequence with one entry (the sampled feature indices) per
            estimator.
    """
    # Single-argument print(): same output on Python 2 and Python 3.
    print("\n sampled attributes in top 10 estimators\n")
    for number, feature_set in enumerate(model.estimators_features_[0:10], 1):
        # %s applies str(), exactly as the print statement did.
        print("estimator %d %s" % (number, feature_set))
# Entry point: run every step defined above
if __name__ == "__main__":
    x, y = get_data()

    # First split: 70% train / 30% held out.  The held-out part is split
    # again 70/30 into a dev set and a final test set.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Single KNN model, evaluated on the training data.
    # print() is called with one argument so the output is identical on
    # Python 2 and Python 3.
    model = build_single_model(x_train, y_train)
    predicted_y = model.predict(x_train)
    print("\n single model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))

    # Bagging ensemble, evaluated on the training data.
    bagging = build_bagging_model(x_train, y_train)
    predicted_y = bagging.predict(x_train)
    print("\n bagging model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))
    view_model(bagging)

    # Both models on the dev set.
    predicted_y = model.predict(x_dev)
    print("\n single model accuracy on dev data\n")
    print(classification_report(y_dev, predicted_y))

    print("\n bagging model accuracy on dev data\n")
    predicted_y = bagging.predict(x_dev)
    print(classification_report(y_dev, predicted_y))
赋权重提升法:
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 09:13:56 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report,zero_one_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
import numpy as np
import matplotlib.pyplot as plt
#import itertools
# Generate the data set
def get_data():
    """Create the synthetic classification data set used by this script.

    Builds 500 samples with 30 features via ``make_classification`` and
    prints the total / redundant / informative / repeated feature counts
    on one space-separated line.

    Returns:
        The ``(x, y)`` pair returned by ``make_classification``.
    """
    no_features = 30
    redundant_features = int(0.1 * no_features)    # redundant features
    informative_features = int(0.6 * no_features)  # informative features
    repeated_features = int(0.1 * no_features)     # repeated features
    # Single-argument print(): same output on Python 2 and Python 3.
    print("%d %d %d %d" % (no_features, redundant_features,
                           informative_features, repeated_features))
    # Synthetic classification data; n_classes is not passed, so the
    # library default applies.  flip_y=0.03 adds 3% label noise.
    x, y = make_classification(n_samples=500, n_features=no_features,
                               flip_y=0.03,
                               n_informative=informative_features,
                               n_redundant=redundant_features,
                               n_repeated=repeated_features,
                               random_state=7)
    return x, y
# Baseline: one fully grown decision tree
def build_single_model(x, y):
    """Fit a ``DecisionTreeClassifier`` with default settings on ``(x, y)``.

    Returns:
        The fitted tree.
    """
    tree = DecisionTreeClassifier()
    tree.fit(x, y)
    return tree
# AdaBoost on decision stumps, using the SAMME variant
def build_boosting_model(x, y, no_estimators=20):
    """Fit an AdaBoost ensemble of decision stumps on ``(x, y)``.

    Args:
        x: training features.
        y: training labels.
        no_estimators: number of stumps to build (``n_estimators``).

    Returns:
        The fitted ``AdaBoostClassifier``.

    Notes:
        * ``max_depth=1`` - the trees are not grown out; each one is a
          stump with a single split node and two leaves.
        * ``min_samples_leaf=1`` - minimum number of samples required at a
          leaf (1 is also the default).
        * SAMME = Stagewise Additive Modeling using a Multi-class
          Exponential loss function.  It extends AdaBoost to several
          classes by putting extra weight on misclassified records; with
          K = 2 classes it reduces to the original AdaBoost.
    """
    stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
    booster = AdaBoostClassifier(
        stump,
        random_state=9,
        n_estimators=no_estimators,
        algorithm="SAMME"
    )
    booster.fit(x, y)
    return booster
# Inspect the fitted model
def view_model(model):
    """Print and plot each estimator's weight against its error.

    Args:
        model: fitted booster exposing ``estimator_weights_`` and
            ``estimator_errors_``.

    The plot is drawn on figure 1 but not shown here; a later
    ``plt.show()`` call is needed to display it.
    """
    # Single-argument print(): same output on Python 2 and Python 3.
    print("\n estimator weights and error\n")
    pairs = zip(model.estimator_weights_, model.estimator_errors_)
    for number, (weight, error) in enumerate(pairs, 1):
        print("estimator %d weight = %0.4f error = %0.4f"
              % (number, weight, error))

    plt.figure(1)
    plt.title("model weight vs error")
    plt.xlabel("weight")
    plt.ylabel("error")
    plt.plot(model.estimator_weights_, model.estimator_errors_)
def number_estimators_vs_err_rate(x, y, x_dev, y_dev):
    """Plot misclassification rate against the number of estimators.

    Trains one boosting model for each ensemble size in 20, 30, ..., 110
    and records the zero-one loss on both the training data and the dev
    data, then plots the two curves on figure 2 and shows the figure.

    Args:
        x: training features.
        y: training labels.
        x_dev: dev-set features.
        y_dev: dev-set labels.
    """
    no_estimators = range(20, 120, 10)
    misclassy_rate = []
    misclassy_rate_dev = []

    for no_estimator in no_estimators:
        boosting = build_boosting_model(x, y, no_estimators=no_estimator)
        predicted_y = boosting.predict(x)
        predicted_y_dev = boosting.predict(x_dev)
        # zero_one_loss is the fraction of misclassified samples.
        misclassy_rate.append(zero_one_loss(y, predicted_y))
        misclassy_rate_dev.append(zero_one_loss(y_dev, predicted_y_dev))

    plt.figure(2)
    plt.title("no estimators vs mis-classification rate")
    plt.xlabel("no of extimators")
    plt.ylabel("mis-classification rate")
    # The ensemble size with the lowest error rate is the one to pick for
    # the final boosting model.
    plt.plot(no_estimators, misclassy_rate, label='Train')
    plt.plot(no_estimators, misclassy_rate_dev, label='Dev')
    # Without a legend the 'Train'/'Dev' labels are never rendered and the
    # two curves cannot be told apart.
    plt.legend()
    plt.show()
if __name__ == "__main__":
    x, y = get_data()

    # Split into train / dev / test: 70% train, and the remaining 30% is
    # split again 70/30 into dev and test.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Single decision tree on the training data.
    # Every print() takes one pre-formatted argument so the output is
    # identical on Python 2 and Python 3; "%%" renders the literal "%"
    # that used to be a second print item.
    model = build_single_model(x_train, y_train)
    predicted_y = model.predict(x_train)
    print("\nsingle model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))
    print("fraction of misclassfication = %0.2f %%"
          % (zero_one_loss(y_train, predicted_y) * 100))

    # Boosted ensemble on the training data.
    boosting = build_boosting_model(x_train, y_train, no_estimators=85)
    predicted_y = boosting.predict(x_train)
    print("\n boosting model accuracy on training data\n")
    print(classification_report(y_train, predicted_y))
    print("fraction of misclassfication = %0.2f %%"
          % (zero_one_loss(y_train, predicted_y) * 100))

    view_model(boosting)

    # Both models on the dev set.
    predicted_y = model.predict(x_dev)
    print("\n single model accuracy on dev data\n")
    print(classification_report(y_dev, predicted_y))
    print("fraction of misclassfication = %0.2f %%"
          % (zero_one_loss(y_dev, predicted_y) * 100))

    print("\n boosting model accuracy on dev data\n")
    predicted_y = boosting.predict(x_dev)
    print(classification_report(y_dev, predicted_y))
    print("fraction of misclassfication = %0.2f %%"
          % (zero_one_loss(y_dev, predicted_y) * 100))

    # Error rate as a function of ensemble size (also shows the figures).
    number_estimators_vs_err_rate(x_train, y_train, x_dev, y_dev)
梯度提升法:
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 10 14:24:07 2018
@author: Alvin AI
"""
from sklearn.datasets import load_boston
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
def get_data():
    """Load the Boston housing data and return ``(features, target)``.

    NOTE(review): ``load_boston`` may be unavailable in recent
    scikit-learn releases - confirm against the installed version.
    """
    boston = load_boston()
    return boston['data'], boston['target']
def build_model(x, y, n_estimators=500):
    """Fit a ``GradientBoostingRegressor`` on ``(x, y)``.

    Args:
        x: training features.
        y: training targets.
        n_estimators: number of boosting stages (trees) to build.

    Returns:
        The fitted regressor.

    Notes:
        * ``verbose`` greater than 1 prints progress as every tree is
          built.  In that output, "Train Loss" is the loss on the in-bag
          samples and "OOB Improve" is how much the out-of-bag samples
          improved relative to the previous iteration.
        * ``subsample`` is the fraction of the training data each tree is
          fitted on.
        * ``learning_rate`` scales the contribution of each tree.
    """
    regressor = GradientBoostingRegressor(
        n_estimators=n_estimators,
        verbose=10,
        subsample=0.7,
        learning_rate=0.15,
        max_depth=3,
        random_state=77
    )
    regressor.fit(x, y)
    return regressor
def view_model(model):
    """Print and plot the training scores, then print feature importances.

    Args:
        model: fitted regressor exposing ``train_score_``, ``estimators_``
            (an array whose first dimension is the number of stages) and
            ``feature_importances_``.

    The plot is shown with ``plt.show()`` before the feature importances
    are printed.
    """
    # Every print() takes one argument so the output is identical on
    # Python 2 and Python 3.
    print("\n training scores")
    print("==================")
    for number, score in enumerate(model.train_score_, 1):
        print("\t estimator %d score %0.3f" % (number, score))

    plt.cla()
    plt.figure(1)
    # x axis: 1-based index of each boosting stage.
    plt.plot(range(1, model.estimators_.shape[0] + 1), model.train_score_)
    plt.xlabel("model sequence")
    plt.ylabel("model score")
    plt.show()

    print("\n feature importance")
    print("=====================")
    for number, score in enumerate(model.feature_importances_, 1):
        print("\t feature %d importance %0.3f" % (number, score))
def model_worth(true_y, predicted_y):
    """Print the mean squared error between ``true_y`` and ``predicted_y``."""
    # Single-argument print(): same output on Python 2 and Python 3.
    print("\t mean squared error = %0.2f"
          % (mean_squared_error(true_y, predicted_y)))
if __name__ == "__main__":
    # Load the data
    x, y = get_data()

    # Split: 70% train, and the remaining 30% is split again 70/30 into a
    # dev set and a final test set.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)

    # Build degree-2, interaction-only polynomial features.  The
    # transformer is fitted on the training data only and then reused for
    # the dev and test sets.
    poly_features = PolynomialFeatures(2, interaction_only=True)
    poly_features.fit(x_train)
    x_train_poly = poly_features.transform(x_train)
    x_dev_poly = poly_features.transform(x_dev)

    # Fit the model on the polynomial features.
    # print() is called with one argument so the output is identical on
    # Python 2 and Python 3.
    model_poly = build_model(x_train_poly, y_train)
    predicted_y = model_poly.predict(x_train_poly)
    print("\n model performance in training set (polynomialfeatures)\n")
    model_worth(y_train, predicted_y)  # mean squared error
    view_model(model_poly)

    # Apply the model to the dev set.  The heading used to say
    # "training set", which mislabelled the dev-set result.
    predicted_y = model_poly.predict(x_dev_poly)
    print("\n model performance in dev set (polynomialfeatures)\n")
    model_worth(y_dev, predicted_y)  # mean squared error

    # Apply the model to the test set.
    x_test_poly = poly_features.transform(x_test)
    predicted_y = model_poly.predict(x_test_poly)
    print("\n model performance in test set (polynomialfeatures)\n")
    model_worth(y_test, predicted_y)  # mean squared error