关注微信公共号:小程在线
关注CSDN博客:程志伟的博客
梯度提升树
1 提升集成算法:重要参数n_estimators
1. 导入需要的库,模块以及数据
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
X = data.data
y = data.target
X.shape
Out[6]: (506, 13)
2. 建模,查看其他接口和属性
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain)
reg.predict(Xtest) #传统接口predict
[20:19:46] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[8]:
array([ 8.165384 , 21.919355 , 29.843645 , 11.874415 , 8.833874 ,
20.698246 , 15.456877 , 15.544203 , 15.273806 , 13.444421 ,
22.130966 , 35.072395 , 21.383947 , 27.477697 , 20.449163 ,
......
20.396807 , 13.163115 , 40.93572 , 25.202625 , 21.823097 ,
14.690604 , 26.191984 ], dtype=float32)
#使用R²来作为模型的评估指标
reg.score(Xtest,Ytest)
Out[9]: 0.9197580267581366
MSE(Ytest,reg.predict(Xtest))
Out[10]: 7.466827353555599
#树模型的优势之一:能够查看模型的重要性分数,可以使用嵌入法进行特征选择
reg.feature_importances_
Out[11]:
array([0.02474326, 0.00233919, 0.00895177, 0.01757721, 0.04847462,
0.25909728, 0.0120366 , 0.0429231 , 0.01358514, 0.02558688,
0.04455473, 0.01763431, 0.48249587], dtype=float32)
3. 交叉验证,与线性回归&随机森林回归进行对比
CVS(reg,Xtrain,Ytrain,cv=5).mean()
Out[12]: 0.8017863029875325
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
Out[13]: -16.041115480238048
#来查看一下sklearn中所有的模型评估指标
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
Out[14]:
['accuracy',
'adjusted_mutual_info_score',
'adjusted_rand_score',
'average_precision',
'balanced_accuracy',
'brier_score_loss',
'completeness_score',
'explained_variance',
'f1',
'f1_macro',
'f1_micro',
'f1_samples',
'f1_weighted',
'fowlkes_mallows_score',
'homogeneity_score',
'jaccard',
'jaccard_macro',
'jaccard_micro',
'jaccard_samples',
'jaccard_weighted',
'max_error',
'mutual_info_score',
'neg_log_loss',
'neg_mean_absolute_error',
'neg_mean_squared_error',
'neg_mean_squared_log_error',
'neg_median_absolute_error',
'normalized_mutual_info_score',
'precision',
'precision_macro',
'precision_micro',
'precision_samples',
'precision_weighted',
'r2',
'recall',
'recall_macro',
'recall_micro',
'recall_samples',
'recall_weighted',
'roc_auc',
'v_measure_score']
#使用随机森林和线性回归进行一个对比
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()
Out[15]: 0.7934188014881
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
Out[16]: -16.26653824285714
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()
Out[17]: 0.6835070597278092
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
Out[18]: -25.349507493648364
#开启参数slient=Fase在数据巨大,预料到算法运行会非常缓慢的时候可以使用这个参数来监控模型的训练进度
reg = XGBR(n_estimators=10,silent=True)
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
Out[19]: -92.67865836936579
4. 定义绘制以训练样本数为横坐标的学习曲线的函数
def plot_learning_curve(estimator,title, X, y,
ax=None, #选择子图
ylim=None, #设置纵坐标的取值范围
cv=None, #交叉验证
n_jobs=None #设定索要使用的线程
):
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
,shuffle=True
,cv=cv
# ,random_state=420
,n_jobs=n_jobs)
if ax == None:
ax = plt.gca()
else:
ax = plt.figure()
ax.set_title(title)
if ylim is not None:
ax.set_ylim(*ylim)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.grid() #绘制网格,不是必须
ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color="r",label="Training score")
ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color="g",label="Test score")
ax.legend(loc="best")
return ax
5. 使用学习曲线观察XGB在波士顿数据集上的潜力
cv = KFold(n_splits=5, shuffle = True, random_state=42)
plot_learning_curve(XGBR(n_estimators=100,random_state=420) ,"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()
训练集上的表现展示了模型的学习能力,测试集上的表现展示了模型的泛化能力,通常模型在测试集上的表现不太可
能超过训练集,因此我们希望我们的测试集的学习曲线能够努力逼近我们的训练集的学习曲线
6. 使用参数学习曲线观察n_estimators对模型的影响
axisx = range(10,1010,50)
rs = []
for i in axisx:
reg = XGBR(n_estimators=i,random_state=420)
rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915
数据集一共有506条数据,但是需要在660颗树的时候达到了最大值,R²的值时0.804,出现了很奇怪的现象。
7. 进化的学习曲线:方差与泛化误差
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBR(n_estimators=i,silent=True,random_state=420)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
#记录1-偏差
rs.append(cvresult.mean())
#记录方差
var.append(cvresult.var())
#计算泛化误差的可控部分
ge.append((1 - cvresult.mean())**2+cvresult.var())
#打印R2最高所对应的参数取值,并打印这个参数下的方差
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#打印方差最低时对应的参数取值,并打印这个参数下的R2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#打印泛化误差可控部分的参数取值,并打印这个参数下的R2,方差以及泛化误差的可控部分
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411

偏差最低的是650颗树,偏差0.8047,方差是0.01
方差最低的是50颗树,偏差0.7857,方差是0.0090
泛化最低是0.048,150颗树的时候
8. 细化学习曲线,找出最佳n_estimators
axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBR(n_estimators=i,random_state=420)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
rs.append(cvresult.mean())
var.append(cvresult.var())
ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#添加方差线
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

从上面的两幅图中可以看出在180的点上,泛化误差最小
9. 检测模型效果
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
[21:06:57] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
0.9197580267581366
0.05806231498718262
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
[21:07:00] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
0.9208745746309475
0.34624648094177246
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
[21:07:05] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
0.9231068620728082
0.09406781196594238
从上面的对此可以看出在n_estimators=180时,模型的效果最好。
############ 2 有放回随机抽样:重要参数subsample ###############
正常来说样本量越大,模型才不容易过拟合
使用波士顿房价数据集,来看学习曲线
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
reg = XGBR(n_estimators=180,subsample=i,silent=True,random_state=420)
rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.5789473684210527 0.8318837280265565
#细化学习曲线
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBR(n_estimators=180,subsample=i,silent=True,random_state=420)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
rs.append(cvresult.mean())
var.append(cvresult.var())
ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.6 0.8314601729351381 0.009260717014609041
0.49999999999999994 0.8259911145591625 0.0060593719507855415
0.65 0.8313588174285554 0.006081711564499892 0.03452156002359519
根据上面的最小值0.65,再次细化范围
axisx = np.linspace(0.6,0.7,10)
rs = []
var = []
ge = []
for i in axisx:
reg = XGBR(n_estimators=180,subsample=i,random_state=420,silent=True)
cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
rs.append(cvresult.mean())
var.append(cvresult.var())
ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.6555555555555556 0.8413109225668833 0.004993581748467868
0.6555555555555556 0.8413109225668833 0.004993581748467868
0.6555555555555556 0.8413109225668833 0.004993581748467868 0.030175805045041564

#看看泛化误差的情况如何
reg = XGBR(n_estimators=180
,subsample=0.6555555555555556
,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
[21:47:56] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
Out[37]: 0.917734803309711
MSE(Ytest,reg.predict(Xtest))
Out[38]: 7.6550961557387565
################## 3 迭代决策树:重要参数eta #############
#首先我们先来定义一个评分函数,这个评分函数能够帮助我们直接打印Xtrain上的交叉验证结果
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
score = []
for i in range(len(scoring)):
if show:
print("{}:{:.2f}".format(scoring[i],CVS(reg, Xtrain, Ytrain,cv=cv, scoring=scoring[i]).mean()))
score.append(CVS(reg, Xtrain, Ytrain,cv=cv, scoring=scoring[i]).mean())
return score
#运行一下函数来看看效果
reg = XGBR(n_estimators=180,random_state=420,silent=True)
regassess(reg, Xtrain, Ytrain, cv, scoring=["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48
Out[42]: [0.8038787848970184, -13.482301822063182]
#观察一下eta如何影响我们的模型
from time import time
import datetime
for i in [0,0.2,0.5, 1]:
time0 = time()
reg = XGBR(n_estimators=150, random_state=420, learning_rate=i,silent=True)
print("learning_rate = {}".format(i))
regassess(reg, Xtrain, Ytrain, cv, scoring=["r2","neg_mean_squared_error"])
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:307949
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.38
00:01:390988
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.22
00:01:379980
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:389006
从结果上看,步长不为0时,R²的效果增加;随着步长的变大,运行时间增加