Table of Contents
Step 1: with learning rate 0.1, coarse-tune the number of base learners (n_estimators)
Step 2: tune the tree parameters: max_depth & min_child_weight
Step 3: row subsampling ratio (subsample)
Step 4: column subsampling ratio (colsample_bytree)
Step 5: L2 regularization coefficient (reg_lambda)
Step 6: L1 regularization parameter (reg_alpha)
Finally: with the best tree parameters, re-tune the learning rate and the number of base learners
XGBoost itself is relatively slow to train, so we feed in the raw Otto features directly.
The main XGBoost hyperparameters are:
- the number of trees n_estimators and the learning rate learning_rate
- the maximum tree depth max_depth
- the minimum sum of instance weights in a leaf (roughly the minimum samples per leaf): min_child_weight
- the column subsampling ratio per tree: colsample_bytree
- the row subsampling ratio per tree: subsample
- the regularization parameters: reg_alpha (L1) and reg_lambda (L2)
For n_estimators, XGBoost has a built-in cross-validation routine (xgb.cv) with early stopping, which is fast.
The other parameters are tuned with GridSearchCV.
*** Model Training ***
1. Imports
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
2. Load the data
dpath='./data/'
train=pd.read_csv(dpath+"Otto_train.csv")
# XGBoost only accepts numeric labels
y_train = train['target']                    # labels look like "Class_1" ... "Class_9"
y_train = y_train.map(lambda s: s[6:])       # strip the "Class_" prefix
y_train = y_train.map(lambda s: int(s) - 1)  # map to integers 0-8
X_train=train.drop(["id","target"],axis=1)
# Save the feature names for later use (visualization)
feat_names=X_train.columns
# Most sklearn estimators support sparse input, which makes training much faster.
# To check whether an estimator supports sparse data, look at its fit signature:
#   X: (array-like, sparse matrix)
# You can use timeit to compare training time on dense vs. sparse data yourself.
from scipy.sparse import csr_matrix
X_train=csr_matrix(X_train)
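As a quick sanity check of the dense-vs-sparse claim above, here is a minimal timeit sketch. It is hypothetical: it uses sklearn's LogisticRegression purely to illustrate the gap; any estimator that accepts sparse input would do.

# Hypothetical timing comparison: dense vs. sparse input (not part of the tuning pipeline)
import timeit
from sklearn.linear_model import LogisticRegression

X_dense = X_train.toarray()  # densify the csr_matrix for comparison
clf = LogisticRegression(max_iter=200)
t_sparse = timeit.timeit(lambda: clf.fit(X_train, y_train), number=1)
t_dense = timeit.timeit(lambda: clf.fit(X_dense, y_train), number=1)
print('sparse fit: %.2fs, dense fit: %.2fs' % (t_sparse, t_dense))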
Use the same cross-validation folds when tuning the different parameters:
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
Step 1: with learning rate 0.1, coarse-tune the number of base learners (n_estimators)
MAX_ROUNDS = 10000  # maximum number of boosting rounds
# xgboost's built-in cross-validation (xgb.cv) with early stopping can scan the
# (continuous) n_estimators quickly, whereas GridSearchCV can only try a finite set of values.
def get_n_estimators(params, X_train, y_train, early_stopping_rounds=10):
    xgb_params = params.copy()
    # call the native xgboost API rather than the sklearn wrapper
    xgb_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(xgb_params, xgb_train, num_boost_round=MAX_ROUNDS, nfold=3,
                      metrics='mlogloss', early_stopping_rounds=early_stopping_rounds,
                      seed=3)
    cvresult.to_csv('1_estimators.csv', index_label='n_estimators')
    # the best n_estimators is the number of rounds kept by early stopping
    n_estimators = cvresult.shape[0]
    print('best n_estimators:', n_estimators)
    print('best cv score:', cvresult['test-mlogloss-mean'][n_estimators - 1])
    return n_estimators
params = {'learning_rate': 0.1,
          'min_child_weight': 1,
          'max_depth': 5,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'objective': 'multi:softprob',
          'num_class': 9,
          'nthread': 4  # the native xgb.cv API uses nthread rather than n_jobs
          }
n_estimators_1=get_n_estimators(params,X_train,y_train)
Step 2: tune the tree parameters: max_depth & min_child_weight
These two parameters are best tuned together, because max_depth and min_child_weight both directly control the complexity of the tree model.
If compute is limited, you can also tune one at a time, coordinate-descent style: tune one, fix it, then tune the other, as sketched below.
For classification tasks with imbalanced classes, it is best to tune min_child_weight first,
so that a large max_depth does not overfit the minority classes.
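A sketch of that coordinate-descent variant, under the same setup as the grid search below (the parameter ranges here are illustrative, not tuned):

# Illustrative one-parameter-at-a-time tuning (coordinate-descent style)
base = XGBClassifier(learning_rate=0.1, n_estimators=636, subsample=0.7,
                     colsample_bytree=0.7, objective='multi:softprob', nthread=4)
# 1) tune min_child_weight first (safer for imbalanced classes)
gs1 = GridSearchCV(base, {'min_child_weight': range(1, 6)},
                   scoring='neg_log_loss', cv=kfold, n_jobs=4, refit=False)
gs1.fit(X_train, y_train)
# 2) fix it, then tune max_depth
base.set_params(min_child_weight=gs1.best_params_['min_child_weight'])
gs2 = GridSearchCV(base, {'max_depth': range(3, 11)},
                   scoring='neg_log_loss', cv=kfold, n_jobs=4, refit=False)
gs2.fit(X_train, y_train)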
Coarse tuning, with a step size of 2
Set the search ranges for the parameters:
# max_depth: 3-10 is recommended; heuristic: min_child_weight ≈ 1/sqrt(ratio of the rarest class) ≈ 5.5
max_depth=range(5,10,2)
min_child_weight=range(1,6,2)
tuned_params=dict(max_depth=max_depth,min_child_weight=min_child_weight)
Set the other parameter values:
Use the optimal n_estimators found in the first round (636); the remaining parameters keep their previous values.
params = {'learning_rate': 0.1,
          'n_estimators': 636,
          # 'min_child_weight': 1,
          # 'max_depth': 5,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'objective': 'multi:softprob',
          # 'num_class': 9,
          # 'n_jobs': 4,
          'nthread': 4
          }
xgb_g=XGBClassifier(silent=False,**params)
Cross-validate with GridSearchCV.
Be patient (even with about half as many features, a single XGBoost fit takes roughly twice as long as LightGBM).
When evaluating model performance with cross-validation, the scoring parameter defines the metric. GridSearchCV assumes higher scores are better,
so loss functions used as metrics must be negated.
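A tiny illustration of that sign convention (the values here are made up):

# 'neg_log_loss' is simply the negated log loss, so that "higher is better" holds
from sklearn.metrics import log_loss
y_true = [0, 1, 1]
y_proba = [[0.9, 0.1], [0.2, 0.8], [0.3, 0.7]]
print(log_loss(y_true, y_proba))    # ~0.228, lower is better
print(-log_loss(y_true, y_proba))   # what the 'neg_log_loss' scorer reports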
grid_search = GridSearchCV(xgb_g, param_grid=tuned_params, scoring='neg_log_loss',
                           n_jobs=4, cv=kfold, verbose=5, refit=False)
grid_search.fit(X_train,y_train)
Visualize the cross-validation results:
print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('maxdepth_min_child_weights_1.csv')
# plot results
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
for i, value in enumerate(max_depth):
    plt.plot(min_child_weight, -test_scores[i], label='test_max_depth:' + str(value))
plt.legend()
plt.xlabel('min_child_weight')
plt.ylabel('Log Loss')
plt.savefig('max_depth_and_min_child_weight_1.png')
plt.show()
Fine tuning, with a step size of 1
# step = 1: fine-tune around the best coarse values
max_depth=range(4,7,1)
min_child_weight=range(2,5,1)
tuned_params=dict(max_depth=max_depth,min_child_weight=min_child_weight)
params = {'learning_rate': 0.1,
          'n_estimators': 636,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'objective': 'multi:softprob',
          'nthread': 4
          }
xgb_g=XGBClassifier(silent=False,**params)
grid_search=GridSearchCV(xgb_g,param_grid=tuned_params,scoring='neg_log_loss',
n_jobs=4,cv=kfold,refit=False)
grid_search.fit(X_train,y_train)
print("Best:%f using %s"%(grid_search.best_score_,grid_search.best_params_)
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('maxdepth_min_child_weights_2.csv')
# plot results
test_scores = np.array(test_means).reshape(len(max_depth), len(min_child_weight))
for i, value in enumerate(max_depth):
    plt.plot(min_child_weight, -test_scores[i], label='test_max_depth:' + str(value))
plt.legend()
plt.xlabel('min_child_weight')
plt.ylabel('Log Loss')
plt.savefig('max_depth_and_min_child_weight_2.png')
plt.show()
Step 3: row subsampling ratio (subsample)
subsample_s = [i / 10.0 for i in range(1, 10)]
tuned_params = dict(subsample=subsample_s)
params={'learning_rate':0.1,
'n_estimators':636,
'min_child_weight':3,
'max_depth':5,
'colsample_bytree':0.7,
'objective':'multi:softprob',
'nthread':4
}
xgb_g=XGBClassifier(silent=False,**params)
grid_search=GridSearchCV(xgb_g,param_grid=tuned_params,scoring='neg_log_loss',
n_jobs=4,cv=kfold,refit=False)
grid_search.fit(X_train,y_train)
print("Best:%f using %s"% (grid_search.best_score_,grid_search.best_params_))
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('subsample.csv')
plt.plot(subsample_s, -test_means)
plt.legend()
plt.xlabel('subsample')
plt.ylabel('Log Loss')
plt.savefig('subsample.png')
Output:
Step 4: column subsampling ratio (colsample_bytree)
colsample_bytree_s = [i / 10.0 for i in range(5, 10)]
tuned_params = dict(colsample_bytree=colsample_bytree_s)
params={'learning_rate':0.1,
'n_estimators':636,
'min_child_weight':3,
'max_depth':5,
'subsample':0.7,
'objective':'multi:softprob',
'nthread':4
}
xgb_g=XGBClassifier(silent=False,**params)
grid_search=GridSearchCV(xgb_g,param_grid=tuned_params,scoring='neg_log_loss',
n_jobs=4,cv=kfold,refit=False)
grid_search.fit(X_train,y_train)
print("Best:%f using %s"% (grid_search.best_score_,grid_search.best_params_))
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('colsample_bytree.csv')
plt.plot(colsample_bytree_s, -test_means)
plt.legend()
plt.xlabel('colsample_bytree')
plt.ylabel('Log Loss')
plt.savefig('colsample_bytree.png')
Step 5: L2 regularization coefficient reg_lambda
reg_lambda: L2 regularization, default 1
reg_lambda_s=range(0,6,1)
tuned_params=dict(reg_lambda=reg_lambda_s)
params={'learning_rate':0.1,
'n_estimators':636,
'min_child_weight':3,
'max_depth':5,
'subsample':0.7,
'colsample_bytree':0.7,
'objective':'multi:softprob',
'nthread':4
}
xgb_g=XGBClassifier(**params)
grid_search = GridSearchCV(xgb_g, param_grid=tuned_params, n_jobs=4,
                           scoring='neg_log_loss', cv=kfold, refit=False)
grid_search.fit(X_train,y_train)
# visualize the results
print("Best:%f using %s"% (grid_search.best_score_,grid_search.best_params_))
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('reg_lambda.csv')
plt.plot(reg_lambda_s, -test_means)
plt.legend()
plt.xlabel('reg_lambda')
plt.ylabel('Log Loss')
plt.savefig('reg_lambda.png')
Output:
Step 6: L1 regularization parameter reg_alpha
reg_alpha: L1 regularization, default 0
reg_alpha_s=range(1,5,1)
tuned_params=dict(reg_alpha=reg_alpha_s)
params={'learning_rate':0.1,
'n_estimators':636,
'min_child_weight':3,
'max_depth':5,
'subsample':0.7,
'colsample_bytree':0.7,
'reg_lambda':2,
'objective':'multi:softprob',
'nthread':4
}
xgb_g=XGBClassifier(**params)
grid_search = GridSearchCV(xgb_g, param_grid=tuned_params, scoring='neg_log_loss',
                           n_jobs=4, cv=kfold, refit=False)
grid_search.fit(X_train,y_train)
# visualize the results
print("Best:%f using %s"% (grid_search.best_score_,grid_search.best_params_))
test_means=grid_search.cv_results_['mean_test_score']
pd.DataFrame(grid_search.cv_results_).to_csv('reg_alpha.csv')
plt.plot(reg_alpha_s, -test_means)
plt.legend()
plt.xlabel('reg_alpha')
plt.ylabel('Log Loss')
plt.savefig('reg_alpha.png')
Finally, with the best tree parameters fixed, tune the learning rate and the number of base learners again.
params={'num_class':9,
'learning_rate':0.05,
'min_child_weight':3,
'max_depth':5,
'subsample':0.7,
'colsample_bytree':0.7,
'reg_lambda':2,
'reg_alpha':1,
'objective':'multi:softprob',
'nthread':4
}
n_estimators_2=get_n_estimators(params,X_train,y_train)
print(n_estimators_2)
Save the model for later testing.
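Note that with refit=False, GridSearchCV never fits a final estimator, so the model must be refit with the chosen hyperparameters before it is saved. A minimal sketch, assuming the values found in the steps above:

# Refit the final model with the selected hyperparameters before saving
xgb_g = XGBClassifier(n_estimators=n_estimators_2, learning_rate=0.05,
                      min_child_weight=3, max_depth=5, subsample=0.7,
                      colsample_bytree=0.7, reg_lambda=2, reg_alpha=1,
                      objective='multi:softprob', nthread=4)
xgb_g.fit(X_train, y_train)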
import pickle
pickle.dump(xgb_g, open("Otto_XGBoost_org.pkl", 'wb'))
# feature importances
df = pd.DataFrame({"columns": list(feat_names),
                   "importance": list(xgb_g.feature_importances_)})
df = df.sort_values(by=['importance'], ascending=False)
print(df)
Bar chart:
plt.bar(range(len(xgb_g.feature_importances_)), xgb_g.feature_importances_)
plt.show()
*** Model Testing ***
Prepare the data:
dpath='./data/'
test = pd.read_csv(dpath + "Otto_test.csv")  # the test set, not the training set
test_id=test['id']
X_test=test.drop(['id'],axis=1)
feature_names=X_test.columns
from scipy.sparse import csr_matrix
X_test=csr_matrix(X_test)
# load the trained model
model = pickle.load(open("Otto_XGBoost_org.pkl", 'rb'))
# output the predicted probability of each class
y_test_pred=model.predict_proba(X_test)
print(y_test_pred.shape)
Generate the submission file:
out_df = pd.DataFrame(y_test_pred)
columns = np.empty(9, dtype=object)
for i in range(9):
    columns[i] = "Class_" + str(i + 1)
out_df.columns = columns
out_df = pd.concat([test_id, out_df], axis=1)
out_df.to_csv("XGBoost_org.csv", index=False)
The resulting ranking on the Kaggle leaderboard: