1. Learning curve (scanning learning_rate)
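The snippets below rely on a few aliases (XGBC, CVS) and a pre-split dataset that the original post does not show. A minimal setup that makes the rest of the code runnable might look like the sketch below; the breast-cancer dataset, the 70/30 split, and the variable names are assumptions, not from the original.

import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import train_test_split, cross_val_score as CVS, KFold, learning_curve
from sklearn.metrics import roc_curve
from sklearn.datasets import load_breast_cancer  # assumed dataset, for illustration only

# assumed data split; any binary-classification data with these variable names works
data = load_breast_cancer()
xtrain, xtest, ytrain, ytest = train_test_split(data.data, data.target, test_size=0.3, random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)  # same CV splitter used throughout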
# scan learning_rate values and record the mean cross-validated accuracy for each
axisx = np.arange(0.27, 0.37, 0.02)
rs = []
for i in axisx:
    clf = XGBC(n_estimators=340, learning_rate=i)
    rs.append(CVS(clf, xtrain, ytrain, cv=cv).mean())
print(axisx[rs.index(max(rs))], max(rs))  # best learning_rate and its score

plt.figure(figsize=(16, 8))
plt.plot(axisx, rs, label='XGB')
plt.legend()
plt.show()
2. Learning curve over the number of training samples
def plot_learning_curve(estimator, title, X, y, ax=None, ylim=None, cv=None, n_jobs=None):
    """Plot mean training and cross-validation scores against the number of training examples."""
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, shuffle=True, cv=cv, n_jobs=n_jobs)
    if ax is None:
        ax = plt.gca()  # draw on the current axes if none is supplied
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    # ax.grid()  # draw a grid; optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-', color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-', color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle=True, random_state=42)
plot_learning_curve(XGBC(), "XGB", xtrain, ytrain, ax=None, cv=cv)
plt.show()
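A common extension, not in the original code, is to shade the fold-to-fold standard deviation around each mean curve so the spread across CV folds is visible. A hedged sketch built from the same learning_curve output:

# sketch: same learning curve, with ±1 std bands across CV folds (an addition, not from the original)
train_sizes, train_scores, test_scores = learning_curve(XGBC(), xtrain, ytrain, shuffle=True, cv=cv)
fig, ax = plt.subplots(figsize=(16, 8))
for scores, color, name in [(train_scores, "r", "Training score"), (test_scores, "g", "Test score")]:
    mean, std = scores.mean(axis=1), scores.std(axis=1)
    ax.plot(train_sizes, mean, 'o-', color=color, label=name)
    ax.fill_between(train_sizes, mean - std, mean + std, color=color, alpha=0.15)
ax.set_xlabel("Training examples")
ax.set_ylabel("Score")
ax.legend(loc="best")
plt.show()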
3. Choosing the base (weak) learner
# compare the three built-in boosters on the held-out test set
for booster in ["gbtree", "gblinear", "dart"]:
    clf = XGBC(n_estimators=340, learning_rate=0.31, booster=booster).fit(xtrain, ytrain)
    print(booster)
    print(clf.score(xtest, ytest))
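A single train/test score can be noisy; one might also compare the boosters with the same cross-validation used earlier. A small sketch (not part of the original post; the variable name model is just illustrative):

# sketch: compare boosters by mean CV accuracy instead of a single test split
for booster in ["gbtree", "gblinear", "dart"]:
    model = XGBC(n_estimators=340, learning_rate=0.31, booster=booster)
    print(booster, CVS(model, xtrain, ytrain, cv=cv).mean())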
4. ROC curve
# ROC curve for the last fitted model from the loop above (dart booster)
fpr, tpr, thresholds = roc_curve(ytest, clf.predict_proba(xtest)[:, 1])
plt.plot(fpr, tpr, label='ROC')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.show()
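The ROC curve is usually summarized by the area under it (AUC); a short sketch using sklearn's roc_auc_score (this import and these lines are an addition):

from sklearn.metrics import roc_auc_score

# AUC condenses the ROC curve into one number (1.0 = perfect ranking, 0.5 = random)
auc = roc_auc_score(ytest, clf.predict_proba(xtest)[:, 1])
print("AUC:", auc)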
5. A general recipe for cross-validation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier as RFC

# Xtrain/Ytrain play the same role as xtrain/ytrain above: the training split at hand
cv = KFold(n_splits=5, shuffle=True, random_state=1412)
clf = RFC(n_estimators=200, random_state=1412)
acc = cross_val_score(clf, Xtrain, Ytrain, cv=cv)
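The point of this "general" recipe is that the same cv object works for any estimator; scores are typically reported as mean ± standard deviation. The reporting lines below, and the second call with the XGBoost model from earlier, are an added sketch:

# report the cross-validated accuracy as mean ± standard deviation
print("RFC accuracy: %.3f ± %.3f" % (acc.mean(), acc.std()))

# the same recipe works unchanged for any other estimator, e.g. the XGBoost model tuned above
acc_xgb = cross_val_score(XGBC(n_estimators=340, learning_rate=0.31), Xtrain, Ytrain, cv=cv)
print("XGB accuracy: %.3f ± %.3f" % (acc_xgb.mean(), acc_xgb.std()))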

This post walks through optimizing a machine-learning model: a learning curve over learning_rate to find the best value for XGBoost, a learning curve over the number of training samples to see how data volume affects performance, a comparison of the available base learners (gbtree, gblinear, dart), ROC-curve evaluation of the classifier, and finally a general cross-validation recipe used to assess the generalization ability of a random forest classifier.