Cross-validation
# XGBoost parameter tuning, based on a department sharing session at my internship company
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score
breast = load_breast_cancer()
X = breast.data
y = breast.target
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=2020)
del breast
del X
del y
def ks_auc(target, predprob):
    """AUC measures model performance; the KS statistic helps gauge overfitting."""
    fpr,tpr,thresholds=roc_curve(target, predprob)
    roc_auc=metrics.auc(fpr,tpr)
    ks=max(abs(tpr-fpr))
    return ks, roc_auc
def modelfit(alg, Xtrain, ytrain, Xtest, ytest,
             useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    Xtrain=np.array(Xtrain)
    ytrain=np.array(ytrain)
    Xtest=np.array(Xtest)
    ytest=np.array(ytest)
    if useTrainCV:
        # Use xgb.cv with early stopping to pick the best number of trees.
        xgb_param=alg.get_xgb_params()
        xgtrain=xgb.DMatrix(Xtrain, label=ytrain, missing=np.nan)
        cvresult=xgb.cv(xgb_param,xgtrain,num_boost_round=alg.get_params()['n_estimators'],
                        nfold=cv_folds,metrics=['auc'],early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    model=alg.fit(Xtrain, ytrain)
    ytrain_predprob=alg.predict_proba(Xtrain)[:,1]
    ytest_predprob=alg.predict_proba(Xtest)[:,1]
    ks_train, auc_train = ks_auc(ytrain, ytrain_predprob)
    ks_test, auc_test = ks_auc(ytest, ytest_predprob)
    print('KS_train:', round(ks_train,2), 'AUC_train:', round(auc_train,2))
    print('KS_test:', round(ks_test,2), 'AUC_test:', round(auc_test,2))
    feat_imp=pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=True)
    return model,feat_imp
# Convert to xgboost's DMatrix storage format for efficient computation
xgtrain = xgb.DMatrix(Xtrain, label=ytrain, missing=np.nan)
"""
第一步:给定学习率,选择此学习率最优的树的个数
初始参数:
gamma=0, 起始值可以较小,也可以选择0.1-0.2之间
max_depth=5, 这个参数最好在3-10之间,(4-6都是不错的选择)
min_children_weight=3
subsample,colsample_bytree=0.8, 起始在0.5-0.9之间
scale_pos_weight=1, 类别不平衡时使用
learning_rate=0.1
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=3, gamma=0,
subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4,
random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators at learning_rate=0.1:', model.n_estimators)
print('='*50)
"""
第二步:使用网格搜索调整max_depth和min_children_weight
"""
param_test1={'max_depth':range(3,10,2),'min_child_weight':range(1,6,2)}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, gamma=0, subsample=0.8,
colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch1=GridSearchCV(estimator=alg,param_grid=param_test1,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch1.fit(Xtrain,ytrain)
print('gsearch1 best params:',gsearch1.best_params_)
print('gsearch1 best score:',gsearch1.best_score_)
print('='*50)
"""
第二步:在上面的基础上细化max_depth和min_children_weight参数
"""
param_test2={'max_depth':[1,2,3,4,5],'min_child_weight':[1,2,3]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, gamma=0, subsample=0.8,
colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch2=GridSearchCV(estimator=alg,param_grid=param_test2,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch2.fit(Xtrain,ytrain)
print('gsearch2 best params:',gsearch2.best_params_)
print('gsearch2 best score:',gsearch2.best_score_)
print('='*50)
"""
第三步:gamma参数
"""
param_test3={'gamma':[i/10 for i in range(0,5)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, subsample=0.8, max_depth=3, min_child_weight=1,
colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch3=GridSearchCV(estimator=alg,param_grid=param_test3,scoring='roc_auc',
n_jobs=4,iid=False,cv=5)
gsearch3.fit(Xtrain,ytrain)
print('gsearch2 best params:',gsearch3.best_params_)
print('gsearch2 best score:',gsearch3.best_score_)
print('='*50)
"""
第三步:gamma参数
"""
param_test3={'gamma':[i/10 for i in range(0,5)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, subsample=0.8, max_depth=3, min_child_weight=1,
gamma=0.2, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch3=GridSearchCV(estimator=alg,param_grid=param_test3,scoring='roc_auc',
n_jobs=4,iid=False,cv=5)
gsearch3.fit(Xtrain,ytrain)
print('gsearch3 best params:',gsearch3.best_params_)
print('gsearch3 best score:',gsearch3.best_score_)
print('='*50)
"""
第四步:再次调整树的个数,因为其他参数在变化,最佳的树的个数也在变化,根据最新的参数,重新确定最佳的树的个数
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, min_child_weight=1, gamma=0.2,
subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4,
random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators after updating the parameters:', model.n_estimators)
print('='*50)
"""
第五步:调整subsample和colsample_bytree
"""
param_test4={'subsample':[i/10 for i in range(6,10)],
'colsample_bytree':[i/10 for i in range(6,10)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1,
gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch4=GridSearchCV(estimator=alg,param_grid=param_test4,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch4.fit(Xtrain,ytrain)
print('gsearch4 best params:',gsearch4.best_params_)
print('gsearch4 best score:',gsearch4.best_score_)
print('='*50)
"""
第六步:细化subsample和colsample_bytree
"""
param_test5={'subsample':[0.8,0.85,0.9,0.95],
'colsample_bytree':[0.3,0.4,0.5,0.6]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1,
gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch5=GridSearchCV(estimator=alg,param_grid=param_test5,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch5.fit(Xtrain,ytrain)
print('gsearch5 best params:',gsearch5.best_params_)
print('gsearch5 best score:',gsearch5.best_score_)
print('='*50)
"""
第七步:调整正则化参数reg_alpha
"""
param_test6={'reg_alpha':[1e-2,0.1,1,10,100]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85,
colsample_bytree=0.5, gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch6=GridSearchCV(estimator=alg,param_grid=param_test6,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch6.fit(Xtrain,ytrain)
print('gsearch6 best params:',gsearch6.best_params_)
print('gsearch6 best score:',gsearch6.best_score_)
print('='*50)
"""
第八步:细化正则化参数reg_alpha
"""
param_test7={'reg_alpha':[1e-2,2e-2,3e-2,4e-2,5e-2]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85,
colsample_bytree=0.5, gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch7=GridSearchCV(estimator=alg,param_grid=param_test7,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch7.fit(Xtrain,ytrain)
print('gsearch7 best params:',gsearch7.best_params_)
print('gsearch7 best score:',gsearch7.best_score_)
print('='*50)
"""
第九步:调整正则化参数reg_lambda
"""
param_test8={'reg_lambda':[1e-2,0.1,1,10,100]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85,
colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch8=GridSearchCV(estimator=alg,param_grid=param_test8,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch8.fit(Xtrain,ytrain)
print('gsearch8 best params:',gsearch8.best_params_)
print('gsearch8 best score:',gsearch8.best_score_)
print('='*50)
"""
第十步:细化正则化参数reg_lambda
"""
param_test9={'reg_lambda':[0.1,0.5,0.8,1,2,3,4,5]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85,
colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch9=GridSearchCV(estimator=alg,param_grid=param_test9,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch9.fit(Xtrain,ytrain)
print('gsearch9 best params:',gsearch9.best_params_)
print('gsearch9 best score:',gsearch9.best_score_)
print('='*50)
"""
第十一步:再次调整树的个数,因为其他参数在变化,最佳的树的个数也在变化,根据最新的参数,重新确定最佳的树的个数
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, min_child_weight=1, subsample=0.85,
colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators after updating the parameters:', model.n_estimators)
print('='*50)
"""
第十二步:调整学习率
"""
param_test10={'learning_rate':[0.01,0.05,0.1,0.3,0.5,0.8,1]}
alg=XGBClassifier(n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, colsample_bytree=0.5,
gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
gsearch10=GridSearchCV(estimator=alg,param_grid=param_test10,scoring='roc_auc',
                       n_jobs=4,cv=5)
gsearch10.fit(Xtrain,ytrain)
print('gsearch10 best params:',gsearch10.best_params_)
print('gsearch10 best score:',gsearch10.best_score_)
print('='*50)
"""最终模型输出"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, colsample_bytree=0.5,
gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
alg.fit(Xtrain,ytrain)
ytest_pred=alg.predict(Xtest)
ytest_predprob=alg.predict_proba(Xtest)[:,1]
print('accuracy:', sum(ytest==ytest_pred)/len(ytest))
print(confusion_matrix(ytest,ytest_pred))
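The modelfit helper also returns the booster's feature importances; a minimal sketch of plotting them for the final model (the horizontal bar chart is my own choice, not part of the original):
# Plot the fscore-based feature importances of the final model.
# Features are named f0..f29 because the model was fit on a bare numpy array.
feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=True)
feat_imp.plot(kind='barh', figsize=(8, 6), title='Feature importance (fscore)')
plt.xlabel('fscore')
plt.show()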
Evaluation metrics
Confusion matrix
True Positive (TP): the true label is positive and the model predicts positive
False Negative (FN): the true label is positive but the model predicts negative; this is the statistical Type II error
False Positive (FP): the true label is negative but the model predicts positive; this is the statistical Type I error
True Negative (TN): the true label is negative and the model predicts negative
from sklearn.metrics import confusion_matrix
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]
confusion_matrix(y_true, y_pred)
array([[2, 0],
[1, 3]])
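sklearn lays a binary confusion matrix out as [[TN, FP], [FN, TP]], so ravel() unpacks the four counts directly:
# Unpack the four cells; sklearn orders them TN, FP, FN, TP.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
print(tn, fp, fn, tp)  # 2 0 1 3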
$precision = \displaystyle\frac{TP}{TP+FP}$

$recall = \displaystyle\frac{TP}{TP+FN}$

Precision and recall typically trade off against each other; the F1 score combines them into a single number:

$F1 = \displaystyle\frac{2 \cdot precision \cdot recall}{precision + recall}$
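A quick hand check of these formulas on class 1 of the example above (TP=3, FP=0, FN=1, read from the confusion matrix); the results match the classification_report output below:
tp, fp, fn = 3, 0, 1
precision = tp / (tp + fp)                          # 1.0
recall = tp / (tp + fn)                             # 0.75
f1 = 2 * precision * recall / (precision + recall)  # ~0.86
print(precision, recall, round(f1, 2))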
from sklearn.metrics import classification_report
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_pred, target_names=target_names))
precision recall f1-score support
class 0 0.67 1.00 0.80 2
class 1 1.00 0.75 0.86 4
accuracy 0.83 6
macro avg 0.83 0.88 0.83 6
weighted avg 0.89 0.83 0.84 6
For binary classification the F1 score above is enough; for multi-class problems you also need micro-averaging and macro-averaging.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
First build a one-vs-rest confusion matrix for each class: for example, when computing precision and recall for class 0, classes 1 and 2 are merged into a single negative class.
Metric | Class 0 | Class 1 | Class 2 | Total |
---|---|---|---|---|
TP | 2 | 0 | 0 | 2 |
FP | 1 | 2 | 1 | 4 |
FN | 0 | 2 | 2 | 4 |
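A short numpy sketch (variable names mine) that reproduces the counts in this table one-vs-rest:
import numpy as np
y_true = np.array([0, 1, 2, 0, 1, 2])
y_pred = np.array([0, 2, 1, 0, 0, 1])
for c in [0, 1, 2]:
    # Treat class c as positive and the other two classes as negative.
    tp = np.sum((y_pred == c) & (y_true == c))
    fp = np.sum((y_pred == c) & (y_true != c))
    fn = np.sum((y_pred != c) & (y_true == c))
    print('class {}: TP={}, FP={}, FN={}'.format(c, tp, fp, fn))
# class 0: TP=2, FP=1, FN=0
# class 1: TP=0, FP=2, FN=2
# class 2: TP=0, FP=1, FN=2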
Micro-averaging: pool the TP/FP/FN counts across all classes, then compute the metrics once:

$precision = \displaystyle\frac{2}{2+4} = \displaystyle\frac{1}{3}$

$recall = \displaystyle\frac{2}{2+4} = \displaystyle\frac{1}{3}$
$F_{Micro} = \displaystyle\frac{2 \cdot \frac{1}{3} \cdot \frac{1}{3}}{\frac{1}{3} + \frac{1}{3}} = \displaystyle\frac{1}{3}$
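sklearn's micro-averaged scores can confirm the hand computation:
from sklearn.metrics import precision_score, recall_score
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
# Micro-averaging pools TP/FP/FN across all classes before dividing.
print(precision_score(y_true, y_pred, average='micro'))  # 0.333...
print(recall_score(y_true, y_pred, average='micro'))     # 0.333...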
Macro-averaging: compute the metrics for each class separately, then average them.

Class 0: $precision = \displaystyle\frac{2}{3}$, $recall = 1$, $F = 0.8$
Class 1: $precision = 0$, $recall = 0$, $F = 0$
Class 2: $precision = 0$, $recall = 0$, $F = 0$
$F_{Macro} = \displaystyle\frac{0.8 + 0 + 0}{3} \approx 0.267$
Weighted-averaging: each class is weighted by its share of y_true; in this example classes 0, 1 and 2 each have weight 1/3.

$F_{Weighted} = \displaystyle\frac{0.8}{3} + \displaystyle\frac{0}{3} + \displaystyle\frac{0}{3} \approx 0.267$
# Example following the sklearn docs
from sklearn.metrics import f1_score
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
print("Micro-averaging:",f1_score(y_true, y_pred, average='micro'))
print("Macro-averaging:",f1_score(y_true, y_pred, average='macro'))
print("Weighted-averaging:",f1_score(y_true, y_pred, average='weighted'))
# For binary classification, average='binary' is enough
# f1_score(y_true, y_pred, average='binary')
Micro-averaging: 0.3333333333333333
Macro-averaging: 0.26666666666666666
Weighted-averaging: 0.26666666666666666
Macro-averaging vs. micro-averaging
- If every class has roughly the same number of samples, the two averages differ little.
- If the class sizes differ a lot and you care more about the large classes, use micro-averaging; if you care more about the small classes, use macro-averaging (see the toy example after this list).
- If the micro average is much lower than the macro average, inspect the large classes.
- If the macro average is much lower than the micro average, inspect the small classes.
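A toy illustration of the gap (labels made up for this example): class 0 has 8 samples and is predicted perfectly, class 1 has 2 samples and is always missed.
from sklearn.metrics import f1_score
y_true = [0] * 8 + [1, 1]
y_pred = [0] * 8 + [0, 0]
# Micro-averaging is dominated by the large class 0 ...
print(f1_score(y_true, y_pred, average='micro'))  # 0.8
# ... while macro-averaging is dragged down by the missed class 1.
print(f1_score(y_true, y_pred, average='macro'))  # ~0.44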
PR and ROC curves
A classifier outputs a probability for each sample; by convention, probabilities above 0.5 are assigned to class 1 and those below 0.5 to class 0.
To draw a PR or ROC curve, start from a high probability threshold and compute precision, recall, $TPR = \displaystyle\frac{TP}{TP+FN}$ and $FPR = \displaystyle\frac{FP}{FP+TN}$ at that threshold; then lower the threshold, recompute, and repeat, finally connecting the points into a curve (a minimal sketch of this sweep follows below).
AUC is the area under the ROC curve; the larger, the better.
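Here is the threshold sweep written out directly (function and variable names are mine), computing one (FPR, TPR) point per threshold:
import numpy as np

def roc_points(y_true, y_prob, thresholds):
    """Return one (FPR, TPR) point per threshold."""
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    points = []
    for t in thresholds:
        y_hat = (y_prob >= t).astype(int)   # relabel at this threshold
        tp = np.sum((y_hat == 1) & (y_true == 1))
        fp = np.sum((y_hat == 1) & (y_true == 0))
        fn = np.sum((y_hat == 0) & (y_true == 1))
        tn = np.sum((y_hat == 0) & (y_true == 0))
        points.append((fp / (fp + tn), tp / (tp + fn)))
    return points

print(roc_points([1, 0, 1, 1, 0], [0.9, 0.4, 0.7, 0.3, 0.2], [0.8, 0.5, 0.25]))
# [(0.0, 0.333...), (0.0, 0.666...), (0.5, 1.0)]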
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score
y_true=[1,0,1,1,1,1,0,0,0,0]
y_predprob=[0.5,0.4,0.7,0.9,0.1,0.4,0.3,0.4,0.3,0.1]
fpr,tpr,thresholds=roc_curve(y_true, y_predprob)
precision,recall,thresholds=precision_recall_curve(y_true, y_predprob)
roc_auc=metrics.auc(fpr,tpr)
ap=average_precision_score(y_true, y_predprob)
plt.rcParams[u'font.sans-serif'] = ['Arial Unicode MS']
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(2,2,1)
plt.plot(recall,precision,'r',label='average_precision:{}'.format(round(ap,4)))
plt.title('PR')
plt.ylabel('precision')
plt.xlabel('recall')
plt.legend(loc='lower right')
ax2=fig.add_subplot(2,2,2) # right panel of the first row
plt.plot(fpr,tpr,'r',label='AUC:{s}'.format(s=round(roc_auc,4)))
plt.title('ROC')
plt.ylabel('TPR')
plt.xlabel('FPR')
plt.legend(loc='lower right')
plt.show()
The data was made up on the spot, so the plots look rather rough.
KS statistic
The KS statistic is max(TPR - FPR); comparing it between the training and test sets gives a sense of how badly a model overfits.
For example:
if the training KS is 0.8 and the test KS is 0.4, KS drops sharply from train to test and the model is severely overfitted;
if the training KS is 0.8 and the test KS is 0.79, the two are close and the model generalizes well.
From a department sharing session at my internship company: in practice, the KS drop from train to test should stay within about 3 points (for reference only).
def ks_auc(target, predprob):
    fpr,tpr,thresholds=roc_curve(target, predprob)
    roc_auc=metrics.auc(fpr,tpr)
    ks=max(abs(tpr-fpr))
    return ks, roc_auc
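A usage sketch of the 3-point rule with this function, reusing the final model alg fitted above (the flag wording is my own interpretation):
ks_train, _ = ks_auc(ytrain, alg.predict_proba(Xtrain)[:, 1])
ks_test, _ = ks_auc(ytest, alg.predict_proba(Xtest)[:, 1])
drop = ks_train - ks_test
if drop > 0.03:
    print('KS drop {:.3f} exceeds 3 points: possible overfitting'.format(drop))
else:
    print('KS drop {:.3f}: within the rule of thumb'.format(drop))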
A handy package: Yellowbrick
Yellowbrick website
It can draw all of the plots above with very little code.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ConfusionMatrix,ROCAUC,PrecisionRecallCurve,ClassificationReport
breast = load_breast_cancer()
X = breast.data
y = breast.target
Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=2020)
model=LogisticRegression(tol=10,max_iter=4)  # very loose tolerance and only 4 iterations: an under-trained model
# Confusion matrix
visualizer=ConfusionMatrix(model, classes=['no_cancer','cancer'], cmap='Blues')
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()
# ROC curve
visualizer=ROCAUC(model,classes=['no_cancer','cancer'],title=' ')  # title=' ' suppresses the default title
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()
# Classification report
visualizer = ClassificationReport(model, classes=['no_cancer','cancer'], cmap='Blues')
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()
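PrecisionRecallCurve is imported above but never used; a minimal sketch, assuming it follows the same fit/score/show pattern as the other visualizers:
# PR curve
visualizer = PrecisionRecallCurve(model)
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()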