A Miscellaneous Summary of Machine Learning Metrics

Cross-Validation

#Hyperparameter tuning with XGBoost, based on a department sharing session at my internship company
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score

breast = load_breast_cancer()
X = breast.data
y = breast.target

Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=2020)
del breast
del X
del y

def ks_auc(target, predprob):
    """Use AUC to gauge model performance and the KS statistic to gauge overfitting."""
    fpr,tpr,thresholds=roc_curve(target, predprob)
    roc_auc=metrics.auc(fpr,tpr)
    ks=max(abs(tpr-fpr))
    return ks, roc_auc

def modelfit(alg, Xtrain, ytrain, Xtest, ytest, 
             useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    Xtrain=np.array(Xtrain)
    ytrain=np.array(ytrain)
    Xtest=np.array(Xtest)
    ytest=np.array(ytest)
    if useTrainCV:
        xgb_param=alg.get_xgb_params()
        xgtrain=xgb.DMatrix(Xtrain, label=ytrain, missing=np.nan)
        cvresult=xgb.cv(xgb_param,xgtrain,num_boost_round=alg.get_params()['n_estimators'],
                        nfold=cv_folds,metrics=['auc'],early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        
    model=alg.fit(Xtrain, ytrain, eval_metric='auc')
    
    ytrain_predprob=alg.predict_proba(Xtrain)[:,1]
    ytest_predprob=alg.predict_proba(Xtest)[:,1]

    ks_train, auc_train = ks_auc(ytrain, ytrain_predprob)
    ks_test, auc_test = ks_auc(ytest, ytest_predprob)
    
    print('KS_train:', round(ks_train,2), 'AUC_train:', round(auc_train,2))
    print('KS_test:', round(ks_test,2), 'AUC_test:', round(auc_test,2))
    
    feat_imp=pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=True)
    
    return model,feat_imp


#Convert to xgboost's DMatrix storage format for faster computation
xgtrain = xgb.DMatrix(Xtrain, label=ytrain, missing=np.nan)

"""
第一步:给定学习率,选择此学习率最优的树的个数
初始参数:
gamma=0,                            起始值可以较小,也可以选择0.1-0.2之间
max_depth=5,                        这个参数最好在3-10之间,(4-6都是不错的选择)
min_children_weight=3
subsample,colsample_bytree=0.8,     起始在0.5-0.9之间
scale_pos_weight=1,                 类别不平衡时使用
learning_rate=0.1
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=5, min_child_weight=3, gamma=0,
                  subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, 
                  random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators at learning_rate=0.1:', model.n_estimators)
print('='*50)

"""
第二步:使用网格搜索调整max_depth和min_children_weight
"""
param_test1={'max_depth':range(3,10,2),'min_child_weight':range(1,6,2)}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, gamma=0, subsample=0.8,
                  colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch1=GridSearchCV(estimator=alg,param_grid=param_test1,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch1.fit(Xtrain,ytrain)
print('gsearch1 best params:',gsearch1.best_params_)
print('gsearch1 best score:',gsearch1.best_score_)
print('='*50)

"""
第二步:在上面的基础上细化max_depth和min_children_weight参数
"""
param_test2={'max_depth':[1,2,3,4,5],'min_child_weight':[1,2,3]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, gamma=0, subsample=0.8,
                  colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch2=GridSearchCV(estimator=alg,param_grid=param_test2,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch2.fit(Xtrain,ytrain)
print('gsearch2 best params:',gsearch2.best_params_)
print('gsearch2 best score:',gsearch2.best_score_)
print('='*50)

"""
第三步:gamma参数
"""
param_test3={'gamma':[i/10 for i in range(0,5)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, subsample=0.8, max_depth=3, min_child_weight=1, 
                  colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch3=GridSearchCV(estimator=alg,param_grid=param_test3,scoring='roc_auc',
                      n_jobs=4,iid=False,cv=5)
gsearch3.fit(Xtrain,ytrain)
print('gsearch2 best params:',gsearch3.best_params_)
print('gsearch2 best score:',gsearch3.best_score_)
print('='*50)

"""
第三步:gamma参数
"""
param_test3={'gamma':[i/10 for i in range(0,5)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=46, subsample=0.8, max_depth=3, min_child_weight=1, 
                  gamma=0.2, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch3=GridSearchCV(estimator=alg,param_grid=param_test3,scoring='roc_auc',
                      n_jobs=4,iid=False,cv=5)
gsearch3.fit(Xtrain,ytrain)
print('gsearch3 best params:',gsearch3.best_params_)
print('gsearch3 best score:',gsearch3.best_score_)
print('='*50)

"""
第四步:再次调整树的个数,因为其他参数在变化,最佳的树的个数也在变化,根据最新的参数,重新确定最佳的树的个数
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, min_child_weight=1, gamma=0.2,
                  subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', n_jobs=4, 
                  random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators after updating the parameters:', model.n_estimators)
print('='*50)

"""
第五步:调整subsample和colsample_bytree
"""
param_test4={'subsample':[i/10 for i in range(6,10)],
             'colsample_bytree':[i/10 for i in range(6,10)]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, 
                  gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch4=GridSearchCV(estimator=alg,param_grid=param_test4,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch4.fit(Xtrain,ytrain)
print('gsearch4 best params:',gsearch4.best_params_)
print('gsearch4 best score:',gsearch4.best_score_)
print('='*50)

"""
第六步:细化subsample和colsample_bytree
"""
param_test5={'subsample':[0.8,0.85,0.9,0.95],
             'colsample_bytree':[0.3,0.4,0.5,0.6]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, 
                  gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch5=GridSearchCV(estimator=alg,param_grid=param_test5,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch5.fit(Xtrain,ytrain)
print('gsearch5 best params:',gsearch5.best_params_)
print('gsearch5 best score:',gsearch5.best_score_)
print('='*50)

"""
第七步:调整正则化参数reg_alpha
"""
param_test6={'reg_alpha':[1e-2,0.1,1,10,100]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, 
                  colsample_bytree=0.5, gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch6=GridSearchCV(estimator=alg,param_grid=param_test6,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch6.fit(Xtrain,ytrain)
print('gsearch6 best params:',gsearch6.best_params_)
print('gsearch6 best score:',gsearch6.best_score_)
print('='*50)

"""
第八步:细化正则化参数reg_alpha
"""
param_test7={'reg_alpha':[1e-2,2e-2,3e-2,4e-2,5e-2]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, 
                  colsample_bytree=0.5, gamma=0.2, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch7=GridSearchCV(estimator=alg,param_grid=param_test7,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch7.fit(Xtrain,ytrain)
print('gsearch7 best params:',gsearch7.best_params_)
print('gsearch7 best score:',gsearch7.best_score_)
print('='*50)

"""
第九步:调整正则化参数reg_lambda
"""
param_test8={'reg_lambda':[1e-2,0.1,1,10,100]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, 
                  colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch8=GridSearchCV(estimator=alg,param_grid=param_test8,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch8.fit(Xtrain,ytrain)
print('gsearch8 best params:',gsearch8.best_params_)
print('gsearch8 best score:',gsearch8.best_score_)
print('='*50)

"""
第十步:细化正则化参数reg_lambda
"""
param_test9={'reg_lambda':[0.1,0.5,0.8,1,2,3,4,5]}
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, 
                  colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch9=GridSearchCV(estimator=alg,param_grid=param_test9,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch9.fit(Xtrain,ytrain)
print('gsearch9 best params:',gsearch9.best_params_)
print('gsearch9 best score:',gsearch9.best_score_)
print('='*50)

"""
第十一步:再次调整树的个数,因为其他参数在变化,最佳的树的个数也在变化,根据最新的参数,重新确定最佳的树的个数
"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3, min_child_weight=1, subsample=0.85, 
                  colsample_bytree=0.5, gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
model,feat_imp=modelfit(alg, Xtrain, ytrain, Xtest, ytest)
print('Best n_estimators after updating the parameters:', model.n_estimators)
print('='*50)

"""
第十二步:调整学习率
"""
param_test10={'learning_rate':[0.01,0.05,0.1,0.3,0.5,0.8,1]}
alg=XGBClassifier(n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, colsample_bytree=0.5, 
                  gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)

gsearch10=GridSearchCV(estimator=alg,param_grid=param_test10,scoring='roc_auc',
                      n_jobs=4,cv=5)
gsearch10.fit(Xtrain,ytrain)
print('gsearch10 best params:',gsearch10.best_params_)
print('gsearch10 best score:',gsearch10.best_score_)
print('='*50)

"""最终模型输出"""
alg=XGBClassifier(learning_rate=0.1, n_estimators=48, max_depth=3, min_child_weight=1, subsample=0.85, colsample_bytree=0.5, 
                  gamma=0.2, reg_alpha=0.03, reg_lambda=0.8, objective='binary:logistic', n_jobs=4, random_state=321)
alg.fit(Xtrain,ytrain)
ytest_pred=alg.predict(Xtest)
ytest_predprob=alg.predict_proba(Xtest)

print('accuracy:', sum(ytest==ytest_pred)/len(ytest))  #test-set accuracy
print(confusion_matrix(ytest,ytest_pred))

Evaluation Metrics

Confusion Matrix

The number of cases where the true value is positive and the model predicts positive (True Positive = TP)
The number of cases where the true value is positive and the model predicts negative (False Negative = FN): this is the Type II error in statistics
The number of cases where the true value is negative and the model predicts positive (False Positive = FP): this is the Type I error in statistics
The number of cases where the true value is negative and the model predicts negative (True Negative = TN)

from sklearn.metrics import confusion_matrix
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]
confusion_matrix(y_true, y_pred)
array([[2, 0],
       [1, 3]])

$precision = \dfrac{TP}{TP+FP}$

$recall = \dfrac{TP}{TP+FN}$

Precision and recall trade off against each other, so the F1 score is used as a combined metric:

$F1 = \dfrac{2 \cdot precision \cdot recall}{precision + recall}$

from sklearn.metrics import classification_report
y_true = [1, 0, 1, 1, 0, 1]
y_pred = [0, 0, 1, 1, 0, 1]
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_pred, target_names=target_names))
             precision    recall  f1-score   support

     class 0       0.67      1.00      0.80         2
     class 1       1.00      0.75      0.86         4

    accuracy                           0.83         6
   macro avg       0.83      0.88      0.83         6
weighted avg       0.89      0.83      0.84         6

For binary classification the F1 score above is enough; for multi-class problems you also need micro-averaging (Micro-averaging) and macro-averaging (Macro-averaging).

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]

First build each class's confusion matrix: for example, to compute precision and recall for class 0, treat classes 1 and 2 as a single negative class, which yields class 0's one-vs-rest confusion matrix (a sanity-check sketch follows the table).

Metric  class 0  class 1  class 2  total
TP      2        0        0        2
FP      1        2        1        4
FN      0        2        2        4
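
As a sanity check on the table, sklearn's multilabel_confusion_matrix produces these one-vs-rest counts directly; a minimal sketch:

#Each per-class 2x2 matrix is laid out as [[TN, FP], [FN, TP]]
from sklearn.metrics import multilabel_confusion_matrix

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
for label, cm in zip([0, 1, 2], multilabel_confusion_matrix(y_true, y_pred)):
    tn, fp, fn, tp = cm.ravel()
    print('class', label, '-> TP:', tp, 'FP:', fp, 'FN:', fn)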

Micro-averaging:

$precision = \dfrac{2}{2+4} = \dfrac{1}{3}$

$recall = \dfrac{2}{2+4} = \dfrac{1}{3}$

$F_{Micro} = \dfrac{2 \cdot \frac{1}{3} \cdot \frac{1}{3}}{\frac{1}{3} + \frac{1}{3}} = \dfrac{1}{3}$
Macro-averaging:
Class 0: $precision = \dfrac{2}{3}$, $recall = 1$, $F = 0.8$
Class 1: $precision = 0$, $recall = 0$, $F = 0$
Class 2: $precision = 0$, $recall = 0$, $F = 0$

$F_{Macro} = \dfrac{0.8 + 0 + 0}{3} \approx 0.267$

Weighted-averaging:
The weight for each class is its share of y_true; in this example classes 0, 1, and 2 each have weight 1/3.

$F_{Weighted} = \dfrac{0.8}{3} + \dfrac{0}{3} + \dfrac{0}{3} \approx 0.267$

#Example adapted from the sklearn documentation
from sklearn.metrics import f1_score
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
print("Micro-averaging:",f1_score(y_true, y_pred, average='micro'))
print("Macro-averaging:",f1_score(y_true, y_pred, average='macro'))
print("Weighted-averaging:",f1_score(y_true, y_pred, average='weighted'))
#For binary classification, average='binary' is enough
#f1_score(y_true, y_pred, average='binary')
Micro-averaging: 0.3333333333333333
Macro-averaging: 0.26666666666666666
Weighted-averaging: 0.26666666666666666

Macro-averaging vs. micro-averaging

  1. If every class has roughly the same number of samples, the macro and micro averages differ little.
  2. If the class sizes differ a lot and you want:
    to emphasize the classes with many samples: use the micro average
    to emphasize the classes with few samples: use the macro average
  3. If the micro average is far below the macro average, examine the classes with many samples.
  4. If the macro average is far below the micro average, examine the classes with few samples (see the sketch below).
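
A small sketch with made-up labels illustrating point 4: when a rare class performs badly, the macro average collapses while the micro average stays high (zero_division=0 just silences the warning for the never-predicted class):

from sklearn.metrics import f1_score

y_true = [0]*9 + [1]   #9 majority samples, 1 minority sample
y_pred = [0]*10        #the model only ever predicts the majority class
print('micro:', f1_score(y_true, y_pred, average='micro', zero_division=0))  #0.9
print('macro:', f1_score(y_true, y_pred, average='macro', zero_division=0))  #about 0.47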

PR and ROC Curves

After prediction a model outputs a probability; by convention, samples with probability above 0.5 are assigned to class 1 and those below 0.5 to class 0.
To draw a PR or ROC curve, first fix a fairly high probability threshold and compute precision, recall, $TPR = \dfrac{TP}{TP+FN}$, and $FPR = \dfrac{FP}{FP+TN}$ at that threshold; then vary the threshold and recompute the metrics, repeating until the thresholds are exhausted, and finally connect the points into a curve (a hand-rolled sketch of this sweep follows).
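
A minimal sketch with made-up labels and scores, mirroring the sweep that sklearn's roc_curve performs internally:

#Hand-rolled threshold sweep: at each candidate threshold, classify
#scores >= threshold as positive and compute TPR and FPR
import numpy as np

y_true = np.array([1, 0, 1, 1, 0, 1, 0, 0])
y_score = np.array([0.9, 0.8, 0.7, 0.6, 0.55, 0.5, 0.3, 0.1])

for thr in sorted(set(y_score), reverse=True):
    y_hat = (y_score >= thr).astype(int)
    tp = np.sum((y_hat == 1) & (y_true == 1))
    fp = np.sum((y_hat == 1) & (y_true == 0))
    fn = np.sum((y_hat == 0) & (y_true == 1))
    tn = np.sum((y_hat == 0) & (y_true == 0))
    print('thr=%.2f TPR=%.2f FPR=%.2f' % (thr, tp/(tp+fn), fp/(fp+tn)))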

AUC is the area under the ROC curve; the larger, the better.

from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score
y_true=[1,0,1,1,1,1,0,0,0,0]
y_predprob=[0.5,0.4,0.7,0.9,0.1,0.4,0.3,0.4,0.3,0.1]
fpr,tpr,thresholds=roc_curve(y_true, y_predprob)
precision,recall,thresholds=precision_recall_curve(y_true, y_predprob)
roc_auc=metrics.auc(fpr,tpr)
ap=average_precision_score(y_true, y_predprob)

plt.rcParams[u'font.sans-serif'] = ['Arial Unicode MS']
fig = plt.figure(figsize=(8,6))
ax1 = fig.add_subplot(2,2,1)  # left panel of the first row
plt.plot(recall,precision,'r',label='average_precision:{}'.format(round(ap,4)))
plt.title('PR')
plt.ylabel('precision')
plt.xlabel('recall')    
plt.legend(loc='lower right')

ax2=fig.add_subplot(2,2,2)  # right panel of the first row
plt.plot(fpr,tpr,'r',label='AUC:{s}'.format(s=round(roc_auc,4)))
plt.title('ROC')
plt.ylabel('TPR')
plt.xlabel('FPR')    
plt.legend(loc='lower right')

The data was made up on the spot, so the plots look a bit rough.

KS Statistic

The KS statistic is max(TPR - FPR); it can be used to gauge how badly a model overfits.
For example:
if the training-set KS is 0.8 and the test-set KS is 0.4, KS drops sharply from train to test and the model is severely overfit;
if the training-set KS is 0.8 and the test-set KS is 0.79, the two are close and the model is in good shape.
From a department sharing session at my internship company: in practice, KS should not drop by more than 3 points (0.03) from the training set to the test set (for reference only); a usage sketch follows the helper below.

def ks_auc(target, predprob):
    fpr,tpr,thresholds=roc_curve(target, predprob)
    roc_auc=metrics.auc(fpr,tpr)
    ks=max(abs(tpr-fpr))
    return ks, roc_auc
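
A usage sketch of the rule of thumb above, reusing the train/test split from the start of the post; the GradientBoostingClassifier here is just a stand-in, since any classifier with predict_proba works:

#Compare train vs. test KS to gauge overfitting (assumes the ks_auc
#helper above and the Xtrain/Xtest split from earlier)
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(random_state=321).fit(Xtrain, ytrain)
ks_train, _ = ks_auc(ytrain, clf.predict_proba(Xtrain)[:,1])
ks_test, _ = ks_auc(ytest, clf.predict_proba(Xtest)[:,1])
#Rule of thumb from the sharing session: this drop should stay within about 0.03
print('KS drop from train to test:', round(ks_train - ks_test, 3))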

A Handy Package: Yellowbrick

Yellowbrick website
It can produce the plots above quickly.

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from yellowbrick.classifier import ConfusionMatrix,ROCAUC,PrecisionRecallCurve,ClassificationReport

breast = load_breast_cancer()
X = breast.data
y = breast.target

Xtrain,Xtest,ytrain,ytest=train_test_split(X,y,test_size=0.3,random_state=2020)

model=LogisticRegression(tol=10,max_iter=4)  #loose tol and very few iterations give a deliberately weak model

#Confusion matrix
visualizer=ConfusionMatrix(model, classes=['no_cancer','cancer'], cmap='Blues')
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()

#ROC curve
visualizer=ROCAUC(model,classes=['no_cancer','cancer'],title=' ')  #title=' ' suppresses the default title
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()

#Classification report
visualizer = ClassificationReport(model, classes=['no_cancer','cancer'], cmap='Blues')
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()
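
PrecisionRecallCurve is imported above but never used; a minimal sketch that draws the PR curve with the same fit/score/show pattern:

#PR curve
visualizer = PrecisionRecallCurve(model, classes=['no_cancer','cancer'])
visualizer.fit(Xtrain, ytrain)
visualizer.score(Xtest, ytest)
visualizer.show()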

