第六章 模型评估和超参数调优的最佳实践

6.1 管道方法简化工作流

6.1.1 威斯康星乳腺癌数据集

569个恶性和良性细胞的样本

数据集前两列:ID和诊断结果(M=恶性,B=良性)

列3-32包含30个根据细胞核的数字化图像计算出的特征值

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# The file is read with header=None, so pandas labels the columns with
# integers (0-31 in the preview printed below).
df=pd.read_csv('wdbc.data',header=None)
df.head()
          0  1      2      3       4       5        6        7       8        9  ...     22     23      24      25      26      27      28      29      30       31
0    842302  M  17.99  10.38  122.80  1001.0  0.11840  0.27760  0.3001  0.14710  ...  25.38  17.33  184.60  2019.0  0.1622  0.6656  0.7119  0.2654  0.4601  0.11890
1    842517  M  20.57  17.77  132.90  1326.0  0.08474  0.07864  0.0869  0.07017  ...  24.99  23.41  158.80  1956.0  0.1238  0.1866  0.2416  0.1860  0.2750  0.08902
2  84300903  M  19.69  21.25  130.00  1203.0  0.10960  0.15990  0.1974  0.12790  ...  23.57  25.53  152.50  1709.0  0.1444  0.4245  0.4504  0.2430  0.3613  0.08758
3  84348301  M  11.42  20.38   77.58   386.1  0.14250  0.28390  0.2414  0.10520  ...  14.91  26.50   98.87   567.7  0.2098  0.8663  0.6869  0.2575  0.6638  0.17300
4  84358402  M  20.29  14.34  135.10  1297.0  0.10030  0.13280  0.1980  0.10430  ...  22.54  16.67  152.20  1575.0  0.1374  0.2050  0.4000  0.1625  0.2364  0.07678

5 rows × 32 columns

df.iloc[:,1].describe()
count     569
unique      2
top         B
freq      357
Name: 1, dtype: object
df.describe?
#Encode the string diagnosis labels as integers
from sklearn.preprocessing import LabelEncoder
# Columns 2 onward hold the 30 feature values; column 1 holds the diagnosis.
X=df.iloc[:,2:].values
y=df.iloc[:,1].values
le=LabelEncoder()
y=le.fit_transform(y)
le.classes_#the original class labels known to the encoder
array(['B', 'M'], dtype=object)
le.inverse_transform([0,1])#confirms that code 0 decodes to B and code 1 decodes to M
array(['B', 'M'], dtype=object)
#Split the data 80/20 into training and test sets (test_size=0.2);
#stratify=y keeps the class proportions of y in both parts, and the fixed
#random_state makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,
                                              random_state=1)

np.bincount(y_train)
array([285, 170], dtype=int64)
np.bincount(y_test)
array([72, 42], dtype=int64)
6.1.2 集成管道中的转换器和评估器
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipeline的fit方法会在各个中间步骤依次调用fit和transform方法,把转换后的数据逐级向后传递,直到管道中最后一个环节——评估器,由它完成最终的拟合。

# Chain standardization, PCA down to 2 components and logistic regression
# into a single estimator, then fit the whole chain on the training split.
pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=2),
                     LogisticRegression(random_state=1))
pipe_lr.fit(X_train,y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)),
                ('logisticregression', LogisticRegression(random_state=1))])

在执行前面示例代码中的pipe_lr管道的fit方法时,StandardScaler首先在
训练集上调用fit和transform方法。然后将转换后的训练数据传递给管道的下
一个对象,即PCA。与前面的步骤类似,PCA也在调整后的输入数据基础上
调用fit和transform,并将其传递给管道中的最后一个环节,即评估器。PCA和标准化完成后,逻辑回归(评估器)完成拟合。

# Predict the held-out test samples, then print the pipeline's score on the
# test split formatted to three decimals.
y_pred=pipe_lr.predict(X_test)
print('{}{:.3f}'.format('test accuracy:',pipe_lr.score(X_test,y_test)))
test accuracy:0.956

6.2使用k折交叉验证评估模型性能

from sklearn.model_selection import StratifiedKFold
# split() produces (train_indices, test_indices) pairs for 10 stratified folds.
# The result is a generator (see the repr printed below), so it can be
# iterated only once.
kfold=StratifiedKFold(n_splits=10).split(X_train,y_train)
kfold
<generator object _BaseKFold.split at 0x00000276ECC89900>
#Fit and score the pipeline on every fold, collecting the per-fold scores
scores=[]
for k ,(train,test) in enumerate(kfold):#train/test are index arrays selecting subsets of the original training data
    pipe_lr.fit(X_train[train],y_train[train])
    score=pipe_lr.score(X_train[test],y_train[test])
    scores.append(score)
    # np.bincount shows how many samples of each class this fold trains on.
    print('{}{}{}{}{}{:.3f}'.format('Fold:',k+1,'class :',
                    np.bincount(y_train[train]),'score:',score))
Fold:1class :[256 153]score:0.935
Fold:2class :[256 153]score:0.935
Fold:3class :[256 153]score:0.957
Fold:4class :[256 153]score:0.957
Fold:5class :[256 153]score:0.935
Fold:6class :[257 153]score:0.956
Fold:7class :[257 153]score:0.978
Fold:8class :[257 153]score:0.933
Fold:9class :[257 153]score:0.956
Fold:10class :[257 153]score:0.956
#Same 10-fold evaluation done by cross_val_score in a single call; the
#scores printed below agree with the manual stratified loop fold for fold.
from sklearn.model_selection import cross_val_score
scores=cross_val_score(estimator=pipe_lr,X=X_train,y=y_train,
                      cv=10,
                      n_jobs=1)
print('CV accuracy scores:',scores)
CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]
print('{}{:.3f}{}{:.3f}'.format('CV accuracy:',np.mean(scores),'+/-',np.std(scores)))
CV accuracy:0.950+/-0.014

6.3 用学习和验证曲线调试算法

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
#Build the pipeline: standardization followed by L2-penalized logistic regression
pipe_lr=make_pipeline(StandardScaler(),
             LogisticRegression(penalty='l2',random_state=1))
# Score the pipeline at 10 evenly spaced training-set fractions (0.1 .. 1.0)
# using 10-fold cross-validation; both score arrays come back with shape
# (10, 10): one row per training size, one column per fold.
train_sizes,train_scores,test_scores=learning_curve(estimator=pipe_lr,
                                                   X=X_train,
                                                   y=y_train,
                                                   train_sizes=np.linspace(0.1,1,10),
                                                   cv=10,
                                                   n_jobs=1)
#获得训练子集和测试子集的精确度
train_sizes
array([ 40,  81, 122, 163, 204, 245, 286, 327, 368, 409])
train_scores.shape
(10, 10)
test_scores
array([[0.84782609, 0.93478261, 0.93478261, 0.97826087, 0.91304348,
        0.95555556, 0.97777778, 0.93333333, 0.97777778, 1.        ],
       [0.95652174, 0.97826087, 0.93478261, 0.97826087, 0.93478261,
        0.95555556, 0.97777778, 0.91111111, 0.97777778, 0.93333333],
       [0.95652174, 0.97826087, 0.93478261, 0.97826087, 0.93478261,
        0.95555556, 1.        , 0.91111111, 0.97777778, 0.97777778],
       [0.95652174, 0.97826087, 0.95652174, 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 1.        , 0.97777778, 0.97777778],
       [0.97826087, 0.97826087, 0.95652174, 0.95652174, 0.95652174,
        0.95555556, 0.97777778, 1.        , 0.97777778, 0.95555556],
       [0.95652174, 0.97826087, 0.97826087, 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [0.97826087, 0.97826087, 0.95652174, 0.97826087, 0.95652174,
        0.97777778, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 1.        , 1.        ],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.93478261,
        0.95555556, 0.97777778, 0.97777778, 1.        , 1.        ]])
train_mean=np.mean(train_scores,axis=1)
train_mean
array([1.        , 1.        , 0.99344262, 0.99754601, 0.99166667,
       0.98571429, 0.98601399, 0.98746177, 0.9888587 , 0.98924205])
train_std=np.std(train_scores,axis=1)
train_std
array([0.        , 0.        , 0.00327869, 0.00300551, 0.00441176,
       0.00329072, 0.00312737, 0.00214067, 0.00146336, 0.00119779])
test_mean=np.mean(test_scores,axis=1)
test_mean
array([0.94531401, 0.95381643, 0.96048309, 0.97149758, 0.96927536,
       0.97144928, 0.9736715 , 0.97797101, 0.98241546, 0.98024155])
test_std=np.std(test_scores,axis=1)
test_std
array([0.04147213, 0.02306853, 0.02567195, 0.01396127, 0.0145413 ,
       0.00998832, 0.00857743, 0.01390465, 0.01645096, 0.02063368])
# Training accuracy: mean across folds with a +/- one-std shaded band.
plt.plot(train_sizes,train_mean,color='b',
        marker='o',markersize=5,label='training accuracy')
plt.fill_between(train_sizes,train_mean+train_std,
                train_mean-train_std,alpha=0.15,
                color='b')

# Validation accuracy: same presentation, dashed green line.
plt.plot(train_sizes,test_mean,color='g',
        marker='s',markersize=5,label='validation accuracy',
        linestyle='--')
plt.fill_between(train_sizes,test_mean+test_std,
                test_mean-test_std,alpha=0.15,
                color='g')

plt.grid()#draw grid lines
plt.xlabel('number of trainnig samples')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.ylim([0.85,1.01])
plt.show()

在这里插入图片描述

验证曲线解决过拟合和欠拟合问题
from sklearn.model_selection import validation_curve
# Candidate values for the C parameter of the pipeline's logistic-regression
# step (addressed as 'logisticregression__C').
param_range=[0.001,0.01,0.1,1.0,10.0,100.0]
# Score the pipeline for every C value with 10-fold cross-validation; each
# score array has one row per C value and one column per fold.
train_scores,test_scores=validation_curve(estimator=pipe_lr,
                                          X=X_train,
                                          y=y_train,
                                          param_name='logisticregression__C',
                                          param_range=param_range,
                                          cv=10)
train_mean=np.mean(train_scores,axis=1)
# Spread across folds. This must be the standard deviation: using np.mean
# here made the shaded band in the plot span train_mean +/- train_mean.
train_std=np.std(train_scores,axis=1)
train_mean.shape,train_std.shape
((6,), (6,))
test_scores.shape
(6, 10)
# Mean validation score per C value across folds. np.mean replaces the
# hard-coded "/10" so the result stays correct if the number of CV folds changes.
test_mean=np.mean(test_scores,axis=1)
test_mean.shape
(6,)
test_std=np.std(test_scores,axis=1)
test_std
array([0.05078637, 0.03282807, 0.01815352, 0.02063368, 0.02197085,
       0.01690883])
test_mean.shape
(6,)
test_mean
array([0.88574879, 0.94289855, 0.97589372, 0.98024155, 0.97149758,
       0.95608696])
# Training accuracy per C value with a shaded train_mean +/- train_std band.
plt.plot(param_range,train_mean,
        color='b',marker='o',
        markersize=5,label='training accuracy')
plt.fill_between(param_range,train_mean+train_std,
                train_mean-train_std,alpha=0.15,
                color='b')

# Validation accuracy per C value, dashed green line with its own band.
plt.plot(param_range,test_mean,
        color='g',linestyle='--',marker='s',
        markersize=5,label='validation accuracy')
plt.fill_between(param_range,test_mean+test_std,
                test_mean-test_std,alpha=0.15,
                color='g')


# param_range spans several orders of magnitude, hence the log-scaled x axis.
plt.xscale('log')
plt.ylim([0.8,1.03])
plt.xlabel('C')
plt.ylabel('accuracy')
plt.grid()
plt.legend(loc='best')
plt.show()

在这里插入图片描述

最佳C值在0.1到1之间

6.4网格搜索为机器学习模型调优

网格搜索超参数调优
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
#Build the pipeline: standardization followed by an SVC
pipe_svc=make_pipeline(StandardScaler(),
                      SVC())
#Range of values tried for both C and gamma
param_range=[0.0001,0.001,0.01,0.1,1.0,10.0,100.0,1000.0]
#Two grids: a linear-kernel SVM tuned over C only, and an RBF-kernel SVM
#tuned over C and gamma
param_grid=[{'svc__C':param_range,
            'svc__kernel':['linear']},
            {'svc__C':param_range,
            'svc__gamma':param_range,
            'svc__kernel':['rbf']}]
#Set up the grid search: accuracy scoring, 10-fold CV, n_jobs=-1
gs=GridSearchCV(estimator=pipe_svc,
               param_grid=param_grid,
               scoring='accuracy',
               cv=10,
               n_jobs=-1)
gs=gs.fit(X_train,y_train)
#Cross-validated accuracy of the best parameter combination
print(gs.best_score_)
0.9846859903381642
#获得模型的最好参数
print(gs.best_params_)
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
print(gs.best_estimator_)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=100.0, gamma=0.001))])
# best_estimator_ is the pipeline configured with the winning parameters
# printed above (SVC(C=100.0, gamma=0.001)).
# NOTE(review): GridSearchCV presumably already refit this estimator on
# X_train (refit=True is the library default), which would make the explicit
# fit below redundant -- confirm before removing it.
clf=gs.best_estimator_
clf.fit(X_train,y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=100.0, gamma=0.001))])
print('{}{:.3f}'.format('test accuracy:',clf.score(X_test,y_test)))
test accuracy:0.974
嵌套交叉验证
#Inner loop: 2-fold grid search that picks the SVC hyper-parameters
gs=GridSearchCV(estimator=pipe_svc,
               param_grid=param_grid,
               scoring='accuracy',
               cv=2)
#Outer loop: 5-fold CV that scores the whole grid-search procedure
scores=cross_val_score(gs,X_train,y_train,
                      scoring='accuracy',cv=5)
print('{}{:.3f}{}{:.3f}'.format('accuracy:',np.mean(scores),'+/-',np.std(scores)))
accuracy:0.974+/-0.015
#Compare the SVM against a decision tree using the same nested CV scheme
from sklearn.tree import DecisionTreeClassifier
#Inner loop: 2-fold grid search over the tree's max_depth
gs=GridSearchCV(estimator=DecisionTreeClassifier(),
               param_grid=[{'max_depth':[1,2,3,4,5,6,7,None]}],
               scoring='accuracy',
               cv=2)
#Outer loop: 5-fold CV that scores the whole grid-search procedure
scores=cross_val_score(gs,X_train,y_train,
                     scoring='accuracy',cv=5)
print('{}{:.3f}{}{:.3f}'.format('accuracy:',np.mean(scores),'+/-',np.std(scores)))
accuracy:0.941+/-0.016

可以看出,SVM的嵌套交叉验证准确率(0.974)高于决策树的嵌套交叉验证准确率(0.941)。

6.5性能评价

混淆矩阵
from sklearn.metrics import confusion_matrix
# pipe_svc is the pipeline built with SVC() default parameters, not the
# tuned best_estimator_ from the grid search.
pipe_svc.fit(X_train,y_train)
y_pred=pipe_svc.predict(X_test)
# Confusion matrix of the test-set predictions; the plot that follows labels
# its rows as true labels and its columns as predicted labels.
confmat=confusion_matrix(y_true=y_test,y_pred=y_pred)
print(confmat)
[[71  1]
 [ 2 40]]
#Visualize the confusion matrix
plt.matshow(confmat,cmap=plt.cm.Blues,alpha=0.2)
# Walk every cell in row-major order and write its count at the cell centre
# (row index -> y coordinate, column index -> x coordinate).
for (i,j),count in np.ndenumerate(confmat):
    plt.text(x=j,y=i,s=count,va='center',ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

在这里插入图片描述

精确率和召回率
#计算PRE和REC
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,f1_score
print('{}{:.3f}'.format('precsion:',precision_score(y_true=y_test,y_pred=y_pred)))
precsion:0.976
print('{}{:.3f}'.format('recall:',recall_score(y_true=y_test,y_pred=y_pred)))
recall:0.952
print('{}{:.3f}'.format('F1:',f1_score(y_true=y_test,y_pred=y_pred)))
F1:0.964
#Plug a custom scorer into the grid search
from sklearn.metrics import make_scorer,f1_score
scorer=make_scorer(f1_score,pos_label=0)#F1 score computed with class 0 treated as the positive label
# Same pipeline and parameter grid as before, now ranked by the custom
# F1 scorer instead of accuracy.
gs=GridSearchCV(estimator=pipe_svc,
               param_grid=param_grid,
               scoring=scorer,cv=10)
gs=gs.fit(X_train,y_train)
print(gs.best_score_)
0.9880771478667446
print(gs.best_params_)
{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
roc曲线
from sklearn.metrics import roc_curve,auc
# scipy.interp is a deprecated alias (the DeprecationWarning printed by this
# notebook says to use numpy.interp instead), so import the NumPy function
# directly. The name `interp` stays available to the cells below.
from numpy import interp
# Pipeline for the ROC analysis: standardization, PCA to 2 components,
# L2-penalized logistic regression with C=100.
pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=2),
                     LogisticRegression(penalty='l2',C=100,
                                       random_state=1))
# Full slice: X_train2 contains every row and every feature of X_train.
X_train2=X_train[:]
#Cross-validation splits (3 stratified folds)
cv=list(StratifiedKFold(n_splits=3).split(X_train,y_train))#materialized into a list so len(cv) works and the folds can be iterated again
cv
[(array([143, 145, 147, 148, 150, 152, 153, 154, 155, 156, 158, 159, 161,
         163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 176, 177,
         178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
         191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
         204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,
         217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
         230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
         243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
         256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
         269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
         282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
         295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,
         308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320,
         321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
         334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
         347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
         360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
         373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385,
         386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398,
         399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411,
         412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424,
         425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
         438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
         451, 452, 453, 454]),
  array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         144, 146, 149, 151, 157, 160, 162, 167, 169])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         144, 146, 149, 151, 157, 160, 162, 167, 169, 296, 300, 301, 302,
         303, 304, 305, 306, 307, 310, 314, 315, 316, 317, 318, 319, 320,
         321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
         334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
         347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
         360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
         373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385,
         386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398,
         399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411,
         412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424,
         425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
         438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
         451, 452, 453, 454]),
  array([143, 145, 147, 148, 150, 152, 153, 154, 155, 156, 158, 159, 161,
         163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 176, 177,
         178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
         191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
         204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,
         217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
         230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
         243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
         256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
         269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
         282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
         295, 297, 298, 299, 308, 309, 311, 312, 313])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
         156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
         169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
         182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
         195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
         208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
         221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
         234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
         247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
         260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
         273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
         286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 297, 298, 299,
         308, 309, 311, 312, 313]),
  array([296, 300, 301, 302, 303, 304, 305, 306, 307, 310, 314, 315, 316,
         317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
         330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342,
         343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355,
         356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368,
         369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381,
         382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394,
         395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407,
         408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420,
         421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
         434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446,
         447, 448, 449, 450, 451, 452, 453, 454]))]
# Accumulator for the fold-averaged ROC curve; mean_fpr is the common grid of
# 100 false-positive-rate values on which every fold's curve is resampled.
mean_tpr=0.0
mean_fpr=np.linspace(0,1,100)
all_tpr=[]
fig=plt.figure(figsize=(7,5))
for i,(train,test) in enumerate(cv):
    # Fit on the fold's training part, then get per-sample class
    # probabilities for its validation part.
    probas=pipe_lr.fit(X_train2[train],
                       y_train[train]).predict_proba(X_train2[test])
    # Column 1 of probas is the probability of class 1, the positive label.
    fpr,tpr,thresholds=roc_curve(y_train[test],probas[:,1],pos_label=1)
    # Interpolate this fold's TPR onto the common FPR grid so the curves can
    # be summed. np.interp replaces the deprecated scipy.interp alias that
    # triggered the DeprecationWarning; arguments and results are the same.
    mean_tpr+=np.interp(mean_fpr,fpr,tpr)
    mean_tpr[0]=0  # pin the averaged curve to start at TPR 0
    roc_auc=auc(fpr,tpr)
    plt.plot(fpr,
            tpr,
            label='{}{}{}{:.3f}{}'.format('ROC fold ',i+1,'(area:',roc_auc,')'))

#Diagonal reference line: random guessing
plt.plot([0,1],
        [0,1],
        linestyle='--',
        color=(0.6,0.6,0.6),
        label='random guesing')

#Mean ROC curve: divide the accumulated TPR by the number of folds and pin
#the end point to 1.0
mean_tpr=mean_tpr/len(cv)
mean_tpr[-1]=1.0
mean_auc=auc(mean_fpr,mean_tpr)

plt.plot(mean_fpr,mean_tpr,'k--',
        label='{}{}{:.3f}{}'.format('mean roc ','(area:',mean_auc,')'))

#Reference curve of a perfect classifier
plt.plot([0,0,1],
        [0,1,1],
        linestyle=':',
        color='k',
        label='perfect performance')


plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='best')
plt.show()

<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr
<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr
<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr

在这里插入图片描述

6.6处理类不平衡问题

#Build an imbalanced dataset: all 357 benign samples (class 0) plus only the
#first 40 malignant samples (class 1)
X_imb=np.vstack((X[y==0],X[y==1][:40]))
y_imb=np.hstack((y[y==0],y[y==1][:40]))
#Use resample to draw new class-1 samples until there are as many as in
#class 0 (benign)
from sklearn.utils import resample

print('number of class 1 samples before:',X_imb[y_imb==1].shape[0])
number of class 1 samples before: 40
# Upsample the minority class (label 1) to the size of class 0. The sample
# count grows from 40 to 357 (see the printed shapes), so rows are
# necessarily repeated; random_state fixes which rows get drawn.
X_unsample,y_unsample=resample(X_imb[y_imb==1],
        y_imb[y_imb==1],
        n_samples=X_imb[y_imb==0].shape[0],
        random_state=123)
X_unsample.shape
(357, 30)
y_unsample.shape
(357,)
print('number of class 1 samples after:',X_unsample.shape[0])
number of class 1 samples after: 357
#Stack the class-0 samples and the upsampled class-1 samples into one balanced set
X_bal=np.vstack((X[y==0],X_unsample))
y_bal=np.hstack((y[y==0],y_unsample))
#A predictor that always answers class 0 now reaches only 50% accuracy
y_pred=np.zeros(y_bal.shape[0])
np.mean(y_pred==y_bal)*100
50.0

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值