第六章模型评估和超参数调优的最佳实践_test pred的三个基本表达-CSDN博客

本文链接：https://blog.csdn.net/m0_45055763/article/details/124543947

6.1 管道方法简化工作流

6.1.1 威斯康星乳腺癌数据集

569个恶性和良性细胞的样本

数据集前两列：ID和诊断结果（M=恶性，B=良性）

列3-32包含30个根据细胞核的数字化图像计算出的特征值

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df=pd.read_csv('wdbc.data',header=None)

df.head()

	0	1	2	3	4	5	6	7	8	9	...	22	23	24	25	26	27	28	29	30	31
0	842302	M	17.99	10.38	122.80	1001.0	0.11840	0.27760	0.3001	0.14710	...	25.38	17.33	184.60	2019.0	0.1622	0.6656	0.7119	0.2654	0.4601	0.11890
1	842517	M	20.57	17.77	132.90	1326.0	0.08474	0.07864	0.0869	0.07017	...	24.99	23.41	158.80	1956.0	0.1238	0.1866	0.2416	0.1860	0.2750	0.08902
2	84300903	M	19.69	21.25	130.00	1203.0	0.10960	0.15990	0.1974	0.12790	...	23.57	25.53	152.50	1709.0	0.1444	0.4245	0.4504	0.2430	0.3613	0.08758
3	84348301	M	11.42	20.38	77.58	386.1	0.14250	0.28390	0.2414	0.10520	...	14.91	26.50	98.87	567.7	0.2098	0.8663	0.6869	0.2575	0.6638	0.17300
4	84358402	M	20.29	14.34	135.10	1297.0	0.10030	0.13280	0.1980	0.10430	...	22.54	16.67	152.20	1575.0	0.1374	0.2050	0.4000	0.1625	0.2364	0.07678

5 rows × 32 columns

df.iloc[:,1].describe()

count     569
unique      2
top         B
freq      357
Name: 1, dtype: object

df.describe?

# Convert the string class labels (column 1) into integer codes; the feature
# matrix is taken from column 2 onward.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df.iloc[:, 1].values)
X = df.iloc[:, 2:].values

le.classes_#获得每一类的label

array(['B', 'M'], dtype=object)

le.inverse_transform([0,1])#确定B对应0，M对应于1

array(['B', 'M'], dtype=object)

# Split the data 80/20 into training and test sets; stratify=y is passed so
# the split is stratified on the class labels.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

np.bincount(y_train)

array([285, 170], dtype=int64)

np.bincount(y_test)

array([72, 42], dtype=int64)

6.1.2 集成管道中的转换器和评估器

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

Pipeline的fit方法会在每个中间步骤依次调用fit和transform方法，把转换后的数据逐级向后传递，直到管道中的最后一个环节——评估器，由评估器完成拟合。

pipe_lr=make_pipeline(StandardScaler(),PCA(n_components=2),
                     LogisticRegression(random_state=1))

pipe_lr.fit(X_train,y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)),
                ('logisticregression', LogisticRegression(random_state=1))])

在执行前面示例代码中的pipe_lr管道的fit方法时，StandardScaler首先在
训练集上调用fit和transform方法。然后将转换后的训练数据传递给管道的下
一个对象，即PCA。与前面的步骤类似，PCA也在调整后的输入数据基础上
调用fit和transform，并将其传递给管道中的最后一个环节，即评估器。PCA和标准化完成后，逻辑回归（评估器）完成拟合。

# Predict the held-out samples and report the pipeline's accuracy on them.
y_pred = pipe_lr.predict(X_test)
test_accuracy = pipe_lr.score(X_test, y_test)
print(f'test accuracy:{test_accuracy:.3f}')

test accuracy:0.956

6.2使用k折交叉验证评估模型性能

from sklearn.model_selection import StratifiedKFold

kfold=StratifiedKFold(n_splits=10).split(X_train,y_train)

kfold

<generator object _BaseKFold.split at 0x00000276ECC89900>

# Collect the validation accuracy of every fold so that the mean and the
# standard deviation can be computed afterwards.
scores = []
for fold_no, (train_idx, val_idx) in enumerate(kfold, start=1):
    # train_idx / val_idx are index arrays into the original training data
    pipe_lr.fit(X_train[train_idx], y_train[train_idx])
    fold_score = pipe_lr.score(X_train[val_idx], y_train[val_idx])
    scores.append(fold_score)
    class_counts = np.bincount(y_train[train_idx])
    print(f'Fold:{fold_no}class :{class_counts}score:{fold_score:.3f}')

Fold:1class :[256 153]score:0.935
Fold:2class :[256 153]score:0.935
Fold:3class :[256 153]score:0.957
Fold:4class :[256 153]score:0.957
Fold:5class :[256 153]score:0.935
Fold:6class :[257 153]score:0.956
Fold:7class :[257 153]score:0.978
Fold:8class :[257 153]score:0.933
Fold:9class :[257 153]score:0.956
Fold:10class :[257 153]score:0.956

#分层交叉验证方法
from sklearn.model_selection import cross_val_score

scores=cross_val_score(estimator=pipe_lr,X=X_train,y=y_train,
                      cv=10,
                      n_jobs=1)

print('CV accuracy scores:',scores)

CV accuracy scores: [0.93478261 0.93478261 0.95652174 0.95652174 0.93478261 0.95555556
 0.97777778 0.93333333 0.95555556 0.95555556]

print('{}{:.3f}{}{:.3f}'.format('CV accuracy:',np.mean(scores),'+/-',np.std(scores)))

CV accuracy:0.950+/-0.014

6.3 用学习和验证曲线调试算法

import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Pipeline: standardize the features, then fit an L2-penalized logistic
# regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', random_state=1),
)

# Evaluate the pipeline with 10-fold cross-validation at ten training-set
# sizes spaced evenly between 10% and 100% of the available training data.
# The call returns the sizes used plus the per-fold training and validation
# accuracies for each size.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    train_sizes=np.linspace(0.1, 1, 10),
    n_jobs=1,
)

train_sizes

array([ 40,  81, 122, 163, 204, 245, 286, 327, 368, 409])

train_scores.shape

(10, 10)

test_scores

array([[0.84782609, 0.93478261, 0.93478261, 0.97826087, 0.91304348,
        0.95555556, 0.97777778, 0.93333333, 0.97777778, 1.        ],
       [0.95652174, 0.97826087, 0.93478261, 0.97826087, 0.93478261,
        0.95555556, 0.97777778, 0.91111111, 0.97777778, 0.93333333],
       [0.95652174, 0.97826087, 0.93478261, 0.97826087, 0.93478261,
        0.95555556, 1.        , 0.91111111, 0.97777778, 0.97777778],
       [0.95652174, 0.97826087, 0.95652174, 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 1.        , 0.97777778, 0.97777778],
       [0.97826087, 0.97826087, 0.95652174, 0.95652174, 0.95652174,
        0.95555556, 0.97777778, 1.        , 0.97777778, 0.95555556],
       [0.95652174, 0.97826087, 0.97826087, 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [0.97826087, 0.97826087, 0.95652174, 0.97826087, 0.95652174,
        0.97777778, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 0.97777778, 0.97777778],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.95652174,
        0.95555556, 0.97777778, 0.97777778, 1.        , 1.        ],
       [1.        , 0.97826087, 1.        , 0.97826087, 0.93478261,
        0.95555556, 0.97777778, 0.97777778, 1.        , 1.        ]])

train_mean=np.mean(train_scores,axis=1)
train_mean

array([1.        , 1.        , 0.99344262, 0.99754601, 0.99166667,
       0.98571429, 0.98601399, 0.98746177, 0.9888587 , 0.98924205])

train_std=np.std(train_scores,axis=1)
train_std

array([0.        , 0.        , 0.00327869, 0.00300551, 0.00441176,
       0.00329072, 0.00312737, 0.00214067, 0.00146336, 0.00119779])

test_mean=np.mean(test_scores,axis=1)
test_mean

array([0.94531401, 0.95381643, 0.96048309, 0.97149758, 0.96927536,
       0.97144928, 0.9736715 , 0.97797101, 0.98241546, 0.98024155])

test_std=np.std(test_scores,axis=1)
test_std

array([0.04147213, 0.02306853, 0.02567195, 0.01396127, 0.0145413 ,
       0.00998832, 0.00857743, 0.01390465, 0.01645096, 0.02063368])

# Learning curve: mean accuracy per training-set size, with a shaded band of
# one standard deviation on either side of each mean.
plt.plot(train_sizes, train_mean, color='b',
         marker='o', markersize=5, label='training accuracy')
plt.fill_between(train_sizes, train_mean + train_std,
                 train_mean - train_std, alpha=0.15,
                 color='b')

plt.plot(train_sizes, test_mean, color='g',
         marker='s', markersize=5, label='validation accuracy',
         linestyle='--')
plt.fill_between(train_sizes, test_mean + test_std,
                 test_mean - test_std, alpha=0.15,
                 color='g')

plt.grid()  # draw grid lines
# Axis label spelling corrected (was 'trainnig').
plt.xlabel('number of training samples')
plt.ylabel('accuracy')
plt.legend(loc='best')
plt.ylim([0.85, 1.01])
plt.show()

在这里插入图片描述

验证曲线解决过拟合和欠拟合问题

from sklearn.model_selection import validation_curve

param_range=[0.001,0.01,0.1,1.0,10.0,100.0]

train_scores,test_scores=validation_curve(estimator=pipe_lr,
                                          X=X_train,
                                            y=y_train,
                                            param_name='logisticregression__C',
                                            param_range=param_range,
                                            cv=10)

# Mean and spread of the training accuracy for each value of C
# (axis=1 runs over the cross-validation folds).
train_mean = np.mean(train_scores, axis=1)
# The spread has to be the standard deviation. Using the mean here would make
# the shaded band in the plot run from 0 to 2*train_mean instead of
# mean +/- one standard deviation.
train_std = np.std(train_scores, axis=1)

train_mean.shape,train_std.shape

((6,), (6,))

test_scores.shape

(6, 10)

# Mean validation accuracy for each value of C, averaged over however many
# CV folds test_scores contains (no hard-coded fold count).
test_mean = np.mean(test_scores, axis=1)

test_mean.shape

(6,)

test_std=np.std(test_scores,axis=1)
test_std

array([0.05078637, 0.03282807, 0.01815352, 0.02063368, 0.02197085,
       0.01690883])

test_mean.shape

(6,)

test_mean

array([0.88574879, 0.94289855, 0.97589372, 0.98024155, 0.97149758,
       0.95608696])

# Validation curve: mean accuracy against C, with a shaded band of
# mean +/- std around each line. Each entry is (means, stds, line style).
curves = (
    (train_mean, train_std,
     dict(color='b', marker='o', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='g', linestyle='--', marker='s', markersize=5,
          label='validation accuracy')),
)
for mean, std, line_style in curves:
    plt.plot(param_range, mean, **line_style)
    plt.fill_between(param_range, mean + std, mean - std,
                     alpha=0.15, color=line_style['color'])

# C spans several orders of magnitude, so use a logarithmic x axis.
plt.xscale('log')
plt.ylim([0.8, 1.03])
plt.xlabel('C')
plt.ylabel('accuracy')
plt.grid()
plt.legend(loc='best')
plt.show()

在这里插入图片描述

最佳C值在0.1到1之间

6.4网格搜索为机器学习模型调优

网格搜索超参数调优

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Pipeline: standardize the features, then fit a support vector classifier.
pipe_svc = make_pipeline(StandardScaler(), SVC())

# Candidate values, shared by C and gamma.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: a linear SVM tuned over C only, and an RBF-kernel SVM tuned
# over both C and gamma.
linear_grid = {'svc__C': param_range,
               'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range,
            'svc__gamma': param_range,
            'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# Exhaustive search scored by 10-fold cross-validated accuracy; n_jobs=-1
# is passed to run the search in parallel.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

gs = gs.fit(X_train, y_train)

#获得最佳模型的准确率
print(gs.best_score_)

0.9846859903381642

#获得模型的最好参数
print(gs.best_params_)

{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}

print(gs.best_estimator_)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=100.0, gamma=0.001))])

# GridSearchCV was created with the default refit=True, so best_estimator_
# has already been refitted on the whole of X_train with the best parameters.
# A further clf.fit(X_train, y_train) would only repeat that work.
clf = gs.best_estimator_
clf

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=100.0, gamma=0.001))])

print('{}{:.3f}'.format('test accuracy:',clf.score(X_test,y_test)))

test accuracy:0.974

嵌套交叉验证

# Inner loop: 2-fold grid search that selects the SVC hyperparameters.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
)

# Outer loop: 5-fold cross-validation of the whole grid-search procedure.
scores = cross_val_score(
    gs,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)

print('{}{:.3f}{}{:.3f}'.format('accuracy:',np.mean(scores),'+/-',np.std(scores)))

accuracy:0.974+/-0.015

#比较svm和决策树
from sklearn.tree import DecisionTreeClassifier

# Inner loop: 2-fold grid search over the maximum tree depth.
# random_state is fixed so that the nested-CV estimate is reproducible from
# run to run, like the other seeded estimators in this walkthrough.
gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)

# Outer loop: 5-fold cross-validation of the whole grid-search procedure.
scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)

print('{}{:.3f}{}{:.3f}'.format('accuracy:',np.mean(scores),'+/-',np.std(scores)))

accuracy:0.941+/-0.016

可以看出，SVM的嵌套交叉验证准确率（0.974）高于决策树的嵌套交叉验证准确率（0.941）。

6.5性能评价

混淆矩阵

from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train,y_train)
y_pred=pipe_svc.predict(X_test)

confmat=confusion_matrix(y_true=y_test,y_pred=y_pred)

print(confmat)

[[71  1]
 [ 2 40]]

# Draw the confusion matrix as a shaded grid and write each count in its cell.
plt.matshow(confmat, cmap=plt.cm.Blues, alpha=0.2)
for (row, col), count in np.ndenumerate(confmat):
    # x is the column (predicted label), y is the row (true label)
    plt.text(x=col, y=row, s=count,
             va='center', ha='center')
plt.xlabel('predicted label')
plt.ylabel('true label')
plt.show()

在这里插入图片描述

精确率和召回率

#计算PRE和REC
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,f1_score

print('{}{:.3f}'.format('precsion:',precision_score(y_true=y_test,y_pred=y_pred)))

precsion:0.976

print('{}{:.3f}'.format('recall:',recall_score(y_true=y_test,y_pred=y_pred)))

recall:0.952

print('{}{:.3f}'.format('F1:',f1_score(y_true=y_test,y_pred=y_pred)))

F1:0.964

# Combine the grid search with a custom scoring function.
from sklearn.metrics import make_scorer, f1_score

# F1 score computed with class 0 treated as the positive label.
scorer = make_scorer(f1_score, pos_label=0)

# fit() returns the fitted search object, so construction and fitting can be
# chained into one assignment.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring=scorer,
    cv=10,
).fit(X_train, y_train)

print(gs.best_score_)

0.9880771478667446

print(gs.best_params_)

{'svc__C': 100.0, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}

roc曲线

from sklearn.metrics import roc_curve,auc
# scipy.interp is a deprecated alias of numpy.interp (SciPy's own warning says
# to use numpy.interp instead), so bind the same name from NumPy directly.
from numpy import interp

# Pipeline for the ROC analysis: standardize, project onto two principal
# components, then fit an L2-penalized logistic regression with C=100.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(penalty='l2', C=100, random_state=1),
)

# Full slice: every row and every feature column of X_train is kept.
X_train2 = X_train[:]

# Cross-validation: materialize the 3-fold stratified splits as a list so
# they can be iterated, and their number taken with len(), more than once.
splitter = StratifiedKFold(n_splits=3)
cv = list(splitter.split(X_train, y_train))
cv

[(array([143, 145, 147, 148, 150, 152, 153, 154, 155, 156, 158, 159, 161,
         163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 176, 177,
         178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
         191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
         204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,
         217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
         230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
         243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
         256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
         269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
         282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
         295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307,
         308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320,
         321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
         334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
         347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
         360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
         373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385,
         386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398,
         399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411,
         412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424,
         425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
         438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
         451, 452, 453, 454]),
  array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         144, 146, 149, 151, 157, 160, 162, 167, 169])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         144, 146, 149, 151, 157, 160, 162, 167, 169, 296, 300, 301, 302,
         303, 304, 305, 306, 307, 310, 314, 315, 316, 317, 318, 319, 320,
         321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
         334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346,
         347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
         360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372,
         373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385,
         386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398,
         399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411,
         412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424,
         425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
         438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450,
         451, 452, 453, 454]),
  array([143, 145, 147, 148, 150, 152, 153, 154, 155, 156, 158, 159, 161,
         163, 164, 165, 166, 168, 170, 171, 172, 173, 174, 175, 176, 177,
         178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
         191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
         204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,
         217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
         230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
         243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
         256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
         269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
         282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294,
         295, 297, 298, 299, 308, 309, 311, 312, 313])),
 (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
          13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
          26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
          39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
          52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
          78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
          91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
         104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
         117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
         130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
         143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
         156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
         169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
         182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
         195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
         208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
         221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
         234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
         247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
         260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
         273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
         286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 297, 298, 299,
         308, 309, 311, 312, 313]),
  array([296, 300, 301, 302, 303, 304, 305, 306, 307, 310, 314, 315, 316,
         317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
         330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342,
         343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355,
         356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368,
         369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381,
         382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394,
         395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407,
         408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420,
         421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433,
         434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446,
         447, 448, 449, 450, 451, 452, 453, 454]))]

# Accumulator for the per-fold TPR curves, all evaluated on one common grid
# of 100 evenly spaced FPR values so that they can be averaged.
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

fig = plt.figure(figsize=(7, 5))
for i, (train, test) in enumerate(cv):
    # Class-membership probabilities for the held-out part of this fold.
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])
    fpr, tpr, thresholds = roc_curve(y_train[test], probas[:, 1], pos_label=1)
    # Interpolate this fold's TPR onto the shared FPR grid. np.interp is used
    # because scipy.interp is deprecated in favor of it.
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0  # the curve starts at the origin
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             label=f'ROC fold {i + 1}(area:{roc_auc:.3f})')

# Diagonal reference line: the ROC of random guessing.
plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

# Average the accumulated curves over the folds and pin the end point to 1.
mean_tpr = mean_tpr / len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)

plt.plot(mean_fpr, mean_tpr, 'k--',
         label=f'mean roc (area:{mean_auc:.3f})')

# ROC of a perfect classifier: straight up to TPR=1, then across.
plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='k',
         label='perfect performance')


plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc='best')
plt.show()

<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr
<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr
<ipython-input-85-909988fb7697>:6: DeprecationWarning: scipy.interp is deprecated and will be removed in SciPy 2.0.0, use numpy.interp instead
  mean_tpr+=interp(mean_fpr,fpr,tpr)#插值得到fpr

在这里插入图片描述

6.6处理类不平衡问题

# Build an imbalanced data set: every class-0 (benign) sample plus only the
# first 40 class-1 (malignant) samples.
benign_mask = y == 0
malignant_mask = y == 1
X_imb = np.vstack((X[benign_mask], X[malignant_mask][:40]))
y_imb = np.hstack((y[benign_mask], y[malignant_mask][:40]))

#使用resample反复提取新样本，包含与良性（0）同样的样本
from sklearn.utils import resample

print('number of class 1 samples before:',X_imb[y_imb==1].shape[0])

number of class 1 samples before: 40

# Upsample the minority class: repeatedly draw class-1 samples until there
# are as many of them as there are class-0 samples.
minority = y_imb == 1
n_majority = X_imb[y_imb == 0].shape[0]
X_unsample, y_unsample = resample(
    X_imb[minority],
    y_imb[minority],
    n_samples=n_majority,
    random_state=123,
)

X_unsample.shape

(357, 30)

y_unsample.shape

(357,)

print('number of class 1 samples after:',X_unsample.shape[0])

number of class 1 samples after: 357

# Stack the resampled class-1 rows under the original class-0 rows.
class0 = y == 0
X_bal = np.vstack((X[class0], X_unsample))
y_bal = np.hstack((y[class0], y_unsample))

# Baseline: a majority-vote rule that always predicts class 0. The value
# below is its accuracy on the balanced set, in percent.
y_pred = np.zeros(len(y_bal))
np.mean(y_pred == y_bal) * 100

50.0

第六章 模型评估和超参数调优的最佳实践

6.1 管道方法简化工作流

6.1.1 威斯康星乳腺癌数据集

6.1.2 集成管道中的转换器和评估器

6.2使用k折交叉验证评估模型性能

6.3 用学习和验证曲线调试算法

验证曲线解决过拟合和欠拟合问题

6.4网格搜索为机器学习模型调优

网格搜索超参数调优

嵌套交叉验证

6.5性能评价

混淆矩阵

精确率和召回率

roc曲线

6.6处理类不平衡问题

第六章模型评估和超参数调优的最佳实践