分类问题ROC曲线和KS曲线的绘制

最新推荐文章于 2025-03-24 10:48:39 发布

瑞行AI

最新推荐文章于 2025-03-24 10:48:39 发布

阅读量1.1w

点赞数

分类专栏：算法实现

本文链接：https://blog.csdn.net/cymy001/article/details/79613787

版权

算法实现专栏收录该内容

37 篇文章

订阅专栏

ROC曲线与AUC

ROC曲线下方的面积是AUC。AUC的含义是：随机选取一个正类别样本和一个负类别样本，分类器对正样本给出的预测得分高于对负样本给出的预测得分的概率。
分类问题的模型评估理论参考：
http://blog.csdn.net/cymy001/article/details/79366754
Python机器学习库sklearn分类问题的模型评估API参考：
http://blog.csdn.net/cymy001/article/details/79425233

import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize

# Load the iris data set used throughout this example.
iris = datasets.load_iris()
X = iris.data   # 2-D numpy array of shape (150, 4)
y = iris.target   # 1-D numpy array of labels, shape (150,)

# Binarize the output: one indicator column per class,
# i.e. [0 1 2] ==> [[1 0 0], [0 1 0], [0 0 1]].
# Without this step y stays 1-D, so y.shape[1] raises IndexError and the
# per-class column slicing y_test[:, i] used for the ROC curves cannot work.
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]   # number of distinct classes

# Add noisy features to make the problem harder
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape   # 150 samples, 4 features
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]
# np.c_ concatenates its arguments along the second axis; this appends
# 800 columns of noise features.
# randn draws samples from the standard normal distribution; it is called on
# a seeded np.random.RandomState(0) here so the noise is reproducible.
# (np.random.rand, by contrast, draws uniform samples from [0, 1).)

# shuffle and split training and test sets (half of the data each)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0)

sklearn库API——decision_function(ROC阈值)

%matplotlib inline
import matplotlib.pyplot as plt
from numpy import interp   # scipy.interp was only a deprecated alias of numpy.interp
from sklearn import svm   # required for svm.SVC below
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier

# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
y_score = classifier.fit(X_train, y_train).decision_function(X_test)
# For SVC, decision_function returns each sample's distance to the decision boundary.
# For logistic regression, decision_function returns a confidence score:
# the signed distance of the sample to the hyperplane.

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    # y_test[:, i]: true indicator labels of class i;
    # y_score[:, i]: decision_function scores for class i (scores, not
    # probabilities) -- roc_curve only needs a ranking of the samples.
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    # Area under the ROC curve: fpr is the false positive rate array (x axis),
    # tpr is the true positive rate array (y axis).
    roc_auc[i] = auc(fpr[i], tpr[i])

# Draw the ROC curve of one class (index 2) together with the chance diagonal.
plt.rcParams['figure.figsize'] = (8, 5)
plt.figure()

class_label = 'ROC curve (area = %0.2f)' % roc_auc[2]
plt.plot(fpr[2], tpr[2], color='darkorange', label=class_label)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')   # reference diagonal

# Axis labels and title first, then the visible range and the legend.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")

这里写图片描述

Compute micro-average ROC curve and ROC area：先计算每一类的混淆矩阵，将各类混淆矩阵求和取平均（除以类别数3），再由合并后的混淆矩阵计算tpr和fpr

import numpy as np
# Micro-average: pool every (label, score) pair of all classes into one
# binary problem by flattening both matrices into vectors with ravel().
flat_true = y_test.ravel()
flat_score = y_score.ravel()
micro_fpr, micro_tpr, _ = roc_curve(flat_true, flat_score)
fpr["micro"] = micro_fpr
tpr["micro"] = micro_tpr
roc_auc["micro"] = auc(micro_fpr, micro_tpr)

Compute macro-average ROC curve and ROC area：先分别计算出每一类的tpr和fpr，再对各类结果求和取平均（除以类别数3）

import numpy as np
# First aggregate all false positive rates: concatenate the per-class fpr
# arrays into one array and keep the sorted unique values.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)   # all-zero array with the same shape as all_fpr
for i in range(n_classes):
    # 1-D linear interpolation: (fpr[i], tpr[i]) are the known curve points,
    # all_fpr holds the x coordinates at which the curve is evaluated.
    # np.interp is used directly; scipy.interp was only a deprecated alias of it.
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# https://numpy.org/doc/stable/reference/generated/numpy.interp.html

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

# Plot the micro-average, macro-average and per-class ROC curves in one figure.
plt.rcParams['figure.figsize'] = (8, 5)
plt.figure()

# Averaged curves, drawn dotted: (dictionary key, label template, line color).
averaged_curves = (
    ("micro", 'micro-average ROC curve (area = {0:0.2f})', 'deeppink'),
    ("macro", 'macro-average ROC curve (area = {0:0.2f})', 'navy'),
)
for avg_key, avg_template, avg_color in averaged_curves:
    plt.plot(fpr[avg_key], tpr[avg_key], label=avg_template.format(roc_auc[avg_key]),
             color=avg_color, linestyle=':')

# One solid curve per class; cycle() is an endless iterator over the palette.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for class_idx in range(n_classes):
    class_label = 'ROC curve of class {0} (area = {1:0.2f})'.format(class_idx, roc_auc[class_idx])
    plt.plot(fpr[class_idx], tpr[class_idx], color=next(colors), label=class_label)

plt.plot([0, 1], [0, 1], 'k--')   # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")

这里写图片描述

带交叉验证的ROC曲线

准备数据

import numpy as np
from numpy import interp   # scipy.interp was only a deprecated alias of numpy.interp
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold

# Data IO and generation

# Import some data to play with
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Drop class 2: the cross-validated ROC curves below only handle a
# binary classification problem.
keep = y != 2
X, y = X[keep], y[keep]
n_samples, n_features = X.shape

# Add noisy features (200 * n_features columns of seeded standard-normal noise)
random_state = np.random.RandomState(0)
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

交叉验证计算auc，画图

# Classification and ROC analysis

# Run classifier with cross-validation and plot ROC curves
cv = StratifiedKFold(n_splits=6)
classifier = svm.SVC(kernel='linear', probability=True, random_state=random_state)

tprs = []   # one interpolated tpr array (length 100) per fold
aucs = []   # one AUC value per fold
mean_fpr = np.linspace(0, 1, 100)   # common fpr grid shared by all folds

# train and test are the index arrays of the current cross-validation split;
# enumerate supplies the fold number used in the legend label.
for i, (train, test) in enumerate(cv.split(X, y)):
    # predict_proba plays a role similar to decision_function here:
    # column 1 is used as the score of the positive class.
    probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
    # Compute ROC curve and area under the curve for this fold
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    # 1-D linear interpolation of this fold's ROC curve onto the common
    # mean_fpr grid, so that the folds can be averaged position by position.
    # np.interp is used directly; scipy.interp was only a deprecated alias of it.
    # https://numpy.org/doc/stable/reference/generated/numpy.interp.html
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0   # force every interpolated curve to start at (0, 0)
    tprs.append(fold_tpr)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='r', label='Luck', alpha=.8)

# Average the per-fold tpr arrays position by position (every position
# corresponds to the same value of mean_fpr) and pin the end point to 1.
fold_tprs = np.asarray(tprs)
mean_tpr = fold_tprs.mean(axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.asarray(aucs).std()
mean_label = r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc)
plt.plot(mean_fpr, mean_tpr, color='b', label=mean_label, alpha=.8)

# Shaded band of one standard deviation around the mean curve, limited to [0, 1].
std_tpr = fold_tprs.std(axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
band_label = r'$\pm$ 1 std. dev.'
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2, label=band_label)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.legend(loc="lower right")

这里写图片描述

K-S曲线与KS值

K-S曲线是正样本洛伦兹曲线与负样本洛伦兹曲线的差值曲线，用来度量阳性与阴性分类区分程度的。K-S曲线的最高点(最大值)定义为KS值，KS值越大，模型的区分度越好。
以Logistic Regression为例，说明K-S曲线的做法：
（1）把Logistic Regression模型对样本的输出概率(predict_proba)从大到小排序得 $rank-threshold$
（2）计算阈值取每个概率时对应的TPR和FPR值
（3）以 $(rank-threshold，TPR)$ 画正样本曲线，以 $(rank-threshold，FPR)$ 画负样本曲线，就构成K-S曲线
（4）K-S曲线上的KS值，即 $max(TPR-FPR)$ ，即两条曲线间的最大间隔距离。
这里写图片描述