绘制ROC曲线,计算AUC,涉及到sklearn.metrics。
一般的分类器都提供 predict_proba 或 decision_function 方法,用其中之一可以得到每个样本的预测得分(正类概率或决策函数值),作为 roc_curve 的 y_score 参数。
注意:本文示例中,ROC曲线和AUC都是用训练集的数据计算的;若要评估模型的泛化性能,应改用测试集的数据。
sklearn.metrics.roc_curve(y_true,y_score, pos_label=None, sample_weight=None, drop_intermediate=True)
导入
from sklearn.metrics import roc_curve, auc
ROC曲线及AUC计算
def plotROC(classifier, trainData, trainLabel):
    """Plot the ROC curve of a fitted binary classifier and show its AUC.

    Args:
        classifier: An already fitted estimator that exposes
            ``decision_function`` or ``predict_proba``.
        trainData: Samples to score; handed to the classifier unchanged.
        trainLabel: True binary labels matching ``trainData``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if hasattr(classifier, 'decision_function'):
        # Method 2 (preferred): continuous decision-function values.
        scores = classifier.decision_function(trainData)
    else:
        # Method 1 (fallback for estimators without decision_function):
        # use the second predict_proba column as the score.
        # NOTE(review): assumes that column is the positive class - confirm
        # against classifier.classes_.
        scores = classifier.predict_proba(trainData)[:, 1]

    false_positive_rate, true_positive_rate, _ = roc_curve(trainLabel, scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)

    # ROC curve, labelled with the AUC value.
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# -*-coding:utf-8 -*-
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def loadDataSet(fileName):
    """Load a tab-separated numeric data set from a text file.

    Every non-blank line holds the feature values followed by the class
    label in the last column. The column count is taken from the first
    non-blank line; blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple. ``dataMat`` is a list of feature
        rows (lists of floats) and ``labelMat`` is the list of float labels.
        Both are empty when the file contains no data lines.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a line has fewer columns than the first data line.
    """
    # Read the file once and close it deterministically.
    with open(fileName) as fr:
        rows = [line for line in fr if line.strip()]

    if not rows:
        return [], []

    # The last column is the label, so there are numFeat - 1 features.
    numFeat = len(rows[0].split('\t'))
    dataMat = []
    labelMat = []
    for line in rows:
        curLine = line.strip().split('\t')
        # Index explicitly so a short line raises instead of being truncated.
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
def _errorRate(predictions, labels):
    """Return the percentage of predictions that differ from labels."""
    mismatches = np.asarray(predictions) != np.asarray(labels)
    return float(np.mean(mismatches) * 100)


def classific0():
    """Train AdaBoost on the horse-colic data, print error rates, plot ROC.

    Reads 'horseColicTraining2.txt' and 'horseColicTest2.txt' with
    loadDataSet, fits an AdaBoost ensemble of depth-2 decision trees on the
    training set, prints the error rate on both sets, and finally draws the
    ROC curve for the training data with plotROC.
    """
    dataArr, classLabels = loadDataSet('horseColicTraining2.txt')
    testArr, testLabelArr = loadDataSet('horseColicTest2.txt')

    # Train.
    # NOTE(review): recent scikit-learn releases deprecate the `algorithm`
    # argument - confirm against the installed version.
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             algorithm='SAMME', n_estimators=10)
    bdt.fit(dataArr, classLabels)

    # Error rate = share of mismatching predictions, as a percentage.
    # A plain boolean mean is used instead of np.mat, which NumPy 2.0 removed.
    print('训练集的错误率:%.3f%%' % _errorRate(bdt.predict(dataArr), classLabels))
    print('测试集的错误率:%.3f%%' % _errorRate(bdt.predict(testArr), testLabelArr))

    plotROC(bdt, dataArr, classLabels)
def plotROC(classifier, trainData, trainLabel):
    """Plot the ROC curve of a fitted binary classifier and show its AUC.

    Args:
        classifier: An already fitted estimator that exposes
            ``decision_function`` or ``predict_proba``.
        trainData: Samples to score; handed to the classifier unchanged.
        trainLabel: True binary labels matching ``trainData``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if hasattr(classifier, 'decision_function'):
        # Method 2 (preferred): continuous decision-function values.
        scores = classifier.decision_function(trainData)
    else:
        # Method 1 (fallback for estimators without decision_function):
        # use the second predict_proba column as the score.
        # NOTE(review): assumes that column is the positive class - confirm
        # against classifier.classes_.
        scores = classifier.predict_proba(trainData)[:, 1]

    false_positive_rate, true_positive_rate, _ = roc_curve(trainLabel, scores)
    roc_auc = auc(false_positive_rate, true_positive_rate)

    # ROC curve, labelled with the AUC value.
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# Run the AdaBoost demo only when this file is executed as a script.
if __name__ == "__main__":
    classific0()
可以看出,使用法二(decision_function)得到的曲线更平滑,而法一(predict_proba)得到的曲线更接近书上的图。
评价
Concerning the AUC, a simple rule of thumb to evaluate a classifier based on this summary value is the following:
- .90-1 = very good (A)
- .80-.90 = good (B)
- .70-.80 = not so good (C)
- .60-.70 = poor (D)
- .50-.60 = fail (F)