Python绘制混淆矩阵、P-R曲线、ROC曲线
根据二分类问题的预测结果,使用Python绘制混淆矩阵、P-R曲线和ROC曲线
Base
import itertools

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Prepare the training and test sets. X and y must already be defined; they
# are not created in this snippet. 10% of the samples are held out for testing.
train_x, test_x, train_y, test_y = train_test_split(
    X, y, test_size=0.1, random_state=33
)
# Fit a logistic-regression classifier, then predict labels for the test set.
clf = LogisticRegression().fit(train_x, train_y)
predict_y = clf.predict(test_x)
混淆矩阵
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# Print the evaluation scores of a binary classifier.
def show_metrics(matrix=None):
    """Print TP/FN/FP/TN, precision, recall and F1 for a 2x2 confusion matrix.

    Args:
        matrix: Confusion matrix indexed as ``matrix[true][predicted]``,
            e.g. a 2x2 NumPy array or a nested list. When omitted, the
            module-level ``cm`` is used, which is what the original
            zero-argument call relied on.

    Precision, recall and F1 are reported as 0.000 when their denominator
    is zero (no predicted positives, no actual positives, or both scores
    zero) instead of failing or printing ``nan``.
    """
    if matrix is None:
        matrix = cm

    # Chained indexing works for both NumPy arrays and nested lists.
    tp = matrix[1][1]
    fn = matrix[1][0]
    fp = matrix[0][1]
    tn = matrix[0][0]

    # Compute each score once instead of repeating the ratios inside F1.
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = 2 * _safe_ratio(precision * recall, precision + recall)

    print('TP:{}\nFN:{}\nFP:{}\nTN:{}'.format(tp, fn, fp, tn))
    print('精确率: {:.3f}'.format(precision))
    print('召回率: {:.3f}'.format(recall))
    print('F1值: {:.3f}'.format(f1))
# Compute the confusion matrix first: show_metrics() reads the module-level
# `cm`, so it has to exist before the call below.
cm = confusion_matrix(test_y, predict_y)
class_names = [0, 1]

# Print the evaluation scores derived from the confusion matrix.
show_metrics()
# Output recorded in the original write-up:
# TP:39
# FN:21
# FP:7
# TN:28414
# 精确率: 0.848
# 召回率: 0.650
# F1值: 0.736
# Visualize a confusion matrix as an annotated heat map.
def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw ``cm`` as a colour-mapped image with a text label in every cell.

    Args:
        cm: 2-D confusion matrix. Rows are drawn along the "True label"
            axis and columns along the "Predicted label" axis.
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When True, every row is divided by its sum so the cells
            show per-true-class fractions (two decimals). When False the
            values are shown as given.
        title: Figure title.
        cmap: Matplotlib colormap used for the image.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row without samples would divide by zero; leave it as zeros.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
    cell_format = '{:.2f}' if normalize else '{}'

    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text so the label stays
    # readable on the darker background colour.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are included.
    plt.tight_layout()
    plt.show()


plot_confusion_matrix(cm, classes=class_names, title='逻辑回归 混淆矩阵')
P-R 曲线
# Plot the precision-recall (P-R) curve.
def plot_precision_recall():
    """Draw the P-R curve from the module-level ``recall`` and ``precision``.

    The curve is drawn as a faint step line with the area underneath shaded,
    then overlaid with a solid line. The figure is shown immediately and
    nothing is returned.
    """
    faint_blue = {'color': 'b', 'alpha': 0.2}
    # Step outline of the P-R curve.
    plt.step(recall, precision, where='post', **faint_blue)
    # Shade the area under the step curve.
    plt.fill_between(recall, precision, step='post', **faint_blue)
    plt.plot(recall, precision, linewidth=2)
    plt.xlim((0.0, 1))
    plt.ylim((0.0, 1.05))
    plt.xlabel('召回率')
    plt.ylabel('精确率')
    plt.title('精确率-召回率 曲线')
    plt.show()
# Confidence scores of the test samples, taken from the fitted classifier.
score_y = clf.decision_function(test_x)
# Compute precision, recall and thresholds for the visualization;
# plot_precision_recall() reads the module-level `precision` and `recall`.
precision, recall, thresholds = precision_recall_curve(test_y, score_y)
plot_precision_recall()
ROC曲线
def plot_roc_auc(test_y, score_y):
    """Print the AUC and plot the ROC curve.

    Args:
        test_y: True labels, passed to ``roc_curve``.
        score_y: Scores for the same samples, passed to ``roc_curve``.

    Prints ``AUC:<value>``, then shows a 10x10 figure with the ROC curve and
    the dashed diagonal from (0, 0) to (1, 1). Nothing is returned.
    """
    # False-positive and true-positive rates; the thresholds are not needed.
    fpr, tpr, _ = roc_curve(test_y, score_y)
    roc_auc = auc(fpr, tpr)  # area under the ROC curve
    print('AUC:{}'.format(roc_auc))

    # Create exactly one figure: an extra bare plt.figure() call before this
    # one would leave an empty figure behind.
    plt.figure(figsize=(10, 10))
    # False-positive rate on the x axis, true-positive rate on the y axis.
    plt.plot(fpr, tpr, color='darkorange', linewidth=2,
             label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', linewidth=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
# Confidence scores of the test samples, taken from the fitted classifier.
score_y = clf.decision_function(test_x)
fpr, tpr, threshold = roc_curve(test_y, score_y)  # false/true positive rates
roc_auc = auc(fpr, tpr)  # area under the ROC curve
# plot_roc_auc() prints the AUC itself, so it is not printed here as well;
# doing both emitted the same line twice.
plot_roc_auc(test_y, score_y)