1.精准率和召回率受决策边界的影响
导入数据
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
# Load the handwritten-digits dataset bundled with scikit-learn.
digits = datasets.load_digits()
X = digits.data
# Turn the ten-class target into a binary one: 1 where the digit is 9,
# 0 everywhere else. The original integer dtype of the target is kept.
y = (digits.target == 9).astype(digits.target.dtype)
数据分割
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies;
# the fixed random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=666,
)
逻辑回归
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyper-parameters.
# `fit` returns the estimator itself, so construction and training are chained.
# NOTE(review): on unscaled pixel features, recent scikit-learn versions may
# emit a ConvergenceWarning with the default iteration limit — confirm, and
# consider raising max_iter if so.
log_reg = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out set using the default decision boundary.
y_predict = log_reg.predict(X_test)
各种衡量指标
from sklearn.metrics import (
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Each expression below is evaluated for interactive display (notebook / REPL);
# nothing is stored.

# F1 score
f1_score(y_true=y_test, y_pred=y_predict)
# Confusion matrix
confusion_matrix(y_true=y_test, y_pred=y_predict)
# Precision
precision_score(y_true=y_test, y_pred=y_predict)
# Recall
recall_score(y_true=y_test, y_pred=y_predict)
2.改变决策边界
逻辑回归中:以 X·theta = 0 为决策边界;X·theta > 0 时预测值为 1,X·theta < 0 时预测值为 0。
# Raw decision scores of the fitted model on the test set. They are computed
# once and reused for every custom threshold below.
decision_scores = log_reg.decision_function(X_test)

# The default decision boundary is 0.
decision_scores[:10]  # author's note: these scores are all below 0
log_reg.predict(X_test)[:10]  # author's note: these predictions are all 0

# Move the decision boundary up to 5: only scores >= 5 count as positive.
y_predict_2 = (decision_scores >= 5).astype('int')
# Confusion matrix
confusion_matrix(y_test, y_predict_2)
# Precision
precision_score(y_test, y_predict_2)
# Recall
recall_score(y_test, y_predict_2)

# Move the decision boundary down to -5: scores >= -5 count as positive.
y_predict_3 = (decision_scores >= -5).astype('int')
# Confusion matrix
confusion_matrix(y_test, y_predict_3)
# Precision
precision_score(y_test, y_predict_3)
# Recall
recall_score(y_test, y_predict_3)
3.精准率-召回率曲线
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# Precision and recall of the test set at each candidate threshold derived
# from the decision scores.
precisions, recalls, thresholds = precision_recall_curve(y_test, decision_scores)

# Fonts that can render the Chinese labels used below, in priority order.
# SimHei stays first, so machines that have it behave as before; the other
# names are fallbacks for systems without SimHei, and whatever was configured
# previously is kept at the end as a last resort. dict.fromkeys removes
# duplicates while preserving order, so re-running this block is harmless.
plt.rcParams['font.sans-serif'] = list(dict.fromkeys(
    [
        'SimHei',
        'Microsoft YaHei',
        'PingFang SC',
        'Noto Sans CJK SC',
        'WenQuanYi Micro Hei',
        'Arial Unicode MS',
    ]
    + list(plt.rcParams['font.sans-serif'])
))
# Render the minus sign with an ASCII hyphen so it displays correctly with
# the fonts selected above.
plt.rcParams['axes.unicode_minus'] = False

# Precision and recall as functions of the threshold. Both arrays are sliced
# with [:-1] so their lengths line up with `thresholds`.
plt.plot(thresholds, precisions[:-1], label="精准率")
plt.plot(thresholds, recalls[:-1], label="召回率")
plt.legend()
plt.xlabel("决策边界值-thresholds", fontsize=15)
plt.ylabel("百分率", fontsize=15)
plt.show()

# Precision plotted against recall (precision on the x-axis, recall on the y-axis).
plt.plot(precisions, recalls)
plt.xlabel("precisions", fontsize=15)
plt.ylabel("recalls", fontsize=15)
plt.show()
4.ROC曲线
from sklearn.metrics import roc_curve

# False-positive and true-positive rates over the range of score thresholds.
# NOTE: `thresholds` is reassigned here; from this point on the name holds the
# ROC thresholds, not the ones returned earlier by precision_recall_curve.
fprs, tprs, thresholds = roc_curve(y_true=y_test, y_score=decision_scores)

# ROC curve: false-positive rate on the x-axis, true-positive rate on the y-axis.
plt.plot(fprs, tprs)
plt.show()
ROC 曲线下方的面积(AUC):取值范围为 [0, 1],越大越好。
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed from the raw decision scores.
# Evaluated for interactive display; the value is not stored.
roc_auc_score(y_true=y_test, y_score=decision_scores)