1 分类准确度及衡量指标
1.scikit-learn中的混淆矩阵,精准率,召回率
# Binary-classification metrics from scikit-learn, evaluated on the true test
# labels (y_test) against the model's predictions (y_log_predict).
# NOTE(review): y_test and y_log_predict are not defined in this snippet;
# presumably they come from an earlier logistic-regression example - confirm.
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Confusion matrix of true labels vs. predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_log_predict)
# Precision of the predictions.
precision_score(y_true=y_test, y_pred=y_log_predict)
# Recall of the predictions.
recall_score(y_true=y_test, y_pred=y_log_predict)
2. F1 Score # F1 是精准率和召回率的调和平均值:只要其中一个值很小,F1 就会很小
# F1 score for the same test labels and predictions used for precision/recall.
from sklearn.metrics import f1_score

f1_score(y_true=y_test, y_pred=y_log_predict)
3 scikit-learn中的precision-recall曲线
from sklearn.metrics import precision_recall_curve

# NOTE(review): decision_score is not assigned in this snippet; presumably it
# holds log_reg.decision_function(X_test) - confirm against the caller.
precisions, recalls, thresholds = precision_recall_curve(y_test, decision_score)

# Plot precision and recall against the threshold on the same axes. The final
# entry of each curve is dropped so the series lines up with `thresholds`.
for curve in (precisions, recalls):
    plt.plot(thresholds, curve[:-1])
plt.show()
4. 精准率和召回率的平衡
from sklearn.metrics import precision_score, recall_score

# Raw decision scores for the test set. The default binary prediction is
# "score > 0"; moving that threshold trades precision against recall.
# The scores are stored because everything below reads `decision_score`.
decision_score = log_reg.decision_function(X_test)
decision_score[:10]  # peek at the first ten scores

precision = []
recalls = []
# Sweep candidate thresholds across the observed score range in 0.01 steps.
thresholds = np.arange(np.min(decision_score), np.max(decision_score), 0.01)
for threshold in thresholds:
    # Predict the positive class (1) when the score exceeds the threshold.
    y_predict = np.array(decision_score > threshold, dtype='int')
    precision.append(precision_score(y_test, y_predict))
    recalls.append(recall_score(y_test, y_predict))

# Precision and recall as functions of the decision threshold.
plt.plot(thresholds, precision)
plt.plot(thresholds, recalls)
plt.show()
5.TPR 与 FPR
6.ROC曲线
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve: true-positive rate plotted against false-positive rate.
# NOTE(review): decision_score is not assigned in this snippet; presumably it
# holds log_reg.decision_function(X_test) - confirm.
fprs, tprs, thresholds = roc_curve(y_true=y_test, y_score=decision_score)
plt.plot(fprs, tprs)
plt.show()

# Area under the ROC curve.
roc_auc_score(y_true=y_test, y_score=decision_score)
ROC 曲线下方的面积(AUC)越大,说明算法的分类效果相对越好
7. 多分类中的混淆矩阵
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

# Load the digits dataset and separate features from labels.
digits = datasets.load_digits()
X, y = digits.data, digits.target

# test_size=0.8 keeps 80% of the samples for testing; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=666
)

# Fit a logistic-regression classifier and score it on the held-out split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

y_predict = log_reg.predict(X_test)
# An explicit `average` is passed because this is a multi-class problem.
precision_score(y_test, y_predict, average='micro')
from sklearn.metrics import confusion_matrix

# Confusion matrix of the multi-class predictions, computed once and reused.
cfm = confusion_matrix(y_test, y_predict)
cfm
plt.matshow(cfm, cmap=plt.cm.gray)
plt.show()  # Figure 1: the raw confusion matrix

# Per-row error rates: divide every row by that row's total.
# row_sums is 1-D, so it is reshaped into a column vector before dividing;
# a plain `cfm / row_sums` would broadcast across columns and divide
# column j by the sum of row j instead.
row_sums = np.sum(cfm, axis=1)
err_matrix = cfm / row_sums[:, np.newaxis]
# Zero the diagonal (correct predictions) so only the mistakes remain.
np.fill_diagonal(err_matrix, 0)
err_matrix  # fraction of each row that was misclassified into each column
plt.matshow(err_matrix, cmap=plt.cm.gray)
plt.show()  # Figure 2: per-row error proportions
图1表示混淆矩阵的图
图2 表示犯错误比例 按每一行算
混淆矩阵可以直接用于多分类问题;sklearn 的精准率和召回率同样支持多分类,但需要指定 average 参数(如 average='micro'),因为默认的 average='binary' 只适用于二分类
2 小结
衡量分类算法的指标:混淆矩阵、精准率、召回率、F1 Score、TPR、FPR、precision-recall 曲线、ROC 曲线。