AI模型评估：准确率、召回率与F1分数的实战应用-CSDN博客

本文链接：https://blog.csdn.net/qq_16242613/article/details/148027722

在这里插入图片描述

前些天发现了一个巨牛的人工智能学习网站，通俗易懂，风趣幽默，忍不住分享一下给大家。点击跳转到网站https://www.captainbed.cn/north
在这里插入图片描述

文章目录

一、基础概念解析

1.1 混淆矩阵（Confusion Matrix）

混淆矩阵是分类模型评估的基础，它展示了模型预测结果与实际标签的对应关系：

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, classes):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    ``classes`` supplies the tick labels for both axes; the y-axis is labelled
    as the true label and the x-axis as the predicted label.
    """
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': classes,
        'yticklabels': classes,
    }

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('混淆矩阵')
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    plt.show()

# Example usage: eight binary ground-truth labels and the predictions to
# compare them against, plotted with display names for the two classes.
y_true = [0, 1, 0, 1, 1, 0, 0, 1]
y_pred = [0, 1, 1, 1, 0, 0, 0, 1]
plot_confusion_matrix(y_true, y_pred, classes=['负类', '正类'])

1.2 关键术语定义

术语	公式	解释
真正例(TP)	-	实际为正例，且被模型预测为正例的样本
假正例(FP)	-	实际为负例，但被模型误判为正例的样本
真负例(TN)	-	实际为负例，且被模型预测为负例的样本
假负例(FN)	-	实际为正例，但被模型误判为负例的样本
准确率	(TP+TN)/(TP+FP+TN+FN)	所有预测正确的比例
召回率	TP/(TP+FN)	实际正例中被正确预测的比例
精确率	TP/(TP+FP)	预测为正例中实际正例的比例
F1分数	2×(精确率×召回率)/(精确率+召回率)	精确率和召回率的调和平均

二、核心指标实现

2.1 手动计算实现

def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics by hand.

    Labels are compared against 1 (positive) and 0 (negative); a pair in
    which either value is neither 0 nor 1 is not counted in any cell.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, same length as ``y_true``.

    Returns:
        Dict with the float scores 'Accuracy', 'Precision', 'Recall', 'F1'
        and the integer counts 'TP', 'FP', 'TN', 'FN'. A score whose
        denominator is zero is reported as 0.0.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Materialise once so generators work and the lengths can be compared;
    # zip() alone would silently drop the unmatched tail.
    y_true = list(y_true)
    y_pred = list(y_pred)
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )

    # Single pass over the pairs instead of one scan per confusion-matrix cell.
    tp = fp = tn = fn = 0
    for true, pred in zip(y_true, y_pred):
        if true == 1 and pred == 1:
            tp += 1
        elif true == 0 and pred == 1:
            fp += 1
        elif true == 0 and pred == 0:
            tn += 1
        elif true == 1 and pred == 0:
            fn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    # F1 is the harmonic mean of precision and recall.
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
    }

# Example usage: print the metrics as a one-column table, one row per metric.
# NOTE(review): relies on `y_true` / `y_pred` from the confusion-matrix example
# and on pandas being available as `pd`; neither is set up in this snippet.
metrics = calculate_metrics(y_true, y_pred)
print(pd.DataFrame.from_dict(metrics, orient='index', columns=['值']))

2.2 Scikit-learn实现

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def sklearn_metrics(y_true, y_pred):
    """Return accuracy, precision, recall and F1 as computed by scikit-learn.

    The result is a dict keyed by 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Multi-class variant: per-class scores are combined by macro / micro averaging.
def multiclass_metrics(y_true, y_pred, average='macro'):
    """Return precision, recall and F1 using the requested averaging mode.

    ``average`` is forwarded unchanged to each scikit-learn scorer.
    """
    precision = precision_score(y_true, y_pred, average=average)
    recall = recall_score(y_true, y_pred, average=average)
    f1 = f1_score(y_true, y_pred, average=average)
    return {'Precision': precision, 'Recall': recall, 'F1': f1}

三、实战应用场景

3.1 医疗诊断场景（高召回率需求）

# Medical diagnosis cares most about recall (fewer missed diagnoses).
def evaluate_medical_model(y_true, y_pred):
    """Chart recall, precision and F1 as bars and return the full metric dict.

    The returned dict is whatever ``sklearn_metrics`` produces for the inputs.
    """
    metrics = sklearn_metrics(y_true, y_pred)

    bar_labels = ['召回率', '精确率', 'F1']
    bar_heights = [metrics[key] for key in ('Recall', 'Precision', 'F1')]
    bar_colors = ['red', 'blue', 'green']

    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_heights, color=bar_colors)
    plt.ylim(0, 1)
    plt.title('医疗诊断模型评估（召回率优先）')
    plt.ylabel('分数')
    plt.show()

    return metrics

# Simulated data: 1 means diseased, 0 means healthy.
y_true_medical = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1]
y_pred_medical = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1]
evaluate_medical_model(y_true_medical, y_pred_medical)

3.2 金融风控场景（高精确率需求）

# Financial risk control cares most about precision (fewer false alarms).
def evaluate_fraud_model(y_true, y_pred):
    """Chart every metric as a bar with a 0.9 reference line; return the metrics.

    The returned dict is whatever ``sklearn_metrics`` produces for the inputs.
    """
    metrics = sklearn_metrics(y_true, y_pred)
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index')

    # One bar per metric, drawn on an explicitly created axes.
    _, axes = plt.subplots(figsize=(8, 6))
    metrics_df.plot(kind='bar', ax=axes, legend=False)
    axes.set_title('金融风控模型评估（精确率优先）')
    axes.set_ylabel('分数')
    axes.set_xticklabels(metrics_df.index, rotation=0)

    # Dashed red reference line at a score of 0.9.
    axes.axhline(y=0.9, color='r', linestyle='--')
    plt.show()

    return metrics

# Simulated data: 1 means fraudulent, 0 means legitimate.
y_true_fraud = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
y_pred_fraud = [1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
evaluate_fraud_model(y_true_fraud, y_pred_fraud)

3.3 多分类场景评估

from sklearn.metrics import classification_report

def evaluate_multiclass(y_true, y_pred, classes):
    """Print the classification report and chart the F1 score of each class.

    ``classes`` is passed to scikit-learn as ``target_names`` and also selects
    which report rows are charted.
    """
    # Text report for the console.
    print("详细分类报告：")
    text_report = classification_report(y_true, y_pred, target_names=classes)
    print(text_report)

    # Dict report; keep only the rows whose key is one of the class names.
    report_rows = classification_report(y_true, y_pred, target_names=classes, output_dict=True)
    f1_by_class = {}
    for label, row in report_rows.items():
        if label in classes:
            f1_by_class[label] = row['f1-score']

    plt.figure(figsize=(10, 5))
    plt.bar(f1_by_class.keys(), f1_by_class.values())
    plt.title('各类别F1分数')
    plt.ylim(0, 1)
    plt.ylabel('F1分数')
    plt.show()

# Example usage (news classification): three class ids, reported under the
# three display names passed as the last argument.
y_true_news = [0, 1, 2, 0, 1, 2, 0, 1, 2]
y_pred_news = [0, 1, 1, 0, 2, 2, 0, 1, 2]
evaluate_multiclass(y_true_news, y_pred_news, ['体育', '科技', '财经'])

四、阈值调整与曲线分析

4.1 精确率-召回率曲线

from sklearn.metrics import precision_recall_curve

def plot_precision_recall_vs_threshold(y_true, y_scores):
    """Plot precision and recall against the decision threshold.

    The threshold with the highest F1 is marked on both curves and returned.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # The last precision/recall entry is dropped so both arrays can be plotted
    # against `thresholds`.
    precision_at = precisions[:-1]
    recall_at = recalls[:-1]

    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, precision_at, "b--", label="精确率")
    plt.plot(thresholds, recall_at, "g-", label="召回率")
    plt.xlabel("阈值")
    plt.title("精确率与召回率随阈值变化")
    plt.legend(loc="center left")
    plt.grid(True)
    plt.ylim([0, 1])

    # F1 at every threshold; the small epsilon keeps the denominator non-zero.
    f1_at = 2 * (precision_at * recall_at) / (precision_at + recall_at + 1e-10)
    best = np.argmax(f1_at)
    best_threshold = thresholds[best]

    # Mark the best-F1 threshold on both curves.
    plt.scatter(best_threshold, precision_at[best], c='red', s=100)
    plt.scatter(best_threshold, recall_at[best], c='red', s=100)
    plt.show()

    return best_threshold

# Example usage: eight scores with their true labels; the returned value is
# the threshold at which F1 is highest.
y_scores = np.array([0.1, 0.3, 0.35, 0.4, 0.6, 0.65, 0.8, 0.9])
y_true_labels = [0, 0, 1, 0, 1, 1, 1, 1]
best_threshold = plot_precision_recall_vs_threshold(y_true_labels, y_scores)
print(f"最佳阈值: {best_threshold:.2f}")

4.2 ROC曲线与AUC

from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_scores):
    """Plot the ROC curve with a diagonal reference line and return the AUC."""
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    area = auc(fpr, tpr)
    curve_label = 'ROC曲线 (AUC = %0.2f)' % area

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
    # Diagonal from (0, 0) to (1, 1) for visual comparison.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假阳性率 (FPR)')
    plt.ylabel('真阳性率 (TPR)')
    plt.title('接收者操作特征(ROC)曲线')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()

    return area

# Example usage: reuses `y_true_labels` and `y_scores` from the
# precision-recall example.
roc_auc = plot_roc_curve(y_true_labels, y_scores)

五、综合评估框架

5.1 自定义评估类

class ModelEvaluator:
    """Bundle the common evaluation steps for a set of predictions.

    Every public method returns ``self`` so calls can be chained.
    """

    def __init__(self, y_true, y_pred, y_scores=None):
        """Store the labels, predictions and optional scores.

        Args:
            y_true: Ground-truth labels; converted to a NumPy array.
            y_pred: Predicted labels; converted to a NumPy array.
            y_scores: Optional scores, stored as given. When not ``None``,
                ``full_report`` also draws the threshold and ROC curves.
        """
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.y_scores = y_scores
        # Filled by compute_basic_metrics().
        self.metrics = {}

    def compute_basic_metrics(self):
        """Compute accuracy, precision, recall and F1 into ``self.metrics``."""
        self.metrics['accuracy'] = accuracy_score(self.y_true, self.y_pred)
        self.metrics['precision'] = precision_score(self.y_true, self.y_pred)
        self.metrics['recall'] = recall_score(self.y_true, self.y_pred)
        self.metrics['f1'] = f1_score(self.y_true, self.y_pred)
        return self

    def plot_confusion_matrix(self, classes=None):
        """Draw the confusion matrix as a heatmap.

        Args:
            classes: Optional tick labels for both axes. When omitted,
                seaborn chooses the tick labels itself.
        """
        cm = confusion_matrix(self.y_true, self.y_pred)
        # "auto" is seaborn's own default for tick labels; forward it instead
        # of None when the caller gave no class names.
        tick_labels = "auto" if classes is None else classes
        plt.figure(figsize=(6, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        return self

    def plot_metrics_comparison(self):
        """Draw one bar per metric, computing the metrics first if needed."""
        if not self.metrics:
            self.compute_basic_metrics()

        metrics_df = pd.DataFrame.from_dict(self.metrics, orient='index')
        # DataFrame.plot opens a figure of its own unless it is handed an
        # axes, which would leave a separately created figure blank; create
        # the axes explicitly and draw on it.
        _, ax = plt.subplots(figsize=(8, 5))
        metrics_df.plot(kind='bar', ax=ax, legend=False, rot=0)
        ax.set_title('模型指标对比')
        ax.set_ylim(0, 1)
        ax.grid(True, axis='y')
        plt.show()
        return self

    def full_report(self, classes=None):
        """Run every evaluation step and print the classification report.

        Args:
            classes: Optional class display names, used for the confusion
                matrix ticks and as ``target_names`` of the printed report.
        """
        self.compute_basic_metrics()
        self.plot_confusion_matrix(classes)
        self.plot_metrics_comparison()

        # The curve plots need scores, not just hard predictions.
        if self.y_scores is not None:
            plot_precision_recall_vs_threshold(self.y_true, self.y_scores)
            plot_roc_curve(self.y_true, self.y_scores)

        print("\n分类报告:")
        print(classification_report(self.y_true, self.y_pred, target_names=classes))
        return self

# Example usage.
# The curve plots need exactly one score per sample, so use a score vector of
# the same length as y_true_medical (thresholding it at 0.5 reproduces
# y_pred_medical).
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.4, 0.1, 0.7, 0.85, 0.45, 0.3, 0.95])
evaluator = ModelEvaluator(y_true_medical, y_pred_medical, y_scores_medical)
evaluator.full_report(classes=['健康', '患病'])

六、业务场景决策

6.1 指标选择指南

业务场景	核心指标	原因	阈值调整策略
疾病诊断	召回率	减少漏诊	降低阈值
金融风控	精确率	减少误判	提高阈值
推荐系统	F1分数	平衡推荐质量与覆盖率	优化F1最大点
垃圾邮件	精确率+召回率	平衡误判与漏判	PR曲线拐点

6.2 业务决策框架

def _best_under_constraint(thresholds, constrained, objective, floor):
    """Return the threshold maximising ``objective`` where ``constrained > floor``.

    When no threshold satisfies the constraint, fall back to the threshold at
    which ``constrained`` itself is largest.
    """
    mask = constrained > floor
    if mask.any():
        return thresholds[mask][np.argmax(objective[mask])]
    return thresholds[np.argmax(constrained)]


def business_decision(y_true, y_scores, business_type='medical', cost_matrix=None):
    """Choose a decision threshold suited to the business scenario.

    Selection rule, in order of precedence:
      * ``cost_matrix`` given: the ROC threshold with the lowest total
        misclassification cost. This takes precedence over ``business_type``
        so that a supplied cost matrix is never silently ignored.
      * ``business_type == 'medical'``: among thresholds with recall > 0.9,
        the one with the highest precision (fallback: highest recall).
      * ``business_type == 'fraud'``: among thresholds with precision > 0.8,
        the one with the highest recall (fallback: highest precision).
      * anything else: the threshold with the highest F1.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: One score per sample; a list or a NumPy array.
        business_type: Scenario name, see above.
        cost_matrix: Optional ``[[TN_cost, FP_cost], [FN_cost, TP_cost]]``.

    Returns:
        The chosen threshold. The threshold and the metrics obtained by
        applying it are also printed.
    """
    # Accept plain lists as well as arrays for the element-wise operations.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    if cost_matrix is not None:
        fpr, tpr, thresholds = roc_curve(y_true, y_scores)
        # Rates are per class, so weight each class by its sample count to
        # get the total cost over the data set.
        # Assumes the positive class is labelled 1 (TODO confirm for callers).
        n_pos = int((y_true == 1).sum())
        n_neg = y_true.size - n_pos
        total_cost = (
            n_neg * (cost_matrix[0][0] * (1 - fpr) + cost_matrix[0][1] * fpr)
            + n_pos * (cost_matrix[1][0] * (1 - tpr) + cost_matrix[1][1] * tpr)
        )
        best_threshold = thresholds[np.argmin(total_cost)]
    else:
        precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
        # Drop the final curve point so both arrays line up with `thresholds`.
        precisions, recalls = precisions[:-1], recalls[:-1]

        if business_type == 'medical':
            # Medical: maximise precision subject to recall > 90%.
            best_threshold = _best_under_constraint(
                thresholds, constrained=recalls, objective=precisions, floor=0.9
            )
        elif business_type == 'fraud':
            # Fraud: maximise recall subject to precision > 80%.
            best_threshold = _best_under_constraint(
                thresholds, constrained=precisions, objective=recalls, floor=0.8
            )
        else:
            # Default: the threshold with the highest F1.
            f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
            best_threshold = thresholds[np.argmax(f1_scores)]

    # Apply the chosen threshold and report the resulting metrics.
    best_pred = (y_scores >= best_threshold).astype(int)
    metrics = sklearn_metrics(y_true, best_pred)

    print(f"推荐阈值: {best_threshold:.3f}")
    print(f"对应指标: {metrics}")
    return best_threshold

# Medical example: one score per label in y_true_medical.
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.4, 0.1, 0.7, 0.85, 0.45, 0.3, 0.95])
business_decision(y_true_medical, y_scores_medical, business_type='medical')

# Custom cost matrix example: one score per label in y_true_fraud.
y_scores_fraud = np.array([0.9, 0.1, 0.2, 0.7, 0.8, 0.3, 0.15, 0.4, 0.05, 0.25])
cost_matrix = [[0, 10],   # TN costs 0, FP costs 10 (legitimate case flagged as fraud)
               [100, 0]]  # FN costs 100 (fraud missed), TP costs 0
# Name a scenario other than the built-in presets so the cost-matrix path is
# selected explicitly.
business_decision(y_true_fraud, y_scores_fraud, business_type='custom',
                  cost_matrix=cost_matrix)

七、高级主题延伸

7.1 样本不平衡处理

from imblearn.metrics import classification_report_imbalanced

def _first_present(mapping, *keys):
    """Return ``mapping[key]`` for the first of ``keys`` found in ``mapping``."""
    for key in keys:
        if key in mapping:
            return mapping[key]
    raise KeyError(f"none of {keys!r} found in report")


def evaluate_imbalanced(y_true, y_pred):
    """Print standard and imbalance-aware reports, then chart class 1's metrics.

    Class 1 is treated as the minority class: its recall, precision, F1 and
    geometric mean are drawn as a bar chart.

    Raises:
        KeyError: If the report has no entry for class 1 or lacks one of the
            charted metrics.
    """
    print("标准分类报告:")
    print(classification_report(y_true, y_pred))

    print("\n不平衡数据集分类报告:")
    print(classification_report_imbalanced(y_true, y_pred))

    # Chart the metrics that matter for the minority class.
    report = classification_report_imbalanced(y_true, y_pred, output_dict=True)
    # NOTE(review): imbalanced-learn documents abbreviated metric keys
    # ('pre', 'rec', 'geo') for the dict report, and the class key may be the
    # raw label or its string form. Try those first and keep the original
    # spellings as a fallback; confirm against the installed version.
    minority = _first_present(report, 1, '1')
    metrics = {
        'recall': _first_present(minority, 'rec', 'recall'),
        'precision': _first_present(minority, 'pre', 'precision'),
        'f1': _first_present(minority, 'f1'),
        'geo_mean': _first_present(minority, 'geo', 'geo_mean'),
    }

    plt.figure(figsize=(8, 5))
    plt.bar(metrics.keys(), metrics.values())
    plt.title('少数类(1)关键指标')
    plt.ylim(0, 1)
    plt.ylabel('分数')
    plt.show()

# Example usage (class 1 is assumed to be the minority): 90 negatives followed
# by 10 positives. The predictions give 85 true negatives, 5 false positives,
# 5 false negatives and 5 true positives.
y_true_imb = [0]*90 + [1]*10
y_pred_imb = [0]*85 + [1]*5 + [0]*5 + [1]*5
evaluate_imbalanced(y_true_imb, y_pred_imb)

7.2 多模型对比评估

def compare_models(models, X_test, y_test):
    """Score several fitted models on the same test set and chart the result.

    Args:
        models: Mapping of display name to a fitted model exposing
            ``predict``.
        X_test: Test features passed to each model's ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        DataFrame indexed by model name with the columns 'Accuracy',
        'Precision', 'Recall' and 'F1'.
    """
    results = []

    for name, model in models.items():
        y_pred = model.predict(X_test)
        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
        })

    df = pd.DataFrame(results).set_index('Model')

    # DataFrame.plot opens a figure of its own unless it is handed an axes,
    # which would leave a separately created figure blank; create the axes
    # explicitly and draw the grouped bars on it.
    _, ax = plt.subplots(figsize=(10, 6))
    df.plot(kind='bar', rot=0, ax=ax)
    ax.set_title('模型性能对比')
    ax.set_ylabel('分数')
    ax.set_ylim(0, 1)
    # Place the legend outside the plotting area, to the right.
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()

    return df

# Example usage.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic binary-classification data so the example runs on its own.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

models = {
    'Logistic Regression': LogisticRegression().fit(X_train, y_train),
    'Random Forest': RandomForestClassifier().fit(X_train, y_train),
    'SVM': SVC().fit(X_train, y_train)
}

compare_models(models, X_test, y_test)

通过本指南，您已掌握了如何在实际项目中应用准确率、召回率和F1分数来全面评估AI模型性能。记住，没有放之四海而皆准的"最佳指标"，关键是根据业务需求选择合适的评估策略，并通过可视化工具清晰呈现评估结果。