前些天发现了一个巨牛的人工智能学习网站,通俗易懂,风趣幽默,忍不住分享一下给大家。点击跳转到网站
https://www.captainbed.cn/north
文章目录
一、基础概念解析
1.1 混淆矩阵(Confusion Matrix)
混淆矩阵是分类模型评估的基础,它展示了模型预测结果与实际标签的对应关系:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(y_true, y_pred, classes):
    """Render the confusion matrix of a classifier as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned one-to-one with ``y_true``.
        classes: Display names used as tick labels on both axes.
    """
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('混淆矩阵')
    # Rows are labelled as the true classes, columns as the predicted ones.
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    plt.show()


# Example usage
y_true = [0, 1, 0, 1, 1, 0, 0, 1]
y_pred = [0, 1, 1, 1, 0, 0, 0, 1]
plot_confusion_matrix(y_true, y_pred, classes=['负类', '正类'])
1.2 关键术语定义
| 术语 | 公式 | 解释 |
|---|---|---|
| 真正例(TP) | - | 实际为正例,且被模型预测为正例 |
| 假正例(FP) | - | 实际为负例,但被模型误判为正例 |
| 真负例(TN) | - | 实际为负例,且被模型预测为负例 |
| 假负例(FN) | - | 实际为正例,但被模型误判为负例 |
| 准确率 | (TP+TN)/(TP+FP+TN+FN) | 所有样本中预测正确的比例 |
| 召回率 | TP/(TP+FN) | 实际正例中被正确预测的比例 |
| 精确率 | TP/(TP+FP) | 预测为正例的样本中实际为正例的比例 |
| F1分数 | 2×(精确率×召回率)/(精确率+召回率) | 精确率和召回率的调和平均 |
二、核心指标实现
2.1 手动计算实现
def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics by hand.

    Labels are compared against 0 (negative) and 1 (positive); pairs that
    contain any other value are not counted. If the two inputs differ in
    length, the surplus elements of the longer one are ignored (``zip``
    semantics).

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.

    Returns:
        dict with the ratios 'Accuracy', 'Precision', 'Recall' and 'F1'
        (0.0 whenever the denominator is zero) and the raw counts
        'TP', 'FP', 'TN' and 'FN'.
    """
    tp = fp = tn = fn = 0
    # Count everything in a single pass: repeated zip() passes would find a
    # one-shot iterable (generator, iterator) already exhausted and silently
    # report zero for every count after the first.
    for true, pred in zip(y_true, y_pred):
        if pred == 1:
            if true == 1:
                tp += 1
            elif true == 0:
                fp += 1
        elif pred == 0:
            if true == 0:
                tn += 1
            elif true == 1:
                fn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
    }
# Example usage: compute the metrics by hand and print them as a one-column
# table with one row per metric.
metrics = calculate_metrics(y_true, y_pred)
print(pd.DataFrame.from_dict(metrics, orient='index', columns=['值']))
2.2 Scikit-learn实现
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def sklearn_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 with scikit-learn.

    Every scorer is called with its default arguments.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        dict keyed by 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}
# Multi-class scenario (macro / micro averaging)
def multiclass_metrics(y_true, y_pred, average='macro'):
    """Compute averaged precision, recall and F1 for multi-class labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded unchanged to each scikit-learn
            scorer (e.g. 'macro' or 'micro').

    Returns:
        dict keyed by 'Precision', 'Recall' and 'F1'.
    """
    scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    results = {}
    for label, scorer in scorers:
        results[label] = scorer(y_true, y_pred, average=average)
    return results
三、实战应用场景
3.1 医疗诊断场景(高召回率需求)
# Medical diagnosis cares most about recall (fewer missed cases)
def evaluate_medical_model(y_true, y_pred):
    """Score a diagnostic model and chart recall, precision and F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The metrics dict produced by ``sklearn_metrics``.
    """
    metrics = sklearn_metrics(y_true, y_pred)

    # Recall is listed first because it is the priority metric here.
    bar_labels = ['召回率', '精确率', 'F1']
    bar_heights = [metrics[key] for key in ('Recall', 'Precision', 'F1')]
    bar_colors = ['red', 'blue', 'green']

    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_heights, color=bar_colors)
    plt.ylim(0, 1)
    plt.title('医疗诊断模型评估(召回率优先)')
    plt.ylabel('分数')
    plt.show()
    return metrics


# Simulated data: 1 = diseased, 0 = healthy
y_true_medical = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1]
y_pred_medical = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1]
evaluate_medical_model(y_true_medical, y_pred_medical)
3.2 金融风控场景(高精确率需求)
# Fraud control cares most about precision (fewer false alarms)
def evaluate_fraud_model(y_true, y_pred):
    """Score a fraud model and chart every metric against a 0.9 guide line.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The metrics dict produced by ``sklearn_metrics``.
    """
    metrics = sklearn_metrics(y_true, y_pred)

    # One bar per metric, drawn on an explicitly created axes.
    _, ax = plt.subplots(figsize=(8, 6))
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
    metrics_df.plot(kind='bar', ax=ax, legend=False)

    ax.set_title('金融风控模型评估(精确率优先)')
    ax.set_ylabel('分数')
    ax.set_xticklabels(metrics_df.index, rotation=0)

    # Dashed red reference line at a score of 0.9.
    ax.axhline(y=0.9, color='r', linestyle='--')
    plt.show()
    return metrics


# Simulated data: 1 = fraud, 0 = legitimate
y_true_fraud = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
y_pred_fraud = [1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
evaluate_fraud_model(y_true_fraud, y_pred_fraud)
3.3 多分类场景评估
from sklearn.metrics import classification_report
def evaluate_multiclass(y_true, y_pred, classes):
    """Print a per-class report and plot the F1 score of every class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Class display names, passed to scikit-learn as
            ``target_names``.
    """
    # Text report
    print("详细分类报告:")
    print(classification_report(y_true, y_pred, target_names=classes))

    # Same report as a dict, to pull out the per-class F1 scores
    report = classification_report(
        y_true, y_pred, target_names=classes, output_dict=True
    )
    f1_scores = {}
    for entry_name, entry in report.items():
        # Keep only the entries named in `classes`.
        if entry_name in classes:
            f1_scores[entry_name] = entry['f1-score']

    plt.figure(figsize=(10, 5))
    plt.bar(f1_scores.keys(), f1_scores.values())
    plt.title('各类别F1分数')
    plt.ylim(0, 1)
    plt.ylabel('F1分数')
    plt.show()


# Example usage (news classification)
y_true_news = [0, 1, 2, 0, 1, 2, 0, 1, 2]
y_pred_news = [0, 1, 1, 0, 2, 2, 0, 1, 2]
evaluate_multiclass(y_true_news, y_pred_news, ['体育', '科技', '财经'])
四、阈值调整与曲线分析
4.1 精确率-召回率曲线
from sklearn.metrics import precision_recall_curve
def plot_precision_recall_vs_threshold(y_true, y_scores):
    """Plot precision and recall against the decision threshold.

    The threshold with the highest F1 score is highlighted on both curves.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Scores for the positive class, aligned with ``y_true``.

    Returns:
        The threshold at which F1 is highest.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # The last precision/recall point is dropped so both curves line up
    # element-for-element with `thresholds`.
    curve_precisions = precisions[:-1]
    curve_recalls = recalls[:-1]

    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, curve_precisions, "b--", label="精确率")
    plt.plot(thresholds, curve_recalls, "g-", label="召回率")
    plt.xlabel("阈值")
    plt.title("精确率与召回率随阈值变化")
    plt.legend(loc="center left")
    plt.grid(True)
    plt.ylim([0, 1])

    # Highlight the max-F1 point; the small epsilon avoids dividing 0 by 0.
    curve_f1 = (2 * (curve_precisions * curve_recalls)
                / (curve_precisions + curve_recalls + 1e-10))
    best_idx = np.argmax(curve_f1)
    best_threshold = thresholds[best_idx]
    for curve in (precisions, recalls):
        plt.scatter(best_threshold, curve[best_idx], c='red', s=100)
    plt.show()
    return best_threshold


# Example usage
y_scores = np.array([0.1, 0.3, 0.35, 0.4, 0.6, 0.65, 0.8, 0.9])
y_true_labels = [0, 0, 1, 0, 1, 1, 1, 1]
best_threshold = plot_precision_recall_vs_threshold(y_true_labels, y_scores)
print(f"最佳阈值: {best_threshold:.2f}")
4.2 ROC曲线与AUC
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_true, y_scores):
    """Plot the ROC curve and report the area under it.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Scores for the positive class, aligned with ``y_true``.

    Returns:
        The area under the ROC curve.
    """
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    curve_label = 'ROC曲线 (AUC = %0.2f)' % roc_auc

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假阳性率 (FPR)')
    plt.ylabel('真阳性率 (TPR)')
    plt.title('接收者操作特征(ROC)曲线')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
    return roc_auc


# Example usage
roc_auc = plot_roc_curve(y_true_labels, y_scores)
五、综合评估框架
5.1 自定义评估类
class ModelEvaluator:
    """Bundle the common evaluation steps for one set of predictions.

    Every public method returns ``self`` so calls can be chained.
    """

    def __init__(self, y_true, y_pred, y_scores=None):
        """Store the labels, predictions and optional scores.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted hard labels, same length as ``y_true``.
            y_scores: Optional positive-class scores, same length as
                ``y_true``; when given, ``full_report`` also draws the
                threshold and ROC curves.
        """
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        # Convert so plain lists are accepted just like the labels are.
        self.y_scores = None if y_scores is None else np.asarray(y_scores)
        self.metrics = {}

    def compute_basic_metrics(self):
        """Compute accuracy, precision, recall and F1 into ``self.metrics``."""
        self.metrics['accuracy'] = accuracy_score(self.y_true, self.y_pred)
        self.metrics['precision'] = precision_score(self.y_true, self.y_pred)
        self.metrics['recall'] = recall_score(self.y_true, self.y_pred)
        self.metrics['f1'] = f1_score(self.y_true, self.y_pred)
        return self

    def plot_confusion_matrix(self, classes=None):
        """Draw the confusion matrix as a heatmap.

        Args:
            classes: Optional display names for the tick labels; when
                omitted seaborn chooses the labels itself.
        """
        cm = confusion_matrix(self.y_true, self.y_pred)
        # seaborn expects "auto" (not None) to mean "pick labels for me".
        tick_labels = "auto" if classes is None else classes
        plt.figure(figsize=(6, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        return self

    def plot_metrics_comparison(self):
        """Draw a bar chart of the basic metrics, computing them if needed."""
        if not self.metrics:
            self.compute_basic_metrics()
        # Create the axes explicitly and hand it to pandas: DataFrame.plot
        # opens a figure of its own when no `ax` is given, which would leave
        # a blank figure created by plt.figure() behind.
        _, ax = plt.subplots(figsize=(8, 5))
        metrics_df = pd.DataFrame.from_dict(self.metrics, orient='index')
        metrics_df.plot(kind='bar', legend=False, ax=ax, rot=0)
        ax.set_title('模型指标对比')
        ax.set_ylim(0, 1)
        ax.grid(True, axis='y')
        plt.show()
        return self

    def full_report(self, classes=None):
        """Run every evaluation step and print the classification report.

        Args:
            classes: Optional class display names, used for the heatmap
                ticks and as ``target_names`` of the text report.
        """
        self.compute_basic_metrics()
        self.plot_confusion_matrix(classes)
        self.plot_metrics_comparison()
        # The curve plots need continuous scores, so they are optional.
        if self.y_scores is not None:
            plot_precision_recall_vs_threshold(self.y_true, self.y_scores)
            plot_roc_curve(self.y_true, self.y_scores)
        print("\n分类报告:")
        print(classification_report(self.y_true, self.y_pred, target_names=classes))
        return self


# Example usage. Scores must line up one-to-one with the labels, so the ten
# medical samples get their own ten-element score vector (thresholding it at
# 0.5 reproduces y_pred_medical).
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.45, 0.1, 0.7, 0.85, 0.4, 0.3, 0.95])
evaluator = ModelEvaluator(y_true_medical, y_pred_medical, y_scores_medical)
evaluator.full_report(classes=['健康', '患病'])
六、业务场景决策
6.1 指标选择指南
| 业务场景 | 核心指标 | 原因 | 阈值调整策略 |
|---|---|---|---|
| 疾病诊断 | 召回率 | 减少漏诊 | 降低阈值 |
| 金融风控 | 精确率 | 减少误判 | 提高阈值 |
| 推荐系统 | F1分数 | 平衡推荐质量与覆盖率 | 优化F1最大点 |
| 垃圾邮件 | 精确率+召回率 | 平衡误判与漏判 | PR曲线拐点 |
6.2 业务决策框架
def business_decision(y_true, y_scores, business_type='medical', cost_matrix=None):
    """Pick a decision threshold suited to the business scenario.

    Selection rule, in order of precedence:

    * ``cost_matrix`` given: minimise the expected cost per sample. This
      takes precedence over ``business_type``, whose default value would
      otherwise make the cost-based branch unreachable.
    * ``business_type == 'medical'``: highest precision among thresholds
      whose recall exceeds 0.9 (fallback: the threshold of maximum recall).
    * ``business_type == 'fraud'``: highest recall among thresholds whose
      precision exceeds 0.8 (fallback: the threshold of maximum precision).
    * anything else: the threshold with the highest F1 score.

    The chosen threshold and the metrics it yields are printed.

    Args:
        y_true: Ground-truth binary labels (1 = positive).
        y_scores: Positive-class scores, same length as ``y_true``.
        business_type: Scenario name, see above.
        cost_matrix: Optional ``[[TN_cost, FP_cost], [FN_cost, TP_cost]]``.

    Returns:
        The selected threshold.
    """
    # Arrays are required for the vectorised comparisons below; this also
    # lets callers pass plain lists.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    if cost_matrix is not None:
        (tn_cost, fp_cost), (fn_cost, tp_cost) = cost_matrix
        fpr, tpr, thresholds = roc_curve(y_true, y_scores)
        # Weight each class's conditional cost by how often that class
        # occurs; without the weights both classes would be treated as
        # equally frequent regardless of the data.
        pos_rate = np.mean(y_true == 1)
        neg_rate = 1 - pos_rate
        expected_cost = (
            neg_rate * (tn_cost * (1 - fpr) + fp_cost * fpr)
            + pos_rate * (fn_cost * (1 - tpr) + tp_cost * tpr)
        )
        # NOTE: the first ROC threshold lies above every score, so picking
        # it means "never predict positive".
        best_threshold = thresholds[np.argmin(expected_cost)]
    else:
        precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
        # Drop the final curve point so both arrays align with `thresholds`.
        precisions = precisions[:-1]
        recalls = recalls[:-1]

        if business_type == 'medical':
            # Medical: maximise precision subject to recall > 90%
            mask = recalls > 0.9
            if mask.any():
                best_threshold = thresholds[mask][np.argmax(precisions[mask])]
            else:
                best_threshold = thresholds[np.argmax(recalls)]
        elif business_type == 'fraud':
            # Fraud: maximise recall subject to precision > 80%
            mask = precisions > 0.8
            if mask.any():
                best_threshold = thresholds[mask][np.argmax(recalls[mask])]
            else:
                best_threshold = thresholds[np.argmax(precisions)]
        else:
            # Default: maximise F1; the epsilon avoids dividing 0 by 0.
            f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
            best_threshold = thresholds[np.argmax(f1_scores)]

    # Apply the chosen threshold and report the resulting metrics.
    best_pred = (y_scores >= best_threshold).astype(int)
    metrics = sklearn_metrics(y_true, best_pred)
    print(f"推荐阈值: {best_threshold:.3f}")
    print(f"对应指标: {metrics}")
    return best_threshold


# Medical example. Scores must line up one-to-one with the ten labels.
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.45, 0.1, 0.7, 0.85, 0.4, 0.3, 0.95])
business_decision(y_true_medical, y_scores_medical, business_type='medical')

# Custom cost-matrix example
cost_matrix = [[0, 10],    # TN costs 0, FP costs 10 (legitimate flagged as fraud)
               [100, 0]]   # FN costs 100 (missed fraud), TP costs 0
y_scores_fraud = np.array([0.9, 0.1, 0.2, 0.6, 0.8, 0.3, 0.15, 0.4, 0.05, 0.25])
business_decision(y_true_fraud, y_scores_fraud, cost_matrix=cost_matrix)
七、高级主题延伸
7.1 样本不平衡处理
from imblearn.metrics import classification_report_imbalanced
def evaluate_imbalanced(y_true, y_pred):
    """Print standard and imbalance-aware reports and chart class 1.

    Class 1 is treated as the minority class: its recall, precision, F1 and
    geometric mean are shown as a bar chart.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    print("标准分类报告:")
    print(classification_report(y_true, y_pred))
    print("\n不平衡数据集分类报告:")
    print(classification_report_imbalanced(y_true, y_pred))

    # Pull the minority-class scores out of the dict form of the report.
    report = classification_report_imbalanced(y_true, y_pred, output_dict=True)
    # NOTE(review): per-class entries are expected under the class name
    # (the label rendered as text when no target_names are given); fall back
    # to the raw label in case the installed version keys by label - confirm.
    minority = report['1'] if '1' in report else report[1]
    # imbalanced-learn abbreviates its score names ('pre', 'rec', 'f1',
    # 'geo'); the long names are only used as bar labels here.
    metrics = {
        'recall': minority['rec'],
        'precision': minority['pre'],
        'f1': minority['f1'],
        'geo_mean': minority['geo'],
    }

    plt.figure(figsize=(8, 5))
    plt.bar(list(metrics.keys()), list(metrics.values()))
    plt.title('少数类(1)关键指标')
    plt.ylim(0, 1)
    plt.ylabel('分数')
    plt.show()


# Example usage (class 1 is the minority class)
y_true_imb = [0]*90 + [1]*10
y_pred_imb = [0]*85 + [1]*5 + [0]*5 + [1]*5
evaluate_imbalanced(y_true_imb, y_pred_imb)
7.2 多模型对比评估
def compare_models(models, X_test, y_test):
    """Score several fitted binary classifiers and chart them side by side.

    Args:
        models: Mapping of display name to fitted model exposing
            ``predict``.
        X_test: Test features passed to each model's ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        DataFrame indexed by model name with the columns 'Accuracy',
        'Precision', 'Recall' and 'F1'.
    """
    results = []
    for name, model in models.items():
        y_pred = model.predict(X_test)
        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
        })
    df = pd.DataFrame(results).set_index('Model')

    # Create the axes explicitly and hand it to pandas: DataFrame.plot opens
    # a figure of its own when no `ax` is given, which would leave a blank
    # figure created by plt.figure() behind.
    _, ax = plt.subplots(figsize=(10, 6))
    df.plot(kind='bar', rot=0, ax=ax)
    ax.set_title('模型性能对比')
    ax.set_ylabel('分数')
    ax.set_ylim(0, 1)
    # Anchor the legend outside the axes so it does not cover the bars.
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()
    return df


# Example usage
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic binary dataset so the example runs on its own.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

models = {
    'Logistic Regression': LogisticRegression().fit(X_train, y_train),
    'Random Forest': RandomForestClassifier().fit(X_train, y_train),
    'SVM': SVC().fit(X_train, y_train),
}
compare_models(models, X_test, y_test)
通过本指南,您已掌握了如何在实际项目中应用准确率、召回率和F1分数来全面评估AI模型性能。记住,没有放之四海而皆准的"最佳指标",关键是根据业务需求选择合适的评估策略,并通过可视化工具清晰呈现评估结果。
一、基础概念解析
1.1 混淆矩阵(Confusion Matrix)
混淆矩阵是分类模型评估的基础,它展示了模型预测结果与实际标签的对应关系:
from sklearn.metrics import confusion_matrix
import seaborn as sns
def plot_confusion_matrix(y_true, y_pred, classes):
    """Render the confusion matrix of a classifier as an annotated heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned one-to-one with ``y_true``.
        classes: Display names used as tick labels on both axes.
    """
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
    )

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.title('混淆矩阵')
    # Rows are labelled as the true classes, columns as the predicted ones.
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    plt.show()


# Example usage
y_true = [0, 1, 0, 1, 1, 0, 0, 1]
y_pred = [0, 1, 1, 1, 0, 0, 0, 1]
plot_confusion_matrix(y_true, y_pred, classes=['负类', '正类'])
1.2 关键术语定义
| 术语 | 公式 | 解释 |
|---|---|---|
| 真正例(TP) | - | 实际为正例,且被模型预测为正例 |
| 假正例(FP) | - | 实际为负例,但被模型误判为正例 |
| 真负例(TN) | - | 实际为负例,且被模型预测为负例 |
| 假负例(FN) | - | 实际为正例,但被模型误判为负例 |
| 准确率 | (TP+TN)/(TP+FP+TN+FN) | 所有样本中预测正确的比例 |
| 召回率 | TP/(TP+FN) | 实际正例中被正确预测的比例 |
| 精确率 | TP/(TP+FP) | 预测为正例的样本中实际为正例的比例 |
| F1分数 | 2×(精确率×召回率)/(精确率+召回率) | 精确率和召回率的调和平均 |
二、核心指标实现
2.1 手动计算实现
def calculate_metrics(y_true, y_pred):
    """Compute binary classification metrics by hand.

    Labels are compared against 0 (negative) and 1 (positive); pairs that
    contain any other value are not counted. If the two inputs differ in
    length, the surplus elements of the longer one are ignored (``zip``
    semantics).

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.

    Returns:
        dict with the ratios 'Accuracy', 'Precision', 'Recall' and 'F1'
        (0.0 whenever the denominator is zero) and the raw counts
        'TP', 'FP', 'TN' and 'FN'.
    """
    tp = fp = tn = fn = 0
    # Count everything in a single pass: repeated zip() passes would find a
    # one-shot iterable (generator, iterator) already exhausted and silently
    # report zero for every count after the first.
    for true, pred in zip(y_true, y_pred):
        if pred == 1:
            if true == 1:
                tp += 1
            elif true == 0:
                fp += 1
        elif pred == 0:
            if true == 0:
                tn += 1
            elif true == 1:
                fn += 1

    total = tp + fp + tn + fn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0

    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn,
    }
# Example usage: compute the metrics by hand and print them as a one-column
# table with one row per metric.
metrics = calculate_metrics(y_true, y_pred)
print(pd.DataFrame.from_dict(metrics, orient='index', columns=['值']))
2.2 Scikit-learn实现
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def sklearn_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 with scikit-learn.

    Every scorer is called with its default arguments.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        dict keyed by 'Accuracy', 'Precision', 'Recall' and 'F1'.
    """
    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}
# Multi-class scenario (macro / micro averaging)
def multiclass_metrics(y_true, y_pred, average='macro'):
    """Compute averaged precision, recall and F1 for multi-class labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        average: Averaging mode forwarded unchanged to each scikit-learn
            scorer (e.g. 'macro' or 'micro').

    Returns:
        dict keyed by 'Precision', 'Recall' and 'F1'.
    """
    scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )
    results = {}
    for label, scorer in scorers:
        results[label] = scorer(y_true, y_pred, average=average)
    return results
三、实战应用场景
3.1 医疗诊断场景(高召回率需求)
# Medical diagnosis cares most about recall (fewer missed cases)
def evaluate_medical_model(y_true, y_pred):
    """Score a diagnostic model and chart recall, precision and F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The metrics dict produced by ``sklearn_metrics``.
    """
    metrics = sklearn_metrics(y_true, y_pred)

    # Recall is listed first because it is the priority metric here.
    bar_labels = ['召回率', '精确率', 'F1']
    bar_heights = [metrics[key] for key in ('Recall', 'Precision', 'F1')]
    bar_colors = ['red', 'blue', 'green']

    plt.figure(figsize=(10, 5))
    plt.bar(bar_labels, bar_heights, color=bar_colors)
    plt.ylim(0, 1)
    plt.title('医疗诊断模型评估(召回率优先)')
    plt.ylabel('分数')
    plt.show()
    return metrics


# Simulated data: 1 = diseased, 0 = healthy
y_true_medical = [1, 0, 1, 1, 0, 1, 1, 1, 0, 1]
y_pred_medical = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1]
evaluate_medical_model(y_true_medical, y_pred_medical)
3.2 金融风控场景(高精确率需求)
# Fraud control cares most about precision (fewer false alarms)
def evaluate_fraud_model(y_true, y_pred):
    """Score a fraud model and chart every metric against a 0.9 guide line.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The metrics dict produced by ``sklearn_metrics``.
    """
    metrics = sklearn_metrics(y_true, y_pred)

    # One bar per metric, drawn on an explicitly created axes.
    _, ax = plt.subplots(figsize=(8, 6))
    metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
    metrics_df.plot(kind='bar', ax=ax, legend=False)

    ax.set_title('金融风控模型评估(精确率优先)')
    ax.set_ylabel('分数')
    ax.set_xticklabels(metrics_df.index, rotation=0)

    # Dashed red reference line at a score of 0.9.
    ax.axhline(y=0.9, color='r', linestyle='--')
    plt.show()
    return metrics


# Simulated data: 1 = fraud, 0 = legitimate
y_true_fraud = [1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
y_pred_fraud = [1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
evaluate_fraud_model(y_true_fraud, y_pred_fraud)
3.3 多分类场景评估
from sklearn.metrics import classification_report
def evaluate_multiclass(y_true, y_pred, classes):
    """Print a per-class report and plot the F1 score of every class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        classes: Class display names, passed to scikit-learn as
            ``target_names``.
    """
    # Text report
    print("详细分类报告:")
    print(classification_report(y_true, y_pred, target_names=classes))

    # Same report as a dict, to pull out the per-class F1 scores
    report = classification_report(
        y_true, y_pred, target_names=classes, output_dict=True
    )
    f1_scores = {}
    for entry_name, entry in report.items():
        # Keep only the entries named in `classes`.
        if entry_name in classes:
            f1_scores[entry_name] = entry['f1-score']

    plt.figure(figsize=(10, 5))
    plt.bar(f1_scores.keys(), f1_scores.values())
    plt.title('各类别F1分数')
    plt.ylim(0, 1)
    plt.ylabel('F1分数')
    plt.show()


# Example usage (news classification)
y_true_news = [0, 1, 2, 0, 1, 2, 0, 1, 2]
y_pred_news = [0, 1, 1, 0, 2, 2, 0, 1, 2]
evaluate_multiclass(y_true_news, y_pred_news, ['体育', '科技', '财经'])
四、阈值调整与曲线分析
4.1 精确率-召回率曲线
from sklearn.metrics import precision_recall_curve
def plot_precision_recall_vs_threshold(y_true, y_scores):
    """Plot precision and recall against the decision threshold.

    The threshold with the highest F1 score is highlighted on both curves.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Scores for the positive class, aligned with ``y_true``.

    Returns:
        The threshold at which F1 is highest.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
    # The last precision/recall point is dropped so both curves line up
    # element-for-element with `thresholds`.
    curve_precisions = precisions[:-1]
    curve_recalls = recalls[:-1]

    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, curve_precisions, "b--", label="精确率")
    plt.plot(thresholds, curve_recalls, "g-", label="召回率")
    plt.xlabel("阈值")
    plt.title("精确率与召回率随阈值变化")
    plt.legend(loc="center left")
    plt.grid(True)
    plt.ylim([0, 1])

    # Highlight the max-F1 point; the small epsilon avoids dividing 0 by 0.
    curve_f1 = (2 * (curve_precisions * curve_recalls)
                / (curve_precisions + curve_recalls + 1e-10))
    best_idx = np.argmax(curve_f1)
    best_threshold = thresholds[best_idx]
    for curve in (precisions, recalls):
        plt.scatter(best_threshold, curve[best_idx], c='red', s=100)
    plt.show()
    return best_threshold


# Example usage
y_scores = np.array([0.1, 0.3, 0.35, 0.4, 0.6, 0.65, 0.8, 0.9])
y_true_labels = [0, 0, 1, 0, 1, 1, 1, 1]
best_threshold = plot_precision_recall_vs_threshold(y_true_labels, y_scores)
print(f"最佳阈值: {best_threshold:.2f}")
4.2 ROC曲线与AUC
from sklearn.metrics import roc_curve, auc
def plot_roc_curve(y_true, y_scores):
    """Plot the ROC curve and report the area under it.

    Args:
        y_true: Ground-truth binary labels.
        y_scores: Scores for the positive class, aligned with ``y_true``.

    Returns:
        The area under the ROC curve.
    """
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    curve_label = 'ROC曲线 (AUC = %0.2f)' % roc_auc

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('假阳性率 (FPR)')
    plt.ylabel('真阳性率 (TPR)')
    plt.title('接收者操作特征(ROC)曲线')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.show()
    return roc_auc


# Example usage
roc_auc = plot_roc_curve(y_true_labels, y_scores)
五、综合评估框架
5.1 自定义评估类
class ModelEvaluator:
    """Bundle the common evaluation steps for one set of predictions.

    Every public method returns ``self`` so calls can be chained.
    """

    def __init__(self, y_true, y_pred, y_scores=None):
        """Store the labels, predictions and optional scores.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted hard labels, same length as ``y_true``.
            y_scores: Optional positive-class scores, same length as
                ``y_true``; when given, ``full_report`` also draws the
                threshold and ROC curves.
        """
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        # Convert so plain lists are accepted just like the labels are.
        self.y_scores = None if y_scores is None else np.asarray(y_scores)
        self.metrics = {}

    def compute_basic_metrics(self):
        """Compute accuracy, precision, recall and F1 into ``self.metrics``."""
        self.metrics['accuracy'] = accuracy_score(self.y_true, self.y_pred)
        self.metrics['precision'] = precision_score(self.y_true, self.y_pred)
        self.metrics['recall'] = recall_score(self.y_true, self.y_pred)
        self.metrics['f1'] = f1_score(self.y_true, self.y_pred)
        return self

    def plot_confusion_matrix(self, classes=None):
        """Draw the confusion matrix as a heatmap.

        Args:
            classes: Optional display names for the tick labels; when
                omitted seaborn chooses the labels itself.
        """
        cm = confusion_matrix(self.y_true, self.y_pred)
        # seaborn expects "auto" (not None) to mean "pick labels for me".
        tick_labels = "auto" if classes is None else classes
        plt.figure(figsize=(6, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=tick_labels, yticklabels=tick_labels)
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        return self

    def plot_metrics_comparison(self):
        """Draw a bar chart of the basic metrics, computing them if needed."""
        if not self.metrics:
            self.compute_basic_metrics()
        # Create the axes explicitly and hand it to pandas: DataFrame.plot
        # opens a figure of its own when no `ax` is given, which would leave
        # a blank figure created by plt.figure() behind.
        _, ax = plt.subplots(figsize=(8, 5))
        metrics_df = pd.DataFrame.from_dict(self.metrics, orient='index')
        metrics_df.plot(kind='bar', legend=False, ax=ax, rot=0)
        ax.set_title('模型指标对比')
        ax.set_ylim(0, 1)
        ax.grid(True, axis='y')
        plt.show()
        return self

    def full_report(self, classes=None):
        """Run every evaluation step and print the classification report.

        Args:
            classes: Optional class display names, used for the heatmap
                ticks and as ``target_names`` of the text report.
        """
        self.compute_basic_metrics()
        self.plot_confusion_matrix(classes)
        self.plot_metrics_comparison()
        # The curve plots need continuous scores, so they are optional.
        if self.y_scores is not None:
            plot_precision_recall_vs_threshold(self.y_true, self.y_scores)
            plot_roc_curve(self.y_true, self.y_scores)
        print("\n分类报告:")
        print(classification_report(self.y_true, self.y_pred, target_names=classes))
        return self


# Example usage. Scores must line up one-to-one with the labels, so the ten
# medical samples get their own ten-element score vector (thresholding it at
# 0.5 reproduces y_pred_medical).
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.45, 0.1, 0.7, 0.85, 0.4, 0.3, 0.95])
evaluator = ModelEvaluator(y_true_medical, y_pred_medical, y_scores_medical)
evaluator.full_report(classes=['健康', '患病'])
六、业务场景决策
6.1 指标选择指南
| 业务场景 | 核心指标 | 原因 | 阈值调整策略 |
|---|---|---|---|
| 疾病诊断 | 召回率 | 减少漏诊 | 降低阈值 |
| 金融风控 | 精确率 | 减少误判 | 提高阈值 |
| 推荐系统 | F1分数 | 平衡推荐质量与覆盖率 | 优化F1最大点 |
| 垃圾邮件 | 精确率+召回率 | 平衡误判与漏判 | PR曲线拐点 |
6.2 业务决策框架
def business_decision(y_true, y_scores, business_type='medical', cost_matrix=None):
    """Pick a decision threshold suited to the business scenario.

    Selection rule, in order of precedence:

    * ``cost_matrix`` given: minimise the expected cost per sample. This
      takes precedence over ``business_type``, whose default value would
      otherwise make the cost-based branch unreachable.
    * ``business_type == 'medical'``: highest precision among thresholds
      whose recall exceeds 0.9 (fallback: the threshold of maximum recall).
    * ``business_type == 'fraud'``: highest recall among thresholds whose
      precision exceeds 0.8 (fallback: the threshold of maximum precision).
    * anything else: the threshold with the highest F1 score.

    The chosen threshold and the metrics it yields are printed.

    Args:
        y_true: Ground-truth binary labels (1 = positive).
        y_scores: Positive-class scores, same length as ``y_true``.
        business_type: Scenario name, see above.
        cost_matrix: Optional ``[[TN_cost, FP_cost], [FN_cost, TP_cost]]``.

    Returns:
        The selected threshold.
    """
    # Arrays are required for the vectorised comparisons below; this also
    # lets callers pass plain lists.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    if cost_matrix is not None:
        (tn_cost, fp_cost), (fn_cost, tp_cost) = cost_matrix
        fpr, tpr, thresholds = roc_curve(y_true, y_scores)
        # Weight each class's conditional cost by how often that class
        # occurs; without the weights both classes would be treated as
        # equally frequent regardless of the data.
        pos_rate = np.mean(y_true == 1)
        neg_rate = 1 - pos_rate
        expected_cost = (
            neg_rate * (tn_cost * (1 - fpr) + fp_cost * fpr)
            + pos_rate * (fn_cost * (1 - tpr) + tp_cost * tpr)
        )
        # NOTE: the first ROC threshold lies above every score, so picking
        # it means "never predict positive".
        best_threshold = thresholds[np.argmin(expected_cost)]
    else:
        precisions, recalls, thresholds = precision_recall_curve(y_true, y_scores)
        # Drop the final curve point so both arrays align with `thresholds`.
        precisions = precisions[:-1]
        recalls = recalls[:-1]

        if business_type == 'medical':
            # Medical: maximise precision subject to recall > 90%
            mask = recalls > 0.9
            if mask.any():
                best_threshold = thresholds[mask][np.argmax(precisions[mask])]
            else:
                best_threshold = thresholds[np.argmax(recalls)]
        elif business_type == 'fraud':
            # Fraud: maximise recall subject to precision > 80%
            mask = precisions > 0.8
            if mask.any():
                best_threshold = thresholds[mask][np.argmax(recalls[mask])]
            else:
                best_threshold = thresholds[np.argmax(precisions)]
        else:
            # Default: maximise F1; the epsilon avoids dividing 0 by 0.
            f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
            best_threshold = thresholds[np.argmax(f1_scores)]

    # Apply the chosen threshold and report the resulting metrics.
    best_pred = (y_scores >= best_threshold).astype(int)
    metrics = sklearn_metrics(y_true, best_pred)
    print(f"推荐阈值: {best_threshold:.3f}")
    print(f"对应指标: {metrics}")
    return best_threshold


# Medical example. Scores must line up one-to-one with the ten labels.
y_scores_medical = np.array([0.9, 0.2, 0.8, 0.45, 0.1, 0.7, 0.85, 0.4, 0.3, 0.95])
business_decision(y_true_medical, y_scores_medical, business_type='medical')

# Custom cost-matrix example
cost_matrix = [[0, 10],    # TN costs 0, FP costs 10 (legitimate flagged as fraud)
               [100, 0]]   # FN costs 100 (missed fraud), TP costs 0
y_scores_fraud = np.array([0.9, 0.1, 0.2, 0.6, 0.8, 0.3, 0.15, 0.4, 0.05, 0.25])
business_decision(y_true_fraud, y_scores_fraud, cost_matrix=cost_matrix)
七、高级主题延伸
7.1 样本不平衡处理
from imblearn.metrics import classification_report_imbalanced
def evaluate_imbalanced(y_true, y_pred):
    """Print standard and imbalance-aware reports and chart class 1.

    Class 1 is treated as the minority class: its recall, precision, F1 and
    geometric mean are shown as a bar chart.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    print("标准分类报告:")
    print(classification_report(y_true, y_pred))
    print("\n不平衡数据集分类报告:")
    print(classification_report_imbalanced(y_true, y_pred))

    # Pull the minority-class scores out of the dict form of the report.
    report = classification_report_imbalanced(y_true, y_pred, output_dict=True)
    # NOTE(review): per-class entries are expected under the class name
    # (the label rendered as text when no target_names are given); fall back
    # to the raw label in case the installed version keys by label - confirm.
    minority = report['1'] if '1' in report else report[1]
    # imbalanced-learn abbreviates its score names ('pre', 'rec', 'f1',
    # 'geo'); the long names are only used as bar labels here.
    metrics = {
        'recall': minority['rec'],
        'precision': minority['pre'],
        'f1': minority['f1'],
        'geo_mean': minority['geo'],
    }

    plt.figure(figsize=(8, 5))
    plt.bar(list(metrics.keys()), list(metrics.values()))
    plt.title('少数类(1)关键指标')
    plt.ylim(0, 1)
    plt.ylabel('分数')
    plt.show()


# Example usage (class 1 is the minority class)
y_true_imb = [0]*90 + [1]*10
y_pred_imb = [0]*85 + [1]*5 + [0]*5 + [1]*5
evaluate_imbalanced(y_true_imb, y_pred_imb)
7.2 多模型对比评估
def compare_models(models, X_test, y_test):
    """Score several fitted binary classifiers and chart them side by side.

    Args:
        models: Mapping of display name to fitted model exposing
            ``predict``.
        X_test: Test features passed to each model's ``predict``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        DataFrame indexed by model name with the columns 'Accuracy',
        'Precision', 'Recall' and 'F1'.
    """
    results = []
    for name, model in models.items():
        y_pred = model.predict(X_test)
        results.append({
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
        })
    df = pd.DataFrame(results).set_index('Model')

    # Create the axes explicitly and hand it to pandas: DataFrame.plot opens
    # a figure of its own when no `ax` is given, which would leave a blank
    # figure created by plt.figure() behind.
    _, ax = plt.subplots(figsize=(10, 6))
    df.plot(kind='bar', rot=0, ax=ax)
    ax.set_title('模型性能对比')
    ax.set_ylabel('分数')
    ax.set_ylim(0, 1)
    # Anchor the legend outside the axes so it does not cover the bars.
    ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
    plt.show()
    return df


# Example usage
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Synthetic binary dataset so the example runs on its own.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

models = {
    'Logistic Regression': LogisticRegression().fit(X_train, y_train),
    'Random Forest': RandomForestClassifier().fit(X_train, y_train),
    'SVM': SVC().fit(X_train, y_train),
}
compare_models(models, X_test, y_test)
通过本指南,您已掌握了如何在实际项目中应用准确率、召回率和F1分数来全面评估AI模型性能。记住,没有放之四海而皆准的"最佳指标",关键是根据业务需求选择合适的评估策略,并通过可视化工具清晰呈现评估结果。