@浙大疏锦行
DAY 31 文件的规范拆分和写法
知识点回顾
1. 规范的文件命名
2. 规范的文件夹管理
3. 机器学习项目的拆分
4. 编码格式和类型注解
作业:尝试针对之前的心脏病项目,将其拆分为规范的项目文件,并思考哪些部分可以在未来复用。
预处理:
import pandas as pd
import numpy as np
from typing import Tuple, Dict
def load_data(file_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The file's contents as parsed by ``pandas.read_csv`` with its
        default options.
    """
    frame = pd.read_csv(file_path)
    return frame
def handle_missing_values(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in numeric columns with each column's mode.

    Only columns of dtype ``int64`` or ``float64`` are considered; every
    other column is returned unchanged. The input frame is not modified.

    Args:
        data: Data frame that may contain missing values.

    Returns:
        A copy of ``data`` in which the missing entries of each selected
        column are replaced by that column's most frequent value. A column
        with no non-missing values has no mode and is left as-is.
    """
    data_clean = data.copy()
    numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
    for feature in numeric_features:
        modes = data[feature].mode()
        # An all-NaN column yields an empty mode Series; indexing it would raise.
        if modes.empty:
            continue
        # Assign the result back instead of calling fillna(inplace=True) on the
        # column selection: the in-place form works on a temporary object and
        # does not update data_clean under pandas Copy-on-Write.
        data_clean[feature] = data_clean[feature].fillna(modes.iloc[0])
    return data_clean
if __name__ == "__main__":
    # Smoke test: load the raw heart-disease CSV and run the missing-value
    # step on it. The path below is machine-specific.
    raw_csv_path = r"C:\Users\wangzhikai\Desktop\python60-days-challenge-master\day31练习\data\raw\heart.csv"
    data = load_data(raw_csv_path)
    data_clean = handle_missing_values(data)
    print("数据预处理完成!")
训练:
# -*- coding: utf-8 -*-
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import time
import joblib # 用于保存模型
from typing import Tuple # 用于类型注解
from data.preprocessing import load_data,handle_missing_values
def prepare_data(
    data_path: str = r"C:\Users\wangzhikai\Desktop\python60-days-challenge-master\day31练习\data\raw\heart.csv",
    target_column: str = 'target',
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple:
    """Load, clean and split the dataset into train and test sets.

    Every parameter defaults to the value that was previously hard-coded, so
    calling ``prepare_data()`` with no arguments behaves as before.

    Args:
        data_path: CSV file to load. The default is a machine-specific
            absolute path; pass your own path on any other machine.
        target_column: Name of the label column to separate from the features.
        test_size: Passed through to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A 4-tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Load the raw file and fill missing values.
    data = load_data(data_path)
    data_clean = handle_missing_values(data)

    # Separate features from the label column.
    X = data_clean.drop([target_column], axis=1)
    y = data_clean[target_column]

    # Hold out part of the data for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test
def train_model(X_train, y_train, model_params=None) -> RandomForestClassifier:
    """Fit a random forest classifier on the training data.

    Args:
        X_train: Training features.
        y_train: Training labels.
        model_params: Keyword arguments for ``RandomForestClassifier``.
            When ``None``, ``{'random_state': 42}`` is used.

    Returns:
        The fitted classifier.
    """
    params = {'random_state': 42} if model_params is None else model_params
    forest = RandomForestClassifier(**params)
    forest.fit(X_train, y_train)
    return forest
def evaluate_model(model, X_test, y_test) -> None:
    """Print a classification report and a confusion matrix for the model.

    Args:
        model: Fitted model exposing ``predict``.
        X_test: Test features.
        y_test: Test labels.
    """
    predictions = model.predict(X_test)
    # Each section is a heading followed by the metric computed on the
    # same predictions; order matches the printed output.
    sections = (
        ("\n分类报告:", classification_report),
        ("\n混淆矩阵:", confusion_matrix),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))
def save_model(model, model_path: str) -> None:
    """Serialize the model to disk with joblib.

    The parent directory of ``model_path`` is created if it does not exist.

    Args:
        model: Fitted model to persist.
        model_path: Destination file path. May be a bare file name, in which
            case the file is written to the current working directory.
    """
    target_dir = os.path.dirname(model_path)
    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    joblib.dump(model, model_path)
    print(f"\n模型已保存至: {model_path}")
if __name__ == "__main__":
    # Build the train/test split.
    train_features, test_features, train_labels, test_labels = prepare_data()

    # Time only the fitting step.
    started_at = time.time()
    model = train_model(train_features, train_labels)
    elapsed = time.time() - started_at
    print(f"\n训练耗时: {elapsed:.4f} 秒")

    # Report metrics on the held-out split, then persist the model.
    evaluate_model(model, test_features, test_labels)
    save_model(model, "models/random_forest_model.joblib")
可视化:
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import numpy as np
from typing import Any
def plot_feature_importance_shap(model: Any, X_test, save_path: str = None) -> None:
    """Draw a SHAP feature-importance bar chart for a tree model.

    Args:
        model: Fitted tree-based model accepted by ``shap.TreeExplainer``.
        X_test: Samples to explain.
        save_path: If given, the figure is also written to this path.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Depending on the SHAP version and the model, shap_values may be a list
    # of per-class arrays, a 3-D array (samples, features, classes), or a
    # plain 2-D array. Select the values for class 0 in each layout instead
    # of assuming the 3-D form.
    if isinstance(shap_values, list):
        class_values = shap_values[0]
    elif np.ndim(shap_values) == 3:
        class_values = shap_values[:, :, 0]
    else:
        class_values = shap_values

    plt.figure(figsize=(12, 8))
    # show=False keeps the figure open so the title and savefig below apply.
    shap.summary_plot(class_values, X_test, plot_type="bar", show=False)
    plt.title("SHAP特征重要性")
    if save_path:
        plt.savefig(save_path)
        print(f"特征重要性图已保存至: {save_path}")
    plt.show()
def plot_confusion_matrix(y_true, y_pred, save_path: str = None) -> None:
    """Draw the confusion matrix as an annotated heat map.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        save_path: If given, the figure is also written to this path.
    """
    # Imported here because the visualization imports do not bring
    # confusion_matrix into scope; without it the call below is a NameError.
    from sklearn.metrics import confusion_matrix

    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_true, y_pred)
    # fmt='d' renders the cell counts as integers.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('混淆矩阵')
    plt.ylabel('真实标签')
    plt.xlabel('预测标签')
    if save_path:
        plt.savefig(save_path)
        print(f"混淆矩阵图已保存至: {save_path}")
    plt.show()
def set_plot_style():
    """Apply the seaborn style sheet and configure fonts for Chinese labels.

    The seaborn style is applied under whichever name the installed
    Matplotlib provides; if neither name is available the style step is
    skipped and only the font settings are applied.
    """
    # Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
    # later removed the old name, so prefer the new one and fall back.
    for style_name in ('seaborn-v0_8', 'seaborn'):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    # SimHei provides CJK glyphs for the Chinese titles and axis labels.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    # Render the minus sign as ASCII so it displays with the CJK font.
    plt.rcParams['axes.unicode_minus'] = False
if __name__ == "__main__":
    # Apply the shared plot style; further manual checks can be added here.
    set_plot_style()
    print("可视化模块加载成功!")