第二十三天打卡-CSDN博客

本文链接：https://blog.csdn.net/weixin_64192256/article/details/147933352
作业：
整理下全部逻辑的先后顺序，看看能不能制作出适合所有机器学习的通用pipeline
数据预处理 → 特征选择 → 降维 → 模型训练

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical
import shap

# -------------------- 1. Data loading and splitting --------------------
# Load the raw data; the target column is assumed to be 'Credit Default'.
target_column = 'Credit Default'
data = pd.read_csv('data.csv')
y = data[target_column]
X = data.drop(target_column, axis=1)

# Split BEFORE any preprocessing so that no statistic (median, mode, scaling
# parameters, ...) is ever computed on test rows, which would leak information.
# stratify=y keeps the class ratio of the target identical in both splits,
# which matters because the model is later scored with macro-F1.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------- 2. Feature-processing definitions --------------------
# Column groups by feature type (adjust to match the actual dataset).
numeric_features = ['Annual Income', 'Current Loan Amount']
ordinal_features = ['Years in current job']
# One ordered list of levels per ordinal feature, lowest to highest.
# Every level must be spelled out as a string: a literal `...` inside the list
# is Python's Ellipsis object, not a range, and would make the encoder fail.
ordinal_categories = [[
    '< 1 year',
    '1 year',
    '2 years',
    '3 years',
    '4 years',
    '5 years',
    '6 years',
    '7 years',
    '8 years',
    '9 years',
    '10+ years',
]]
nominal_features = ['Purpose']

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Ordinal columns: fill gaps with the most frequent level, then map levels to
# integers in the declared order; unseen levels are encoded as -1.
ordinal_encoder = OrdinalEncoder(
    categories=ordinal_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', ordinal_encoder),
])

# Nominal columns: fill gaps with the most frequent value, then one-hot encode
# into a dense array; categories unseen during fit are ignored.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its transformer. Columns not listed in any group
# are dropped from the output.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('ord', ordinal_transformer, ordinal_features),
    ('nom', nominal_transformer, nominal_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

# -------------------- 3. Build the complete Pipeline --------------------
# Steps run in order: preprocessing -> univariate feature selection ->
# dimensionality reduction -> classifier. The step names are the prefixes used
# by the hyperparameter search spaces.
pipeline_steps = [
    ('preprocessor', preprocessor),
    # Keep the 10 features with the highest ANOVA F-score
    ('feature_selector', SelectKBest(score_func=f_classif, k=10)),
    # Keep enough principal components to retain 95% of the variance
    ('dim_reducer', PCA(n_components=0.95)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
full_pipeline = Pipeline(steps=pipeline_steps)

# -------------------- 4. Grid-search hyperparameter tuning --------------------
# Each key has the form "<step name>__<parameter name>" so the candidate
# values are routed to the matching pipeline step.
param_grid = dict(
    feature_selector__k=[5, 10, 15],              # number of selected features
    dim_reducer__n_components=[0.8, 0.9, 0.95],   # variance retained by PCA
    classifier__n_estimators=[100, 200],          # number of trees in the forest
    classifier__max_depth=[None, 5, 10],
)

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# macro-averaged F1 and parallelized over all available cores.
grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    scoring='f1_macro',
    n_jobs=-1,
    cv=3,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score
print("最佳参数组合:", grid_search.best_params_)
print("最佳验证分数:", grid_search.best_score_)

# -------------------- 5. Bayesian-optimization hyperparameter tuning --------------------
# Search space expressed with skopt dimension objects; keys use the same
# "<step name>__<parameter name>" convention as the grid search.
bayes_space = {
    'feature_selector__k': Integer(5, 20),
    'dim_reducer__n_components': Categorical([0.8, 0.85, 0.9, 0.95]),
    'classifier__n_estimators': Integer(50, 300),
    'classifier__max_depth': Integer(3, 15)
}

# Bayesian search over the same pipeline: 30 sampled configurations, 3-fold
# cross-validation, macro-F1 scoring, all cores.
# random_state is fixed (matching the seed used for the data split and the
# classifier) so that the sampled configurations, and therefore the reported
# best parameters, are reproducible between runs.
bayes_search = BayesSearchCV(
    estimator=full_pipeline,
    search_spaces=bayes_space,
    n_iter=30,  # number of configurations to evaluate
    cv=3,
    n_jobs=-1,
    scoring='f1_macro',
    random_state=42
)

# Run the optimization on the training split only
bayes_search.fit(X_train, y_train)

# Report the best parameter combination and its cross-validated score
print("\n贝叶斯优化最佳参数:", bayes_search.best_params_)
print("贝叶斯优化最佳分数:", bayes_search.best_score_)

# -------------------- 6. Model evaluation and visualization --------------------
# Take the best pipeline found by the Bayesian search and predict on the
# held-out test split.
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)

# Print the classification report for the test split
print("\n测试集分类报告:")
test_report = classification_report(y_test, y_pred)
print(test_report)

# -------------------- 7. SHAP feature-importance analysis --------------------
# Names of the columns produced by the preprocessor, in its output order
# (numeric, ordinal, one-hot). These describe the data BEFORE feature selection
# and PCA, so they are kept for reference only and must not be used to label
# the classifier's inputs.
onehot_columns = best_model.named_steps['preprocessor'].named_transformers_['nom'].named_steps['onehot'].get_feature_names_out(nominal_features)
all_features = numeric_features + ordinal_features + list(onehot_columns)

# The classifier was fitted on the output of
# preprocessor -> feature_selector -> dim_reducer, so SHAP has to be given data
# pushed through all of those steps. Passing preprocessor-only output would
# hand the forest a matrix with the wrong number (and meaning) of columns.
# Slicing off the last step yields the already-fitted feature pipeline.
feature_steps = best_model[:-1]
X_processed = feature_steps.transform(X_train)

# After PCA every column is a principal component, not an original feature.
component_names = [f'PC{i + 1}' for i in range(X_processed.shape[1])]

# Tree-specific SHAP explainer for the fitted random forest
explainer = shap.TreeExplainer(best_model.named_steps['classifier'])

# Compute SHAP values on exactly the representation the classifier consumes
shap_values = explainer.shap_values(X_processed)

# Summary plot of per-component importance
shap.summary_plot(shap_values, X_processed, feature_names=component_names)
@浙大疏锦行