This script uses a genetic algorithm to optimize the hyperparameters of an XGBoost regression model and generates a detailed report. The user supplies the path to an Excel file containing the data. The script loads the data, preprocesses it (missing-value removal and optional PCA dimensionality reduction), then sets up the appropriate evaluation metric for the task type the user selects (classification or regression). A genetic algorithm then searches over the XGBoost hyperparameters, including the maximum tree depth, the number of trees, and the learning rate. After training and evaluation, the script produces a Word report recording the model's parameter settings, training time, and evaluation results on both the training and test sets.
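For orientation, here is a minimal sketch of the input the script expects. The column names and values are purely illustrative (no such file ships with the script), and writing .xlsx files assumes an Excel engine such as openpyxl is installed:

import pandas as pd

# Hypothetical input layout: numeric feature columns plus one target column.
demo = pd.DataFrame({
    'feature_1': [0.12, 0.48, 0.33, 0.91],
    'feature_2': [1.5, 2.1, 1.8, 2.4],
    'target': [10.2, 14.7, 12.9, 16.1],  # the dependent-variable column named at the prompt
})
demo.to_excel('demo_data.xlsx', index=False)  # a file like this can then be fed to the script

The full script follows.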
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import numpy as np
import xgboost as xgb
from docx import Document
import matplotlib.pyplot as plt
import time
import os
# Configure a Chinese font and proper minus-sign rendering for matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
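# Note: SimHei is a common Chinese font on Windows; on Linux/macOS you may need to
# substitute an installed CJK font (e.g. 'Noto Sans CJK SC') for labels to render.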
# Global settings, kept here for easy tuning and debugging
DATA_SPLIT_RATIO = 0.2  # fraction of the data held out as the test set
DATA_SHUFFLE = True  # whether to shuffle before splitting
CROSS_VALIDATION = True  # whether to perform cross-validation (declared but not used in this version)
NGEN = 15  # number of GA generations
POP_SIZE = 50  # GA population size
MUTPB = 0.2  # mutation probability
CXPB = 0.5  # crossover probability
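# Rough cost estimate: eaSimple evaluates at most POP_SIZE individuals per generation,
# so the GA performs up to POP_SIZE * (NGEN + 1) = 800 XGBoost fits with these settings;
# shrink POP_SIZE or NGEN first if runs are too slow.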
def load_data(file_path):
    # Read the Excel file (replace with your own data file)
    data = pd.read_excel(file_path)
    data = data.dropna()  # drop rows with missing values
return data
def pca_analysis(X):
    # Standardize the features to zero mean and unit variance
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Principal component analysis (PCA); keep as many components as the data allows
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
    # Print the total-variance-explained table
print("总方差解释表格:")
print("成分\t特征根\t方差解释率(%)\t累积方差解释率(%)")
for i in range(len(explained_variance)):
print(f"{i + 1}\t{explained_variance[i]:.3f}\t{explained_variance_ratio[i] * 100:.3f}\t{cumulative_variance[i] * 100:.3f}")
return X_scaled, pca
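# A common rule of thumb (not enforced by this script): keep the first k components
# whose cumulative explained-variance ratio reaches roughly 85-95%; the table printed
# above is meant to guide that choice.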
def ask_user_for_choices(file_path):
print("\n请查看Excel数据的前几行:")
data = load_data(file_path)
print(data.head())
preprocess = input("数据是否已完成预处理(数据清洗和标准化)?(是/否): ").strip().lower()
if preprocess != '是':
print("请先进行数据预处理。")
return
task_type = input("请选择任务类型(分类/回归): ").strip().lower()
target_column = input("请指定Excel中的因变量列名: ").strip()
X = data.drop(columns=[target_column])
y = data[target_column]
if task_type == '分类':
if pd.api.types.is_numeric_dtype(y) and len(y.unique()) > 2:
print("分类任务需要目标变量为类别标签。")
return
else:
            metrics_function = accuracy_score  # note: the GA fitness below still optimizes regression MSE
elif task_type == '回归':
if not pd.api.types.is_numeric_dtype(y):
print("回归任务需要目标变量为连续数值。")
return
else:
metrics_function = mean_squared_error
else:
print("任务类型无效。")
return
pca_replace = input("是否使用主成分分析的特征代替数据?(是/否): ").strip().lower()
if pca_replace == '是':
X_scaled, pca = pca_analysis(X)
num_components = int(input("请选择前几个主成分作为特征: "))
X = X_scaled[:, :num_components]
else:
        X = X.values  # keep the raw features as a NumPy array
    # Genetic-algorithm and XGBoost model setup
    # Remove classes left over from a previous run so creator.create() does not complain
if hasattr(creator, "FitnessMax"):
del creator.FitnessMax
if hasattr(creator, "Individual"):
del creator.Individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_int_max_depth", np.random.randint, 3, 15)
toolbox.register("attr_int_n_estimators", np.random.randint, 50, 500)
toolbox.register("attr_float_learning_rate", np.random.uniform, 0.01, 0.3)
toolbox.register("attr_float_subsample", np.random.uniform, 0.5, 1.0)
toolbox.register("attr_float_colsample_bytree", np.random.uniform, 0.5, 1.0)
toolbox.register("individual", tools.initCycle, creator.Individual,
(toolbox.attr_int_max_depth, toolbox.attr_int_n_estimators, toolbox.attr_float_learning_rate,
toolbox.attr_float_subsample, toolbox.attr_float_colsample_bytree), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutPolynomialBounded, low=[3, 50, 0.01, 0.5, 0.5],
up=[15, 500, 0.3, 1.0, 1.0], eta=1.0, indpb=MUTPB)
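    # The low/up bounds above mirror the attribute-generation ranges, so mutated genes
    # stay inside the search space; a larger eta keeps offspring closer to the parent value.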
toolbox.register("select", tools.selTournament, tournsize=3)
def evaluate(individual):
max_depth, n_estimators, learning_rate, subsample, colsample_bytree = individual
        # Clamp max_depth to the valid search range [1, 15]
        max_depth = int(np.clip(max_depth, 1, 15))
        # Clamp learning_rate, subsample and colsample_bytree to their search ranges
        learning_rate = np.clip(learning_rate, 0.01, 0.3)
        subsample = np.clip(subsample, 0.5, 1.0)
        colsample_bytree = np.clip(colsample_bytree, 0.5, 1.0)
model = xgb.XGBRegressor(max_depth=max_depth, n_estimators=int(n_estimators),
learning_rate=learning_rate, subsample=subsample,
colsample_bytree=colsample_bytree, objective='reg:squarederror', random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
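        # The split is deterministic (random_state=42), so every individual is scored
        # on the same train/test partition.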
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
        return (1 / (1 + mse),)  # fitness in (0, 1]; maximizing it minimizes MSE
toolbox.register("evaluate", evaluate)
    # Run the genetic algorithm
start_time = time.time()
population = toolbox.population(n=POP_SIZE)
algorithms.eaSimple(population, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=NGEN,
verbose=True, stats=None, halloffame=None)
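    # eaSimple evolves `population` in place, so the best individual can be selected
    # from it after the run; stats and halloffame are intentionally disabled here.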
end_time = time.time()
elapsed_time = end_time - start_time
    # Best hyperparameters found
best_ind = tools.selBest(population, 1)[0]
max_depth_best, n_estimators_best, learning_rate_best, subsample_best, colsample_bytree_best = best_ind
print(f"最优max_depth值:{int(max_depth_best)}")
print(f"最优n_estimators值:{int(n_estimators_best)}")
print(f"最优learning_rate值:{learning_rate_best}")
print(f"最优subsample值:{subsample_best}")
print(f"最优colsample_bytree值:{colsample_bytree_best}")
    # Generate the Word report
generate_report(xgb.XGBRegressor, X, y, best_ind, int(max_depth_best), int(n_estimators_best),
learning_rate_best, subsample_best, colsample_bytree_best, task_type, elapsed_time, file_path)
def generate_report(model_class, X, y, best_ind, max_depth_best, n_estimators_best,
learning_rate_best, subsample_best, colsample_bytree_best,
task_type, elapsed_time, file_path):
    # Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42
)
    # Train the final model with the best hyperparameters
model = model_class(max_depth=max_depth_best, n_estimators=n_estimators_best,
learning_rate=learning_rate_best, subsample=subsample_best,
colsample_bytree=colsample_bytree_best, objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
    # Compute performance metrics
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
    # Create the report document
document = Document()
document.add_heading('遗传-XGBoost模型报告', 0)
    # Analysis-steps section
document.add_heading('分析步骤', level=1)
steps = [
"通过训练集数据来建立XGBoost回归模型。",
"通过建立的XGBoost来计算特征重要性。",
"将建立的XGBoost回归模型应用到训练、测试数据,得到模型评估结果。",
"由于XGBoost具有随机性,每次运算的结果不一样,若保存本次训练模型,后续可以直接上传数据代入到本次训练模型进行计算预测。",
"注:XGBoost无法像传统模型一样得到确定的方程,通常通过测试数据预测精度来对模型进行评价。"
]
for step in steps:
document.add_paragraph(step)
    # Detailed-conclusions section
document.add_heading('详细结论', level=1)
    # Output 1: model parameters
document.add_heading('输出结果1:模型参数', level=2)
parameters = {
'训练用时': f'{elapsed_time:.3f}s',
'数据切分': f'{DATA_SPLIT_RATIO}',
'数据洗牌': '是' if DATA_SHUFFLE else '否',
'交叉验证': '否',
'基学习器': 'gbtree',
'基学习器数量': f'{n_estimators_best}',
'学习率': f'{learning_rate_best}',
'L1正则项': '0',
'L2正则项': '1',
        '样本采样率': f'{subsample_best}',
'树特征采样率': f'{colsample_bytree_best}',
'节点特征采样率': '1',
        '叶子节点中样本的最小权重': '1',
'树的最大深度': f'{max_depth_best}'
}
table = document.add_table(rows=1, cols=2)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = '参数名'
hdr_cells[1].text = '参数值'
for param, value in parameters.items():
row_cells = table.add_row().cells
row_cells[0].text = param
row_cells[1].text = value
document.add_paragraph('图表说明:上表展示了模型各项参数配置以及模型训练时长。')
""" # 输出结果2:特征重要性
document.add_heading('输出结果2:特征重要性', level=2)
importance_df = pd.DataFrame(feature_importance, columns=['特征名称', '特征重要性'])
importance_df['特征重要性'] = importance_df['特征重要性'] * 100 # 转换为百分比
table = document.add_table(rows=1, cols=2)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = '特征名称'
hdr_cells[1].text = '特征重要性'
for _, row in importance_df.iterrows():
row_cells = table.add_row().cells
row_cells[0].text = row['特征名称']
row_cells[1].text = f'{row["特征重要性"]:.2f}%'
document.add_paragraph('图表说明:上柱形图或表格展示了各特征(自变量)的重要性比例。') """
    # Output 3: model evaluation results
document.add_heading('输出结果3:模型评估结果', level=2)
eval_results = {
'MSE': [mse_train, mse_test],
'RMSE': [rmse_train, rmse_test],
'MAE': [mae_train, mae_test],
'R²': [r2_train, r2_test]
}
eval_df = pd.DataFrame(eval_results, index=['训练集', '测试集'])
    # Evaluation table with a leading column labelling the dataset (训练集/测试集)
    table = document.add_table(rows=1, cols=len(eval_results) + 1)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = '数据集'
    for idx, col in enumerate(eval_results.keys()):
        hdr_cells[idx + 1].text = col
    for label, row in eval_df.iterrows():
        row_cells = table.add_row().cells
        row_cells[0].text = str(label)
        for idx, value in enumerate(row):
            row_cells[idx + 1].text = f'{value:.4f}'
    document.add_paragraph(
        '图表说明:上表展示了训练集和测试集的预测评价指标,通过量化指标来衡量XGBoost的预测效果。\n'
        '● MSE(均方误差):预测值与实际值之差平方的期望值。取值越小,模型准确度越高。\n'
        '● RMSE(均方根误差):为MSE的平方根,取值越小,模型准确度越高。\n'
        '● MAE(平均绝对误差):绝对误差的平均值,能反映预测值误差的实际情况。取值越小,模型准确度越高。\n'
        '● R²:将预测结果与仅使用因变量均值的基准相比,越接近1,模型准确度越高。'
    )
    # Save the report
file_title, _ = os.path.splitext(os.path.basename(file_path))
report_path = f'遗传-XGBoost模型报告_{file_title}.docx'
document.save(report_path)
print(f"报告已保存为: {report_path}")
if __name__ == "__main__":
    file_path = '样条min-max标准化.xlsx'  # replace with the path to your data file
ask_user_for_choices(file_path)
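As a usage sketch (script name hypothetical): save this file as, say, ga_xgboost_report.py next to the Excel data, run python ga_xgboost_report.py, and answer the interactive prompts (preprocessing status, task type, target column, PCA choice); the .docx report is then written to the working directory.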