遗传算法优化XGBoost回归代码(输出Word报告)

这段代码的功能是通过遗传算法优化XGBoost回归模型的超参数,并生成详细的报告。用户需要输入一个包含数据的Excel文件路径。代码会读取数据,进行预处理(如数据清洗和PCA降维),然后根据用户选择的任务类型(分类或回归)设置适当的评估标准。接着,代码使用遗传算法优化XGBoost模型的超参数,包括最大深度、树的数量和学习率等。经过训练和评估,最终生成一个Word报告,详细记录了模型的参数设置、训练时间以及在训练集和测试集上的评估结果。
在这里插入图片描述

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import numpy as np
import xgboost as xgb
from docx import Document
import matplotlib.pyplot as plt
import time
import os

# Set the Chinese font and the minus-sign display for matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Module-level settings, kept global so they are easy to tune and debug
DATA_SPLIT_RATIO = 0.2  # data split ratio (fraction used as the test set)
DATA_SHUFFLE = True  # whether to shuffle the data when splitting
CROSS_VALIDATION = True  # whether to cross-validate; NOTE(review): not read by any code visible in this file
NGEN = 15  # number of genetic-algorithm generations
POP_SIZE = 50  # population size
MUTPB = 0.2  # mutation probability
CXPB = 0.5  # crossover probability

def load_data(file_path):
    """Read the Excel workbook at ``file_path`` and drop incomplete rows.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The loaded DataFrame with every row containing a missing value removed.
    """
    frame = pd.read_excel(file_path)
    return frame.dropna()

def pca_analysis(X):
    """Standardise ``X``, fit a PCA on it and print the explained-variance table.

    Args:
        X: Feature matrix to analyse.

    Returns:
        tuple: ``(X_scaled, pca)`` -- the standardised data and the fitted PCA
        object.
    """
    X_scaled = StandardScaler().fit_transform(X)

    # Ask for as many components as the smaller dimension of the data
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
    eigenvalues = pca.explained_variance_
    ratios = pca.explained_variance_ratio_
    running_total = np.cumsum(ratios)

    # Total-variance-explained table
    print("总方差解释表格:")
    print("成分\t特征根\t方差解释率(%)\t累积方差解释率(%)")
    rows = zip(eigenvalues, ratios, running_total)
    for rank, (eigenvalue, ratio, cumulative) in enumerate(rows, start=1):
        print(f"{rank}\t{eigenvalue:.3f}\t{ratio * 100:.3f}\t{cumulative * 100:.3f}")

    return X_scaled, pca

# Hyperparameter search space shared by mutation and decoding.
# Gene order: max_depth, n_estimators, learning_rate, subsample, colsample_bytree
_GENE_LOW = [3, 50, 0.01, 0.5, 0.5]
_GENE_UP = [15, 500, 0.3, 1.0, 1.0]


def _decode_individual(individual):
    """Turn a raw GA individual into valid XGBoost hyperparameters.

    Blend crossover can move genes outside the search space, so every gene is
    clipped to its bounds and the two integer genes are truncated to ``int``.

    Returns:
        tuple: ``(max_depth, n_estimators, learning_rate, subsample,
        colsample_bytree)``.
    """
    clipped = [
        min(max(float(gene), low), up)
        for gene, low, up in zip(individual, _GENE_LOW, _GENE_UP)
    ]
    max_depth, n_estimators, learning_rate, subsample, colsample_bytree = clipped
    return int(max_depth), int(n_estimators), learning_rate, subsample, colsample_bytree


def ask_user_for_choices(file_path):
    """Interactively configure and run the GA-driven XGBoost regression search.

    The function loads the workbook, asks the user for the task type, the
    target column and whether to replace the features by principal components,
    then tunes ``max_depth``, ``n_estimators``, ``learning_rate``,
    ``subsample`` and ``colsample_bytree`` with a genetic algorithm and hands
    the best configuration to ``generate_report``.

    Only regression is implemented (the model, the fitness and the report are
    all regression-specific), so a classification request is rejected with a
    message instead of failing later.

    NOTE(review): the same held-out split is used for the GA fitness and for
    the final report, so the reported test metrics are optimistic.

    Args:
        file_path: Path of the Excel file holding the data.

    Returns:
        None. On invalid input a message is printed and the function returns
        early without producing a report.
    """
    print("\n请查看Excel数据的前几行:")
    data = load_data(file_path)
    print(data.head())

    preprocess = input("数据是否已完成预处理(数据清洗和标准化)?(是/否): ").strip().lower()
    if preprocess != '是':
        print("请先进行数据预处理。")
        return

    task_type = input("请选择任务类型(分类/回归): ").strip().lower()
    target_column = input("请指定Excel中的因变量列名: ").strip()
    if target_column not in data.columns:
        print(f"未找到因变量列: {target_column}")
        return
    X = data.drop(columns=[target_column])
    y = data[target_column]

    if task_type == '分类':
        if pd.api.types.is_numeric_dtype(y) and len(y.unique()) > 2:
            print("分类任务需要目标变量为类别标签。")
        else:
            # Everything below is regression-only; stop here rather than
            # fitting a regressor on class labels.
            print("当前流程仅实现了XGBoost回归,暂不支持分类任务。")
        return
    if task_type != '回归':
        print("任务类型无效。")
        return
    if not pd.api.types.is_numeric_dtype(y):
        print("回归任务需要目标变量为连续数值。")
        return

    pca_replace = input("是否使用主成分分析的特征代替数据?(是/否): ").strip().lower()
    if pca_replace == '是':
        X_scaled, pca = pca_analysis(X)
        try:
            num_components = int(input("请选择前几个主成分作为特征: "))
        except ValueError:
            print("主成分个数必须为整数。")
            return
        if not 1 <= num_components <= pca.n_components_:
            print(f"主成分个数必须在1到{pca.n_components_}之间。")
            return
        # Project onto the principal axes; slicing X_scaled directly would only
        # select the first standardised original columns.
        X = pca.transform(X_scaled)[:, :num_components]
    else:
        X = X.values  # keep the original data

    # The split is deterministic (fixed random_state), so compute it once
    # instead of on every fitness evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42
    )

    # DEAP keeps created classes on the module; drop stale ones so repeated
    # calls do not collide with earlier definitions.
    if hasattr(creator, "FitnessMax"):
        del creator.FitnessMax
    if hasattr(creator, "Individual"):
        del creator.Individual

    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_int_max_depth", np.random.randint, 3, 15)
    toolbox.register("attr_int_n_estimators", np.random.randint, 50, 500)
    toolbox.register("attr_float_learning_rate", np.random.uniform, 0.01, 0.3)
    toolbox.register("attr_float_subsample", np.random.uniform, 0.5, 1.0)
    toolbox.register("attr_float_colsample_bytree", np.random.uniform, 0.5, 1.0)
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     (toolbox.attr_int_max_depth, toolbox.attr_int_n_estimators,
                      toolbox.attr_float_learning_rate, toolbox.attr_float_subsample,
                      toolbox.attr_float_colsample_bytree), n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxBlend, alpha=0.5)
    toolbox.register("mutate", tools.mutPolynomialBounded, low=list(_GENE_LOW),
                     up=list(_GENE_UP), eta=1.0, indpb=MUTPB)
    toolbox.register("select", tools.selTournament, tournsize=3)

    def evaluate(individual):
        """Score one individual by the held-out MSE of the model it encodes."""
        max_depth, n_estimators, learning_rate, subsample, colsample_bytree = \
            _decode_individual(individual)
        model = xgb.XGBRegressor(max_depth=max_depth, n_estimators=n_estimators,
                                 learning_rate=learning_rate, subsample=subsample,
                                 colsample_bytree=colsample_bytree,
                                 objective='reg:squarederror', random_state=42)
        model.fit(X_train, y_train)
        mse = mean_squared_error(y_test, model.predict(X_test))
        # Fitness lies in (0, 1] and grows as the MSE shrinks.
        return (1 / (1 + mse),)

    toolbox.register("evaluate", evaluate)

    # Run the genetic algorithm; the hall of fame keeps the best individual
    # seen in any generation, not just the survivors of the last one.
    hall_of_fame = tools.HallOfFame(1)
    start_time = time.time()
    population = toolbox.population(n=POP_SIZE)
    algorithms.eaSimple(population, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=NGEN,
                        verbose=True, stats=None, halloffame=hall_of_fame)
    elapsed_time = time.time() - start_time

    # Decode with the same rule used during evaluation so the reported model
    # is exactly the one that earned the best fitness.
    best_ind = hall_of_fame[0]
    (max_depth_best, n_estimators_best, learning_rate_best,
     subsample_best, colsample_bytree_best) = _decode_individual(best_ind)
    print(f"最优max_depth值:{max_depth_best}")
    print(f"最优n_estimators值:{n_estimators_best}")
    print(f"最优learning_rate值:{learning_rate_best}")
    print(f"最优subsample值:{subsample_best}")
    print(f"最优colsample_bytree值:{colsample_bytree_best}")

    # Build the report
    generate_report(xgb.XGBRegressor, X, y, best_ind, max_depth_best, n_estimators_best,
                    learning_rate_best, subsample_best, colsample_bytree_best,
                    task_type, elapsed_time, file_path)

def generate_report(model_class, X, y, best_ind, max_depth_best, n_estimators_best,
                     learning_rate_best, subsample_best, colsample_bytree_best,
                     task_type, elapsed_time, file_path):
    """Train the tuned model and write a Word report of its settings and scores.

    The data is split with the module-level ``DATA_SPLIT_RATIO`` and
    ``DATA_SHUFFLE`` settings, the model is fitted on the training part, and
    MSE / RMSE / MAE / R² are computed on both parts. The report is saved in
    the current working directory as ``遗传-XGBoost模型报告_<file title>.docx``.

    Args:
        model_class: Estimator class to instantiate (called with XGBoost
            regressor keyword arguments).
        X: Feature matrix.
        y: Target values.
        best_ind: Best GA individual; accepted for interface compatibility and
            not used here.
        max_depth_best: Tuned maximum tree depth.
        n_estimators_best: Tuned number of trees.
        learning_rate_best: Tuned learning rate.
        subsample_best: Tuned row subsampling ratio.
        colsample_bytree_best: Tuned per-tree column subsampling ratio.
        task_type: Task type chosen by the user; accepted for interface
            compatibility and not used here.
        elapsed_time: Search duration in seconds, shown in the report.
        file_path: Path of the source data file; its base name is embedded in
            the report file name.

    Returns:
        None.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42
    )

    # Train the model
    model = model_class(max_depth=max_depth_best, n_estimators=n_estimators_best,
                        learning_rate=learning_rate_best, subsample=subsample_best,
                        colsample_bytree=colsample_bytree_best, objective='reg:squarederror', random_state=42)
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute the performance metrics
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)
    mae_train = mean_absolute_error(y_train, y_train_pred)
    mae_test = mean_absolute_error(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    # Create the report document
    document = Document()
    document.add_heading('遗传-XGBoost模型报告', 0)

    # Analysis steps
    document.add_heading('分析步骤', level=1)
    steps = [
        "通过训练集数据来建立XGBoost回归模型。",
        "通过建立的XGBoost来计算特征重要性。",
        "将建立的XGBoost回归模型应用到训练、测试数据,得到模型评估结果。",
        "由于XGBoost具有随机性,每次运算的结果不一样,若保存本次训练模型,后续可以直接上传数据代入到本次训练模型进行计算预测。",
        "注:XGBoost无法像传统模型一样得到确定的方程,通常通过测试数据预测精度来对模型进行评价。"
    ]
    for step in steps:
        document.add_paragraph(step)

    # Detailed conclusions
    document.add_heading('详细结论', level=1)

    # Output 1: model parameters
    document.add_heading('输出结果1:模型参数', level=2)
    parameters = {
        '训练用时': f'{elapsed_time:.3f}s',
        '数据切分': f'{DATA_SPLIT_RATIO}',
        '数据洗牌': '是' if DATA_SHUFFLE else '否',
        '交叉验证': '否',
        '基学习器': 'gbtree',
        '基学习器数量': f'{n_estimators_best}',
        '学习率': f'{learning_rate_best}',
        'L1正则项': '0',
        'L2正则项': '1',
        '样本征采样率': f'{subsample_best}',
        '树特征采样率': f'{colsample_bytree_best}',
        '节点特征采样率': '1',
        # min_child_weight is not passed to the model, so XGBoost's default (1) applies
        '叶子节点中样本的最小权重': '1',
        '树的最大深度': f'{max_depth_best}'
    }
    table = document.add_table(rows=1, cols=2)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = '参数名'
    hdr_cells[1].text = '参数值'

    for param, value in parameters.items():
        row_cells = table.add_row().cells
        row_cells[0].text = param
        row_cells[1].text = value

    document.add_paragraph('图表说明:上表展示了模型各项参数配置以及模型训练时长。')

    # NOTE(review): "输出结果2:特征重要性" is intentionally not emitted; the
    # previous draft of that section relied on a `feature_importance` value
    # that is never computed in this function.

    # Output 3: model evaluation results
    document.add_heading('输出结果3:模型评估结果', level=2)
    eval_results = {
        'MSE': [mse_train, mse_test],
        'RMSE': [rmse_train, rmse_test],
        'MAE': [mae_train, mae_test],
        'R²': [r2_train, r2_test]
    }
    eval_df = pd.DataFrame(eval_results, index=['训练集', '测试集'])

    # One extra leading column carries the data-set label of each row;
    # without it the two result rows cannot be told apart.
    table = document.add_table(rows=1, cols=len(eval_results) + 1)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = '数据集'
    for idx, col in enumerate(eval_results, start=1):
        hdr_cells[idx].text = col

    for dataset, row in eval_df.iterrows():
        row_cells = table.add_row().cells
        row_cells[0].text = dataset
        for idx, value in enumerate(row, start=1):
            # Cell text must be a string, whatever numeric type the metric has
            row_cells[idx].text = f'{float(value):.4f}'

    # NOTE(review): this boilerplate also mentions a cross-validation set and
    # MAPE, neither of which appears in the table above.
    document.add_paragraph(
        '图表说明:上表中展示了交叉验证集、训练集和测试集的预测评价指标,通过量化指标来衡量XGBoost的预测效果。其中,通过交叉验证集的评价指标可以不断调整超参数,以得到可靠稳定的模型。\n'
        '● MSE(均方误差): 预测值与实际值之差平方的期望值。取值越小,模型准确度越高。\n'
        '● RMSE(均方根误差):为MSE的平方根,取值越小,模型准确度越高。\n'
        '● MAE(平均绝对误差): 绝对误差的平均值,能反映预测值误差的实际情况。取值越小,模型准确度越高。\n'
        '● MAPE(平均绝对百分比误差): 是 MAE 的变形,它是一个百分比值。取值越小,模型准确度越高。\n'
        '● R²: 将预测值跟只使用均值的情况下相比,结果越靠近 1 模型准确度越高。'
    )

    # Save the report
    file_title, _ = os.path.splitext(os.path.basename(file_path))
    report_path = f'遗传-XGBoost模型报告_{file_title}.docx'
    document.save(report_path)
    print(f"报告已保存为: {report_path}")

if __name__ == "__main__":
    file_path = '样条min-max标准化.xlsx'  # replace with the path to your own data file
    ask_user_for_choices(file_path)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

BenChuat

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值