This script uses a genetic algorithm to optimize the hyperparameters of an XGBoost regression model and generates a detailed report. The user supplies the path to an Excel file containing the data. The script loads the data, preprocesses it (missing-value removal and optional PCA dimensionality reduction), then sets up the appropriate evaluation metric for the task type the user selects (classification or regression). A genetic algorithm then searches over the XGBoost hyperparameters, including the maximum tree depth, the number of trees, and the learning rate. After training and evaluation, the script produces a Word report recording the model's parameter settings, training time, and evaluation results on both the training and test sets.
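For orientation, here is a minimal sketch of the input the script expects. The column names and values are purely illustrative (no such file ships with the script), and writing .xlsx files assumes an Excel engine such as openpyxl is installed:

import pandas as pd

# Hypothetical input layout: numeric feature columns plus one target column.
demo = pd.DataFrame({
    'feature_1': [0.12, 0.48, 0.33, 0.91],
    'feature_2': [1.5, 2.1, 1.8, 2.4],
    'target': [10.2, 14.7, 12.9, 16.1],  # the dependent-variable column named at the prompt
})
demo.to_excel('demo_data.xlsx', index=False)  # a file like this can then be fed to the script

The full script follows.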
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import numpy as np
import xgboost as xgb
from docx import Document
import matplotlib.pyplot as plt
import time
import os
# Configure a Chinese font and proper minus-sign rendering for matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
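# Note: SimHei is a common Chinese font on Windows; on Linux/macOS you may need to
# substitute an installed CJK font (e.g. 'Noto Sans CJK SC') for labels to render.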
# Global settings, kept here for easy tuning and debugging
DATA_SPLIT_RATIO = 0.2  # fraction of the data held out as the test set
DATA_SHUFFLE = True  # whether to shuffle before splitting
CROSS_VALIDATION = True  # whether to perform cross-validation (declared but not used in this version)
NGEN = 15  # number of GA generations
POP_SIZE = 50  # GA population size
MUTPB = 0.2  # mutation probability
CXPB = 0.5  # crossover probability
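# Rough cost estimate: eaSimple evaluates at most POP_SIZE individuals per generation,
# so the GA performs up to POP_SIZE * (NGEN + 1) = 800 XGBoost fits with these settings;
# shrink POP_SIZE or NGEN first if runs are too slow.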
def load_data(file_path):
    # Read the Excel file (replace with your own data file)
    data = pd.read_excel(file_path)
    data = data.dropna()  # drop rows with missing values
return data
def pca_analysis(X):
    # Standardize the features to zero mean and unit variance
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Principal component analysis (PCA); keep as many components as the data allows
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
explained_variance = pca.explained_variance_
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)
    # Print the total-variance-explained table
print("总方差解释表格:")
print("成分\t特征根\t方差解释率(%)\t累积方差解释率(%)")
for i in range(len(explained_variance)):
print(f"{i + 1}\t{explained_variance[i]:.3f}\t{explained_variance_ratio[i] * 100:.3f}\t{cumulative_variance[i] * 100:.3f}")
return X_scaled, pca
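# A common rule of thumb (not enforced by this script): keep the first k components
# whose cumulative explained-variance ratio reaches roughly 85-95%; the table printed
# above is meant to guide that choice.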
def ask_user_for_choices(file_path):
print("\n请查看Excel数据的前几行:")
data = load_data(file_path)
print(data.head())
preprocess = input("数据是否已完成预处理(数据清洗和标准化)?(是/否): ").strip().lower()
if preprocess != '是':
print("请先进行数据预处理。")
return
task_type = input("请选择任务类型(分类/回归): ").strip().lower()
target_column = input("请指定Excel中的因变量列名: ").strip()
X = data.drop(columns=[target_column])
y = data[target_column]
if task_type == '分类':
if pd.api.types.is_numeric_dtype(y) and len(y.unique()) > 2:
print("分类任务需要目标变量为类别标签。")
return
else:
            metrics_function = accuracy_score  # note: the GA fitness below still optimizes regression MSE
elif task_type == '回归':
if not pd.api.types.is_numeric_dtype(y):
print("回归任务需要目标变量为连续数值。")
return
else:
metrics_function = mean_squared_error
else:
print("任务类型无效。")
return
pca_replace = input("是否使用主成分分析的特征代替数据?(是/否): ").strip().lower()
if pca_replace == '是':
X_scaled, pca = pca_analysis(X)
num_components = int(input("请选择前几个主成分作为特征: "))
X = X_scaled[:, :num_components]
else:
        X = X.values  # keep the raw features as a NumPy array
    # Genetic-algorithm and XGBoost model setup
    # Remove classes left over from a previous run so creator.create() does not complain
if hasattr(creator, "FitnessMax"):
del creator.FitnessMax
if hasattr(creator, "Individual"):
del creator.Individual
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register("attr_int_max_depth", np.random.randint, 3, 15)
toolbox.register("attr_int_n_estimators", np.random.randint, 50, 500)
toolbox.register("attr_float_learning_rate", np.random.uniform, 0.01, 0.3)
toolbox.register("attr_float_subsample", np.random.uniform, 0.5, 1.0)
toolbox.register("attr_float_colsample_bytree", np.random.uniform, 0.5, 1.0)
toolbox.register("individual", tools.initCycle, creator.Individual,
(toolbox.attr_int_max_depth, toolbox.attr_int_n_estimators, toolbox.attr_float_learning_rate,
toolbox.attr_float_subsample, toolbox.attr_float_colsample_bytree), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutPolynomialBounded, low=[3, 50, 0.01, 0.5, 0.5],
up=[15, 500, 0.3, 1.0, 1.0], eta=1.0, indpb=MUTPB)
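    # The low/up bounds above mirror the attribute-generation ranges, so mutated genes
    # stay inside the search space; a larger eta keeps offspring closer to the parent value.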
toolbox.register("select", tools.selTournament, tournsize=3)
def evaluate(individual):
max_depth, n_estimators, learning_rate, subsample, colsample_bytree = individual
        # Clamp max_depth to the valid search range [1, 15]
        max_depth = int(np.clip(max_depth, 1, 15))
        # Clamp learning_rate, subsample and colsample_bytree to their search ranges
        learning_rate = np.clip(learning_rate, 0.01, 0.3)
        subsample = np.clip(subsample, 0.5, 1.0)
        colsample_bytree = np.clip(colsample_bytree, 0.5, 1.0)
model = xgb.XGBRegressor(max_depth=max_depth, n_estimators=int(n_estimators),
learning_rate=learning_rate, subsample=subsample,
colsample_bytree=colsample_bytree, objective='reg:squarederror', random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
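        # The split is deterministic (random_state=42), so every individual is scored
        # on the same train/test partition.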
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
        return (1 / (1 + mse),)  # fitness in (0, 1]; maximizing it minimizes MSE
toolbox.register("evaluate", evaluate)
    # Run the genetic algorithm
start_time = time.time()
population = toolbox.population(n=POP_SIZE)
algorithms.eaSimple(population, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=NGEN,
verbose=True, stats=None, halloffame=None)
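    # eaSimple evolves `population` in place, so the best individual can be selected
    # from it after the run; stats and halloffame are intentionally disabled here.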
end_time = time.time()
elapsed_time = end_time - start_time
    # Best hyperparameters found
best_ind = tools.selBest(population, 1)[0]
max_depth_best, n_estimators_best, learning_rate_best, subsample_best, colsample_bytree_best = best_ind
print(f"最优max_depth值:{int(max_depth_best)}")
print(f"最优n_estimators值:{int(n_estimators_best)}")
print(f"最优learning_rate值:{learning_rate_best}")
print(f"最优subsample值:{subsample_best}")
print(f"最优colsample_bytree值:{colsample_bytree_best}")
    # Generate the Word report
generate_report(xgb.XGBRegressor, X, y, best_ind, int(max_depth_best), int(n_estimators_best),
learning_rate_best, subsample_best, colsample_bytree_best, task_type, elapsed_time, file_path)
def generate_report(model_class, X, y, best_ind, max_depth_best, n_estimators_best,
learning_rate_best, subsample_best, colsample_bytree_best,
task_type, elapsed_time, file_path):
    # Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42
)
    # Train the final model with the best hyperparameters
model = model_class(max_depth=max_depth_best, n_estimators=n_estimators_best,
learning_rate=learning_rate_best, subsample=subsample_best,
colsample_bytree=colsample_bytree_best, objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
    # Compute performance metrics
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_train = np.sqrt(mse_train)
rmse_test = np.sqrt(mse_test)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
    # Create the report document
document = Document()
document.add_heading('遗传-XGBoost模型报告', 0)
    # Analysis-steps section
document.add_heading('分析步骤', level=1)
steps = [
"通过训练集数据来建立XGBoost回归模型。",
"通过建立的XGBoost来计算特征重要性。",
"将建立的XGBoost回归模型应用到训练、测试数据,得到模型评估结果。",
"由于XGBoost具有随机性,每次运算的结果不一样,若保存本次训练模型,后续可以直接上传数据代入到本次训练模型进行计算预测。",
"注:XGBoost无法像传统模型一样得到确定的方程,通常通过测试数据预测精度来对模型进行评价。"
]
for step in steps:
document.add_paragraph(step)
    # Detailed-conclusions section
document.add_heading('详细结论', level=1)
    # Output 1: model parameters
document.add_heading('输出结果1:模型参数', level=2)
parameters = {
'训练用时': f'{elapsed_time:.3f}s',
'数据切分': f'{DATA_SPLIT_RATIO}',
'数据洗牌': '是' if DATA_SHUFFLE else '否',
'交叉验证': '否',
'基学习器': 'gbtree',
'基学习器数量': f'{n_estimators_best}',
'学习率': f'{learning_rate_best}',
'L1正则项': '0',
'L2正则项': '1',
        '样本采样率': f'{subsample_best}',
'树特征采样率': f'{colsample_bytree_best}',
'节点特征采样率': '1',
        '叶子节点中样本的最小权重': '1',
'树的最大深度': f'{max_depth_best}'
}
table = document.add_table(rows=1, cols=2)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = '参数名'
hdr_cells[1].text = '参数值'
for param, value in parameters.items():
row_cells = table.add_row().cells
row_cells[0].text = param
row_cells[1].text = value
document.add_paragraph('图表说明:上表展示了模型各项参数配置以及模型训练时长。')
""" # 输出结果2:特征重要性
document.add_heading('输出结果2:特征重要性', level=2)
importance_df = pd.DataFrame(feature_importance, columns=['特征名称', '特征重要性'])
importance_df['特征重要性'] = importance_df['特征重要性'] * 100 # 转换为百分比
table = document.add_table(rows=1, cols=2)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = '特征名称'
hdr_cells[1].text = '特征重要性'
for _, row in importance_df.iterrows():
row_cells = table.add_row().cells
row_cells[0].text = row['特征名称']
row_cells[1].text = f'{row["特征重要性"]:.2f}%'
document.add_paragraph('图表说明:上柱形图或表格展示了各特征(自变量)的重要性比例。') """
    # Output 3: model evaluation results
document.add_heading('输出结果3:模型评估结果', level=2)
eval_results = {
'MSE': [mse_train, mse_test],
'RMSE': [rmse_train, rmse_test],
'MAE': [mae_train, mae_test],
'R²': [r2_train, r2_test]
}
eval_df = pd.DataFrame(eval_results, index=['训练集', '测试集'])
    # Evaluation table with a leading column labelling the dataset (训练集/测试集)
    table = document.add_table(rows=1, cols=len(eval_results) + 1)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = '数据集'
    for idx, col in enumerate(eval_results.keys()):
        hdr_cells[idx + 1].text = col
    for label, row in eval_df.iterrows():
        row_cells = table.add_row().cells
        row_cells[0].text = str(label)
        for idx, value in enumerate(row):
            row_cells[idx + 1].text = f'{value:.4f}'
    document.add_paragraph(
        '图表说明:上表展示了训练集和测试集的预测评价指标,通过量化指标来衡量XGBoost的预测效果。\n'
        '● MSE(均方误差):预测值与实际值之差平方的期望值。取值越小,模型准确度越高。\n'
        '● RMSE(均方根误差):为MSE的平方根,取值越小,模型准确度越高。\n'
        '● MAE(平均绝对误差):绝对误差的平均值,能反映预测值误差的实际情况。取值越小,模型准确度越高。\n'
        '● R²:将预测结果与仅使用因变量均值的基准相比,越接近1,模型准确度越高。'
    )
    # Save the report
file_title, _ = os.path.splitext(os.path.basename(file_path))
report_path = f'遗传-XGBoost模型报告_{file_title}.docx'
document.save(report_path)
print(f"报告已保存为: {report_path}")
if __name__ == "__main__":
    file_path = '样条min-max标准化.xlsx'  # replace with the path to your data file
ask_user_for_choices(file_path)
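As a usage sketch (script name hypothetical): save this file as, say, ga_xgboost_report.py next to the Excel data, run python ga_xgboost_report.py, and answer the interactive prompts (preprocessing status, task type, target column, PCA choice); the .docx report is then written to the working directory.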