@浙大疏锦行
Topic: a machine-learning analysis of fuel-cell water and thermal management simulation data
Task: find a structured dataset on Kaggle or elsewhere and complete a brand-new project with the material covered so far, so that you have independently built a project of your own.
Requirements:
1. If the dataset has a public address, provide it; otherwise upload it to a cloud drive and post that link.
2. Differ from other people where possible; prefer a dataset related to your own discipline.
3. Explore which open-data websites are available.
The real research value is concentrated in the following points:
1. Data quality: a good research topic whose data is hard to obtain makes your study valuable.
2. Choice of research question: with the same data, find an interesting angle, e.g. swap the dependent variable and pose a question different from everyone else's.
3. Feature engineering on the same question: does further processing of the data yield new conclusions, i.e. is your engineered feature shown to be meaningful? (See the sketch below.)
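As a toy illustration of point 3, the snippet below derives an interaction feature and checks whether it tracks the target better than either raw column. The column names here are hypothetical placeholders, not columns known to exist in 数据汇总.xlsx.
import pandas as pd
# Hypothetical example: ratio/interaction features can expose relationships
# that a model struggles to learn from the raw columns alone.
toy = pd.DataFrame({
    'flow_rate': [1.0, 2.0, 3.0, 4.0],          # placeholder feature
    'inlet_T':   [300.0, 310.0, 320.0, 330.0],  # placeholder feature
    'MAX_T':     [305.0, 318.0, 333.0, 350.0],  # target used later in this project
})
toy['flow_x_T'] = toy['flow_rate'] * toy['inlet_T']  # derived interaction feature
print(toy.corr()['MAX_T'])  # compare correlations of raw vs derived features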
# First, run the preprocessing code written earlier
import pandas as pd  # data handling and analysis for tabular data
import numpy as np  # numerical computing with efficient array operations
import matplotlib.pyplot as plt  # general-purpose plotting
import seaborn as sns  # statistical plotting built on top of matplotlib
import warnings
warnings.filterwarnings("ignore")
# Configure fonts so CJK characters in the data render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']  # a sans-serif font commonly available on Windows
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
data = pd.read_excel('数据汇总.xlsx')  # load the dataset
data.info()  # inspect dtypes and missing-value counts
# Discrete (categorical) variables
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
# Continuous variables
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()  # convert the selected column names into a list
# Map the binary categories to 0/1
gl_Rfd_mapping = {
'inconsistent': 0,
'consistent': 1
}
data['gl_Rfd'] = data['gl_Rfd'].map(gl_Rfd_mapping)
g_Rfd_mapping = {
'downstream': 0,
'countercurrent': 1
}
data['g_Rfd'] = data['g_Rfd'].map(g_Rfd_mapping)
gl_Rp_mapping = {
'vertical': 0,
'coincidence': 1
}
data['gl_Rp'] = data['gl_Rp'].map(gl_Rp_mapping)
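Series.map returns NaN for any category missing from the mapping dict, so a quick sanity check (not part of the original pipeline) guards against silently imputing away unexpected labels:
# Any category absent from the mapping dicts becomes NaN after .map();
# flag it here rather than letting the imputation step hide it.
for col in ['gl_Rfd', 'g_Rfd', 'gl_Rp']:
    n_unmapped = data[col].isna().sum()
    if n_unmapped:
        print(f"Warning: {col} has {n_unmapped} unmapped values")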
# Impute missing values in the continuous features with the column mode
for feature in continuous_features:
    mode_value = data[feature].mode()[0]  # most frequent value in the column
    data[feature] = data[feature].fillna(mode_value)  # fill missing entries
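Mode imputation is an unusual choice for continuous variables; the median is the more common robust default. If preferred, a drop-in alternative would be:
# Median imputation: a standard robust alternative for continuous features.
for feature in continuous_features:
    data[feature] = data[feature].fillna(data[feature].median())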
# Boxplot of each continuous variable (the 2x4 grid assumes at most 8 of them)
fig, axes1 = plt.subplots(2, 4, figsize=(20, 10))
axes1 = axes1.flatten()
for i, var in enumerate(continuous_features):
    sns.boxplot(x=data[var], ax=axes1[i])
    axes1[i].set_title(f'Boxplot of {var}')
    axes1[i].set_xlabel(var)
plt.tight_layout()
plt.show()
# Histogram of each (now 0/1-encoded) discrete variable
fig, axes2 = plt.subplots(1, 3, figsize=(20, 10))
axes2 = axes2.flatten()
for i, var in enumerate(discrete_features):
    sns.histplot(x=data[var], ax=axes2[i])
    axes2[i].set_title(f'Histogram of {var}')
    axes2[i].set_xlabel(var)
    # Place the tick marks at the two encoded levels
    axes2[i].set_xticks([0, 1])
    axes2[i].set_xticklabels([0, 1])
plt.tight_layout()
plt.show()
# Correlation matrix of the continuous features
correlation_matrix = data[continuous_features].corr()
# Increase the figure resolution
plt.rcParams['figure.dpi'] = 300
# Draw the heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap of Continuous Features')
plt.show()
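Since MAX_T serves as the target below, it is convenient to pull its correlations out of the matrix explicitly (a small extra step, assuming MAX_T is one of the continuous features):
# Rank features by the absolute strength of their linear relationship with MAX_T.
target_corr = correlation_matrix['MAX_T'].drop('MAX_T')
print(target_corr.sort_values(key=abs, ascending=False))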
# Train/test split
from sklearn.model_selection import train_test_split
X = data.drop(['MAX_T'], axis=1)  # features (axis=1 drops a column)
y = data['MAX_T']  # target: maximum temperature
# 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80% train, 20% test
# Support vector regression
from sklearn.svm import SVR
# K-nearest-neighbours regression
from sklearn.neighbors import KNeighborsRegressor
# Linear regression (logistic regression is inherently a classifier; linear regression handles regression tasks)
from sklearn.linear_model import LinearRegression
# XGBoost regression
import xgboost as xgb
# Random forest regression
from sklearn.ensemble import RandomForestRegressor
# CatBoost regression
from catboost import CatBoostRegressor
# Decision tree regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Fit a support vector regression model
svr_model = SVR()
svr_model.fit(X_train, y_train)
y_pred = svr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
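A caveat on this comparison: SVR and KNN are distance-based and sensitive to feature scale, while the tree models are not, so the unscaled scores above put the kernel model at a disadvantage. A fairer variant (a sketch, not part of the original comparison) wraps SVR in a scaling pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardise the features before the kernel computation; RBF-SVR behaves
# poorly when features live on very different scales.
svr_scaled = make_pipeline(StandardScaler(), SVR())
svr_scaled.fit(X_train, y_train)
print("Scaled SVR R²:", r2_score(y_test, svr_scaled.predict(X_test)))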
# K-nearest-neighbours regression model; n_neighbors can be tuned to the data
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
mse1 = mean_squared_error(y_test, y_pred)
rmse1 = np.sqrt(mse1)
mae1 = mean_absolute_error(y_test, y_pred)
r21 = r2_score(y_test, y_pred)
# Linear regression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mse_linear)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)
# XGBoost regression
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
# Random forest regression
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
# CatBoost regression
cat_model = CatBoostRegressor(verbose=0)
cat_model.fit(X_train, y_train)
y_pred_cat = cat_model.predict(X_test)
mse_cat = mean_squared_error(y_test, y_pred_cat)
rmse_cat = np.sqrt(mse_cat)
mae_cat = mean_absolute_error(y_test, y_pred_cat)
r2_cat = r2_score(y_test, y_pred_cat)
# Decision tree regression
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)
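The seven blocks above repeat the same fit/predict/score pattern; an equivalent, more compact formulation (same models, same metrics, collected into one table) is a dictionary-driven loop:
# Equivalent loop over the same seven models, with results gathered in a DataFrame.
models_dict = {
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(n_neighbors=5),
    'Linear regression': LinearRegression(),
    'XGBoost': xgb.XGBRegressor(),
    'Random forest': RandomForestRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0),
    'Decision tree': DecisionTreeRegressor(),
}
results = {}
for name, model in models_dict.items():
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mse_i = mean_squared_error(y_test, pred)
    results[name] = {'MSE': mse_i, 'RMSE': np.sqrt(mse_i),
                     'MAE': mean_absolute_error(y_test, pred),
                     'R2': r2_score(y_test, pred)}
print(pd.DataFrame(results).T)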
# Model names and their corresponding evaluation metrics
models = ['SVR', 'KNN', 'Linear regression', 'XGBoost', 'Random forest', 'CatBoost', 'Decision tree']
mse_values = [mse, mse1, mse_linear, mse_xgb, mse_rf, mse_cat, mse_dt]
rmse_values = [rmse, rmse1, rmse_linear, rmse_xgb, rmse_rf, rmse_cat, rmse_dt]
mae_values = [mae, mae1, mae_linear, mae_xgb, mae_rf, mae_cat, mae_dt]
r2_values = [r2, r21, r2_linear, r2_xgb, r2_rf, r2_cat, r2_dt]
# Model with the lowest MSE
best_mse_index = mse_values.index(min(mse_values))
best_mse_model = models[best_mse_index]
# Model with the lowest RMSE
best_rmse_index = rmse_values.index(min(rmse_values))
best_rmse_model = models[best_rmse_index]
# Model with the lowest MAE
best_mae_index = mae_values.index(min(mae_values))
best_mae_model = models[best_mae_index]
# Model with the highest R²
best_r2_index = r2_values.index(max(r2_values))
best_r2_model = models[best_r2_index]
print("Lowest MSE:", best_mse_model)
print("Lowest RMSE:", best_rmse_model)
print("Lowest MAE:", best_mae_model)
print("Highest R²:", best_r2_model)
# Composite score: weight R² at 0.4 and each error metric at 0.2,
# normalising every error against the worst-performing model
scores = []
for i in range(len(models)):
    score = (0.4 * r2_values[i]
             + 0.2 * (1 - mse_values[i] / max(mse_values))
             + 0.2 * (1 - rmse_values[i] / max(rmse_values))
             + 0.2 * (1 - mae_values[i] / max(mae_values)))
    scores.append(score)
best_overall_index = scores.index(max(scores))
best_overall_model = models[best_overall_index]
print("Best model overall:", best_overall_model)
print("XGBoost 回归模型评估指标:")
print(f"均方误差: {mse_xgb}")
print(f"均方根误差: {rmse_xgb}")
print(f"平均绝对误差: {mae_xgb}")
print(f"决定系数: {r2_xgb}")
# --- 1. XGBoost with default parameters ---
# This is the baseline model, so no validation set is needed here
print("--- 1. Default XGBoost (train -> test) ---")
import time  # for timing the fit
start_time = time.time()
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
end_time = time.time()
print(f"Training time: {end_time - start_time:.4f} s")
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)  # recompute R² for this seeded baseline
print(f"MSE: {mse_xgb}")
print(f"RMSE: {rmse_xgb}")
print(f"MAE: {mae_xgb}")
print(f"R²: {r2_xgb}")
from deap import base, creator, tools, algorithms  # evolutionary-computation library for Python
import random
# --- 2. GA-tuned XGBoost ---
print("\n--- 2. Genetic-algorithm-tuned XGBoost (train -> test) ---")
# Define the fitness (to be minimised) and the individual type
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
# Hyperparameter search ranges
param_ranges = {
'learning_rate': (0.01, 0.3),
'n_estimators': (50, 300),
'max_depth': (3, 10)
}
# Initialise the toolbox
toolbox = base.Toolbox()
# Gene (attribute) generators
toolbox.register("attr_learning_rate", random.uniform, *param_ranges['learning_rate'])
toolbox.register("attr_n_estimators", random.randint, *param_ranges['n_estimators'])
toolbox.register("attr_max_depth", random.randint, *param_ranges['max_depth'])
# Individual and population generators
toolbox.register("individual", tools.initCycle, creator.Individual,
                 (toolbox.attr_learning_rate, toolbox.attr_n_estimators,
                  toolbox.attr_max_depth), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# Fitness function. Note: normalising a metric against its own maximum over a
# single model (e.g. 1 - mse/max([mse])) is always zero, so a weighted
# composite of MSE/RMSE/MAE built that way contributes nothing; the fitness
# therefore maximises R² directly.
def evaluate(individual):
    learning_rate, n_estimators, max_depth = individual
    model = xgb.XGBRegressor(
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    # FitnessMin minimises, so return the negated R²
    return (-r2,)
# Register the evaluation function
toolbox.register("evaluate", evaluate)
# Register the crossover and selection operators
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("select", tools.selTournament, tournsize=3)
# tools.mutUniformInt only handles integer genes, so the float-valued learning
# rate needs a custom mutation operator: Gaussian mutation for the learning
# rate, uniform integer mutation for the two integer genes
def custom_mutate(individual):
    # learning_rate: Gaussian perturbation, clipped to its allowed range
    individual[0] += random.gauss(0, 0.05)
    individual[0] = max(param_ranges['learning_rate'][0],
                        min(individual[0], param_ranges['learning_rate'][1]))
    # n_estimators and max_depth: uniform integer mutation
    if random.random() < 0.2:  # per-gene mutation probability (indpb)
        individual[1] = random.randint(*param_ranges['n_estimators'])
    if random.random() < 0.2:  # indpb
        individual[2] = random.randint(*param_ranges['max_depth'])
    return individual,
toolbox.register("mutate", custom_mutate)
# Create the initial population
pop = toolbox.population(n=50)  # population size 50
# Genetic-algorithm parameters
NGEN = 40    # number of generations
CXPB = 0.5   # crossover probability
MUTPB = 0.2  # mutation probability
# Run the genetic algorithm
start_time = time.time()
pop, logbook = algorithms.eaSimple(pop, toolbox, cxpb=CXPB, mutpb=MUTPB,
                                   ngen=NGEN, verbose=False)
end_time = time.time()
print(f"GA search time: {end_time - start_time:.2f} s")
# Select the best individual from the final population
best_ind = tools.selBest(pop, k=1)[0]
best_learning_rate, best_n_estimators, best_max_depth = best_ind
print(f"最优超参数: 学习率 = {best_learning_rate}, 估计器数量 = {best_n_estimators}, 最大深度 = {best_max_depth}")
# Retrain the final XGBoost model with the best hyperparameters
best_model = xgb.XGBRegressor(
    learning_rate=best_learning_rate,
    n_estimators=int(best_n_estimators),
    max_depth=int(best_max_depth),
    random_state=42
)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)
# Evaluate the tuned model on the test set
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = np.sqrt(mse_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
print("\n最优模型评估指标:")
print(f"均方误差 (MSE): {mse_best}")
print(f"均方根误差 (RMSE): {rmse_best}")
print(f"平均绝对误差 (MAE): {mae_best}")
print(f"决定系数 (R²): {r2_best}")
import shap
# Build a SHAP explainer for the tuned model
explainer = shap.Explainer(best_model)
shap_values = explainer(X_test)
# Global feature importance (beeswarm summary plot)
shap.summary_plot(shap_values, X_test)
plt.show()
# Force plot for a single sample
shap.force_plot(explainer.expected_value, shap_values[0].values, X_test.iloc[0], matplotlib=True)
plt.show()
# Dependence plot for a single feature (the first feature, as an example)
shap.dependence_plot(0, shap_values.values, X_test)
plt.show()
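If a single ranked importance chart is needed for a report, the mean absolute SHAP value per feature provides one (shap.plots.bar is available in recent shap releases):
# Mean |SHAP| per feature: a compact global importance ranking.
shap.plots.bar(shap_values)
plt.show()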