文章目录
随机搜索在机器学习和深度学习的使用
1.项目简介
该项目是一个回归问题,机器学习中使用的是随机森林回归,深度学习使用的是基础的全连接层神经网络,使用Jupyter Notebook完成,并附有完整代码和数据文件。
2.机器学习案例
2.1导入相关库
# --- Imports for the machine-learning (random forest) example ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # random forest for regression
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV # randomized hyperparameter search
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels (SimHei font) and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
2.2导入数据
# Load the NHANES dataset from a local Excel file.
# NOTE(review): hard-coded absolute Windows path — adjust for your environment.
data = pd.read_excel('E:/Jupyter/Mojin/超参数优化/data/NHANES.xlsx')
print('data shape: {0}'.format(data.shape)) # inspect the (rows, columns) shape
data.tail(10) # preview the last 10 rows
运行结果:
1.可以看到共有28009行数据,10个指标。
2.数据集中的CKD_epi_eGFR变量为因变量,它是连续的数值型变量,其余9个变量为特征变量,包含患者的年龄、性别、肤色、身体质量指数及高密度脂蛋白指数等。
2.3拆分数据集
# Every column except the last is a feature; the last column
# (CKD_epi_eGFR) is the regression target.
features = data.columns[:-1]
print('特征变量:{0}'.format(features))

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_all = data[features]
y_all = data['CKD_epi_eGFR']
trainX, testX, trainY, testY = train_test_split(
    X_all, y_all, test_size=0.2, random_state=1234)
2.4随机搜索
# 定义随机森林超参数范围
# Candidate hyperparameter values for the random forest.
n_estimators = [200, 250, 300]
max_depth = [5, 8, 10, 15]
param = dict(n_estimators=n_estimators,
             max_depth=max_depth)
# NOTE(review): the default n_iter=10 samples 10 of the 12 possible
# combinations, and no random_state is set, so repeated runs may
# explore different samples.
grid = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=param)
# Run the cross-validated search on the training data.
grid_result = grid.fit(trainX, trainY)
# Report the best cross-validation score and its parameter combination.
print('Best:%f using %s' % (grid_result.best_score_, grid_result.best_params_))
运行结果:
# Print every sampled parameter combination with its mean and std
# cross-validation score. Iterate the three cv_results_ arrays in
# lockstep with zip() instead of indexing by range(len(...)).
cv_means = grid_result.cv_results_['mean_test_score']
cv_stds = grid_result.cv_results_['std_test_score']
cv_params = grid_result.cv_results_['params']
for mean_score, std_score, params in zip(cv_means, cv_stds, cv_params):
    print('%f (%f) with: %r' % (mean_score, std_score, params))
2.5使用最优参数重新训练模型
# Evaluate the refit best estimator on the held-out test set.
pred = grid_result.best_estimator_.predict(testX)
# Compute both regression metrics before reporting them.
mse = metrics.mean_squared_error(testY, pred)
R2 = metrics.r2_score(testY, pred)
print('测试集MSE: {0}'.format(mse))
print('测试集R方:{0}'.format(R2))
运行结果:
# Visualize which features the fitted forest relies on most.
best_rf = grid_result.best_estimator_
Impt_Series = pd.Series(best_rf.feature_importances_, index=trainX.columns)
# Horizontal bar chart, least important feature at the bottom.
sorted_importance = Impt_Series.sort_values(ascending=True)
sorted_importance.plot(kind='barh')
plt.show()
运行结果:
如上结果,影响模型预测准确率的三个主要因素分别为年龄、某尿液细胞指标和慢性肾脏病所属阶段。
3.深度学习案例
3.1导入相关库
# --- Imports for the deep-learning (fully-connected network) example ---
import pandas as pd
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import RandomizedSearchCV # randomized hyperparameter search
from sklearn.model_selection import train_test_split # train/test splitting
from sklearn import metrics # evaluation metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# NOTE(review): keras.wrappers.scikit_learn was removed in recent Keras
# releases; the scikeras package provides the replacement KerasRegressor.
from keras.wrappers.scikit_learn import KerasRegressor # scikit-learn wrapper for Keras regressors
KerasRegressor是将深度学习模型封装后传递给scikit-learn搜索方法(如随机搜索、网格搜索)的接口,具体说明见官方文档。
3.2导入数据
# Load the NHANES dataset from a local Excel file.
# NOTE(review): hard-coded absolute Windows path — adjust for your environment.
data = pd.read_excel('E:/Jupyter/Mojin/超参数优化/data/NHANES.xlsx')
print('data shape: {0}'.format(data.shape)) # inspect the (rows, columns) shape
data.tail(10) # preview the last 10 rows
3.3拆分数据集
# Every column except the last is a feature; the last column
# (CKD_epi_eGFR) is the regression target.
features = data.columns[:-1]
print('特征变量:{0}'.format(features))
# Split 80/20 with a fixed seed. Note the double brackets: the target is
# kept as a 2-D DataFrame of shape (n, 1), matching the single-unit
# output of the Keras model below.
trainX, testX, trainY, testY = train_test_split(data[features], data[['CKD_epi_eGFR']],
                                                test_size=0.2,
                                                random_state=1234)
# Convert the DataFrames to plain NumPy arrays for Keras.
trainX = trainX.values
print('trainX shape:{0}'.format(trainX.shape))
trainY = trainY.values
print('trainY shape: {0}'.format(trainY.shape))
testX = testX.values
testY = testY.values
# Standardize the data (z-score).
"""
对于深度学习,不同指标的范围区别很大时,最好进行数据标准化!
"""
# Statistics are computed on the TRAINING split only, so no test-set
# information leaks into the scaling.
meanX = trainX.mean(axis=0)
stdX = trainX.std(axis=0)
meanY = trainY.mean(axis=0)
stdY = trainY.std(axis=0)
trainX_norm = (trainX - meanX) / stdX
trainY_norm = (trainY - meanY) / stdY
testX_norm = (testX - meanX) / stdX
# NOTE(review): testY_norm is never used later — predictions are
# inverse-transformed and compared against the raw testY instead.
testY_norm = (testY - meanY) / stdY
3.4构造模型
def create_model(nodeNum=32, dropout=0.2, depth=2):
    """Build and compile a fully-connected Keras regression model.

    nodeNum: number of units in each hidden Dense layer.
    dropout: dropout rate applied after the first hidden layer
             (typically 0.2-0.5).
    depth:   total number of Dense layers INCLUDING the output layer;
             the network gets (depth - 1) hidden layers. (The original
             docstring described this as the number of hidden layers,
             which does not match the construction below: first hidden
             layer + (depth - 2) extra hidden layers + 1 output layer.)

    Raises Exception when depth < 2 (need at least one hidden layer
    plus the output layer).
    """
    model = Sequential()
    if depth < 2:
        raise Exception('至少两层结构')
    else:
        # First hidden layer declares the input shape.
        model.add(Dense(units=nodeNum,
                        input_shape=(9,),  # number of feature columns: 9 (trainX.shape[1])
                        activation='relu'))
        model.add(Dropout(rate=dropout))  # regularization against overfitting
        # Remaining depth - 2 hidden layers.
        for i in range(depth - 2):
            model.add(Dense(units=nodeNum,
                            activation='relu'))
        # Single linear output unit for regression.
        model.add(Dense(units=1))
    model.compile(loss='mse',
                  optimizer='rmsprop',
                  metrics=['mse'])
    model.summary()
    return model
# Wrap the builder so scikit-learn search utilities can drive it;
# batch_size is fixed while the other arguments become searchable.
model = KerasRegressor(build_fn=create_model, batch_size=100)
3.5随机搜索
# 定义深度学习超参数范围
# Candidate hyperparameter values for the network (3*3*3 = 27 combinations).
nodeNum = [30, 40, 50]
depth = [3, 4, 5]
epochs = [20, 30, 40]
param = dict(nodeNum=nodeNum,
             depth=depth,
             epochs=epochs)
# Randomized search over the wrapped Keras model
# (default n_iter=10 samples 10 of the 27 combinations).
grid = RandomizedSearchCV(estimator=model, param_distributions=param)
# Fit on the STANDARDIZED training data.
grid_result = grid.fit(trainX_norm, trainY_norm)
# Report the best cross-validation score and its parameter combination.
print('Best:%f using %s' % (grid_result.best_score_, grid_result.best_params_))
运行结果:
3.6使用最优参数重新训练模型
# Predict on the standardized test features, then map the predictions
# back to the original eGFR scale before scoring against raw targets.
pred = grid_result.best_estimator_.predict(testX_norm)
pred = pred * stdY + meanY  # undo the z-score scaling of the target
mse = metrics.mean_squared_error(testY, pred)
R2 = metrics.r2_score(testY, pred)
print('测试集MSE: {0}'.format(mse))
print('测试集R方:{0}'.format(R2))
运行结果: