文章目录
随机搜索在机器学习和深度学习的使用
1.项目简介
该项目是一个回归问题,机器学习中使用的是随机森林回归,深度学习使用的是基础的全连接层神经网络,使用Jupyter Notebook完成,并附有完整代码和数据文件。
2.机器学习案例
2.1导入相关库
# --- Imports for the machine-learning (random forest) example ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # random forest for regression
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV # randomized hyperparameter search
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels (SimHei font) and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
2.2导入数据
# Load the NHANES dataset from a local Excel file.
# NOTE(review): hard-coded absolute Windows path — adjust for your environment.
data = pd.read_excel('E:/Jupyter/Mojin/超参数优化/data/NHANES.xlsx')
print('data shape: {0}'.format(data.shape)) # inspect the (rows, columns) shape
data.tail(10) # preview the last 10 rows
运行结果:
1.可以看到共有28009行数据,10个指标。
2.数据集中的CKD_epi_eGFR变量为因变量,它是连续的数值型变量,其余9个变量为特征变量,包含患者的年龄、性别、肤色、身体质量指数及高密度脂蛋白指数等。
2.3拆分数据集
# Every column except the last is a feature; the last column
# (CKD_epi_eGFR) is the regression target.
features = data.columns[:-1]
print('特征变量:{0}'.format(features))

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
X_all = data[features]
y_all = data['CKD_epi_eGFR']
trainX, testX, trainY, testY = train_test_split(
    X_all, y_all, test_size=0.2, random_state=1234)
2.4随机搜索
# 定义随机森林超参数范围
# Candidate hyperparameter values for the random forest.
n_estimators = [200, 250, 300]
max_depth = [5, 8, 10, 15]
param = dict(n_estimators=n_estimators,
             max_depth=max_depth)
# NOTE(review): the default n_iter=10 samples 10 of the 12 possible
# combinations, and no random_state is set, so repeated runs may
# explore different samples.
grid = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=param)
# Run the cross-validated search on the training data.
grid_result = grid.fit(trainX, trainY)
# Report the best cross-validation score and its parameter combination.
print('Best:%f using %s' % (grid_result.best_score_, grid_result.best_params_))
运行结果:
# Print every sampled parameter combination with its mean and std
# cross-validation score. Iterate the three cv_results_ arrays in
# lockstep with zip() instead of indexing by range(len(...)).
cv_means = grid_result.cv_results_['mean_test_score']
cv_stds = grid_result.cv_results_['std_test_score']
cv_params = grid_result.cv_results_['params']
for mean_score, std_score, params in zip(cv_means, cv_stds, cv_params):
    print('%f (%f) with: %r' % (mean_score, std_score, params))
2.5使用最优参数重新训练模型
# Evaluate the refit best estimator on the held-out test set.
pred = grid_result.best_estimator_.predict(testX)
# Compute both regression metrics before reporting them.
mse = metrics.mean_squared_error(testY, pred)
R2 = metrics.r2_score(testY, pred)
print('测试集MSE: {0}'.format(mse))
print('测试集R方:{0}'.format(R2))
运行结果:
# Visualize which features the fitted forest relies on most.
best_rf = grid_result.best_estimator_
Impt_Series = pd.Series(best_rf.feature_importances_, index=trainX.columns)
# Horizontal bar chart, least important feature at the bottom.
sorted_importance = Impt_Series.sort_values(ascending=True)
sorted_importance.plot(kind='barh')
plt.show()
运行结果:
如上结果,影响模型预测准确率的三个主要因素分别为年龄、某尿液细胞指标和慢性肾脏病所属阶段。
3.深度学习案例
3.1导入相关库
# --- Imports for the deep-learning (fully-connected network) example ---
import pandas as pd
import matplotlib.pyplot as plt
# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import RandomizedSearchCV # randomized hyperparameter search
from sklearn.model_selection import train_test_split # train/test splitting
from sklearn import metrics # evaluation metrics
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
# NOTE(review): keras.wrappers.scikit_learn was removed in recent Keras
# releases; the scikeras package provides the replacement KerasRegressor.
from keras.wrappers.scikit_learn import KerasRegressor # scikit-learn wrapper for Keras regressors
KerasRegressor是将深度学习模型封装后传递给scikit-learn搜索方法(如随机搜索、网格搜索)的接口,具体说明见官方文档。
3.2导入数据
# Load the NHANES dataset from a local Excel file.
# NOTE(review): hard-coded absolute Windows path — adjust for your environment.
data = pd.read_excel('E:/Jupyter/Mojin/超参数优化/data/NHANES.xlsx')
print('data shape: {0}'.format(data.shape)) # inspect the (rows, columns) shape
data.tail(10) # preview the last 10 rows
3.3拆分数据集
# Every column except the last is a feature; the last column
# (CKD_epi_eGFR) is the regression target.
features = data.columns[:-1]
print('特征变量:{0}'.format(features))
# Split 80/20 with a fixed seed. Note the double brackets: the target is
# kept as a 2-D DataFrame of shape (n, 1), matching the single-unit
# output of the Keras model below.
trainX, testX, trainY, testY = train_test_split(data[features], data[['CKD_epi_eGFR']],
                                                test_size=0.2,
                                                random_state=1234)
# Convert the DataFrames to plain NumPy arrays for Keras.
trainX = trainX.values
print('trainX shape:{0}'.format(trainX.shape))
trainY = trainY.values
print('trainY shape: {0}'.format(trainY.shape))
testX = testX.values
testY = testY.values
# Standardize the data (z-score).
"""
对于深度学习,不同指标的范围区别很大时,最好进行数据标准化!
"""
# Statistics are computed on the TRAINING split only, so no test-set
# information leaks into the scaling.
meanX = trainX.mean(axis=0)
stdX = trainX.std(axis=0)
meanY = trainY.mean(axis=0)
stdY = trainY.std(axis=0)
trainX_norm = (trainX - meanX) / stdX
trainY_norm = (trainY - meanY) / stdY
testX_norm = (testX - meanX) / stdX
# NOTE(review): testY_norm is never used later — predictions are
# inverse-transformed and compared against the raw testY instead.
testY_norm = (testY - meanY) / stdY
3.4构造模型
def create_model(nodeNum=32, dropout=0.2, depth=2):
    """Build and compile a fully-connected Keras regression model.

    nodeNum: number of units in each hidden Dense layer.
    dropout: dropout rate applied after the first hidden layer
             (typically 0.2-0.5).
    depth:   total number of Dense layers INCLUDING the output layer;
             the network gets (depth - 1) hidden layers. (The original
             docstring described this as the number of hidden layers,
             which does not match the construction below: first hidden
             layer + (depth - 2) extra hidden layers + 1 output layer.)

    Raises Exception when depth < 2 (need at least one hidden layer
    plus the output layer).
    """
    model = Sequential()
    if depth < 2:
        raise Exception('至少两层结构')
    else:
        # First hidden layer declares the input shape.
        model.add(Dense(units=nodeNum,
                        input_shape=(9,),  # number of feature columns: 9 (trainX.shape[1])
                        activation='relu'))
        model.add(Dropout(rate=dropout))  # regularization against overfitting
        # Remaining depth - 2 hidden layers.
        for i in range(depth - 2):
            model.add(Dense(units=nodeNum,
                            activation='relu'))
        # Single linear output unit for regression.
        model.add(Dense(units=1))
    model.compile(loss='mse',
                  optimizer='rmsprop',
                  metrics=['mse'])
    model.summary()
    return model
# Wrap the builder so scikit-learn search utilities can drive it;
# batch_size is fixed while the other arguments become searchable.
model = KerasRegressor(build_fn=create_model, batch_size=100)
3.5随机搜索
# 定义深度学习超参数范围
# Candidate hyperparameter values for the network (3*3*3 = 27 combinations).
nodeNum = [30, 40, 50]
depth = [3, 4, 5]
epochs = [20, 30, 40]
param = dict(nodeNum=nodeNum,
             depth=depth,
             epochs=epochs)
# Randomized search over the wrapped Keras model
# (default n_iter=10 samples 10 of the 27 combinations).
grid = RandomizedSearchCV(estimator=model, param_distributions=param)
# Fit on the STANDARDIZED training data.
grid_result = grid.fit(trainX_norm, trainY_norm)
# Report the best cross-validation score and its parameter combination.
print('Best:%f using %s' % (grid_result.best_score_, grid_result.best_params_))
运行结果:
3.6使用最优参数重新训练模型
# Predict on the standardized test features, then map the predictions
# back to the original eGFR scale before scoring against raw targets.
pred = grid_result.best_estimator_.predict(testX_norm)
pred = pred * stdY + meanY  # undo the z-score scaling of the target
mse = metrics.mean_squared_error(testY, pred)
R2 = metrics.r2_score(testY, pred)
print('测试集MSE: {0}'.format(mse))
print('测试集R方:{0}'.format(R2))
运行结果: