DataWhale AI Summer Camp - Materials Science Track Notes

A brief record of the learning process.

1. K-fold cross-validation + random forest regression

# K-fold cross-validation + random forest regression
# One run takes about 11 minutes, which is slow: sklearn models train on the CPU by default and cannot use the GPU
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Hyperparameters: tune these via K-fold cross-validation to find the best values for this data split
n_estimators = 100  # number of trees: more trees usually fit better, but can overfit and raise the training cost
max_depth = 80  # maximum tree depth: too small underfits, too large tends to overfit
min_samples_split = 3  # minimum number of samples required to split an internal node
min_samples_leaf = 2  # minimum number of samples required at a leaf node
fold = 5  # 5-fold: split the data into 5 parts, 4 for training and 1 for validation; 3, 5, or 10 folds are typical
shuffle = True  # shuffle before splitting
random_state = 42  # fix the seed so the splits are reproducible
kf = KFold(n_splits=fold, shuffle=shuffle, random_state=random_state)
models = []  # keep the K models so the best one can be saved later
all_train_r2_score = []
all_valid_r2_score = []

for train_idx, valid_idx in kf.split(train_x):  # split the data and evaluate the model on each split
    # train_x here is a numpy array with shape=(23538, 8096)
    print(f'train_idx={train_idx} | valid_idx={valid_idx}')
    x_train, x_valid = train_x[train_idx], train_x[valid_idx]
    y_train, y_valid = train_y[train_idx], train_y[valid_idx]

    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf,
                                  n_jobs=-1,   # use all CPU cores
                                  verbose=1)  # print training progress
    model.fit(x_train, y_train)  # train the model
    models.append(model)

    y_train_pred = model.predict(x_train)
    train_r2_score = r2_score(y_train, y_train_pred)  # use R2 to measure how well the model fits this data
    y_valid_pred = model.predict(x_valid)
    valid_r2_score = r2_score(y_valid, y_valid_pred)

    all_train_r2_score.append(train_r2_score)
    all_valid_r2_score.append(valid_r2_score)
    print(f'train_r2_score={train_r2_score:.4f} | valid_r2_score={valid_r2_score:.4f}')
mean_train_r2_score = sum(all_train_r2_score) / fold
mean_valid_r2_score = sum(all_valid_r2_score) / fold
print(f'mean_train_r2_score={mean_train_r2_score:.4f} | mean_valid_r2_score={mean_valid_r2_score:.4f}')

Run results (output screenshot omitted)
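The `models` list keeps all K fold models, but the code above never actually saves one. A minimal sketch of picking the fold with the highest validation R2 and pickling it, assuming the lists filled in above; the file name `best_rf_model.pkl` is just an example:

import pickle

best_k = max(range(fold), key=lambda i: all_valid_r2_score[i])  # fold with the best validation R2
print(f'best fold: {best_k} | valid r2: {all_valid_r2_score[best_k]:.4f}')

with open('best_rf_model.pkl', 'wb') as file:  # hypothetical output path
    pickle.dump(models[best_k], file)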

2. K-fold cross-validation + random forest regression + grid search

# Grid search is really just brute-force enumeration; in practice it does not seem very effective here
'''
3 candidate values for n_estimators
3 candidate values for max_depth
KFold for the inner cross-validation, n_splits=3
If an outer KFold cross-validation with n_splits=5 is added on top,
that makes 5*3*3*3 = 135 random forest fits.
A single forest with n_estimators=100 already takes 11 min, and larger n_estimators
takes even longer, so 135 runs is not worth the time.

cuML has no grid search, but it can be brute-forced with nested for loops, e.g.
for n_es in param_grid['n_estimators']:
    for m_dep in param_grid['max_depth']:
        ...
(a fuller sketch follows the GridSearchCV code below)
'''


from sklearn.model_selection import GridSearchCV

base_model = RandomForestRegressor(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   n_jobs=-1,
                                   verbose=1)
# set up the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [60, 80, 100]
    # other parameters can be added here
}
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=KFold(n_splits=3),  # K-fold cross-validation (regression); for classification use StratifiedKFold
    scoring='r2',  # many metrics are available; with neg_mean_squared_error, closer to 0 is better
    verbose=2,
    return_train_score=True,
    n_jobs=-1
)
grid_search.fit(train_x, train_y)  # run the search (the expensive part)
print(f'best params: {grid_search.best_params_} | best CV r2: {grid_search.best_score_:.4f}')
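Since cuML lacks GridSearchCV, here is a minimal sketch of the manual nested-loop search mentioned above. It assumes the numpy `train_x`/`train_y` from section 1 and uses sklearn's RandomForestRegressor; swapping in cuml.ensemble.RandomForestRegressor works the same way:

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [60, 80, 100]}
inner_kf = KFold(n_splits=3, shuffle=True, random_state=42)
best_score, best_params = -float('inf'), None

for n_es in param_grid['n_estimators']:
    for m_dep in param_grid['max_depth']:
        scores = []
        for tr_idx, va_idx in inner_kf.split(train_x):  # inner 3-fold CV for this parameter pair
            model = RandomForestRegressor(n_estimators=n_es, max_depth=m_dep, n_jobs=-1)
            model.fit(train_x[tr_idx], train_y[tr_idx])
            scores.append(r2_score(train_y[va_idx], model.predict(train_x[va_idx])))
        mean_score = sum(scores) / len(scores)
        print(f'n_estimators={n_es} | max_depth={m_dep} | mean r2={mean_score:.4f}')
        if mean_score > best_score:
            best_score, best_params = mean_score, {'n_estimators': n_es, 'max_depth': m_dep}

print(f'best params: {best_params} | best mean r2: {best_score:.4f}')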

3. K-fold cross-validation + cuML (GPU acceleration for machine learning)

# Very fast: each fold takes only about 0.7 min, and the trained model performs the same as on CPU, just far more efficiently
# Note that cuML does not support every machine learning model
import time
import pickle
import numpy as np
import pandas as pd
import cudf
from cuml.ensemble import RandomForestRegressor
# from cuml.preprocessing.model_selection import train_test_split  # not needed: the K-fold CV below handles the splitting
from sklearn.utils import resample
from sklearn.model_selection import KFold  # likewise, cuML has no grid search, but it can be implemented by hand
from sklearn.metrics import r2_score
# from sklearn.ensemble import BaggingRegressor  # cuML has no bagging ensemble, but one can be built by hand with resample (see the sketch after this section's code)


start_time = time.time()  # start timing


kf = KFold(n_splits=fold, shuffle=shuffle, random_state=random_state)  # outer K-fold cross-validation

# Note: the model is constructed fresh inside each fold below. Constructing it once up
# here and refitting it would leave every entry of `models` pointing at the same
# (last-fitted) estimator object.

train_x = train_x.astype(np.float32)  # cudf expects float32 or float64
train_y = train_y.astype(np.float32)
X_cudf = cudf.DataFrame.from_pandas(pd.DataFrame(train_x))  # takes a pandas DataFrame; if the training set is already a DataFrame, pass it in directly
y_cudf = cudf.Series(train_y)

all_train_r2_score = []
all_valid_r2_score = []
best_model_number = 0
best_valid_r2_score = -float('inf')  # track the best validation score seen so far
models = []  # keep the k models
for k, (train_idx, valid_idx) in enumerate(kf.split(X_cudf)):
    print(f'fold {k} | train_idx={train_idx} | valid_idx={valid_idx}')
    x_train, x_valid = X_cudf.iloc[train_idx], X_cudf.iloc[valid_idx]  # iloc works like pandas, selecting rows by position
    y_train, y_valid = y_cudf.iloc[train_idx], y_cudf.iloc[valid_idx]  # running iloc on the GPU gives much faster data access

    x_train = x_train.to_cupy()  # convert the cudf DataFrame/Series to CuPy arrays; CuPy is a NumPy-compatible GPU array library
    y_train = y_train.to_cupy()
    x_valid = x_valid.to_cupy()
    y_valid = y_valid.to_cupy()

    base_model = RandomForestRegressor(n_estimators=n_estimators,
                                       max_depth=max_depth,
                                       min_samples_split=min_samples_split,
                                       min_samples_leaf=min_samples_leaf,
                                       # n_jobs=-1 does not apply here: it sets CPU parallelism, and cuML runs on the GPU
                                       verbose=1)
    base_model.fit(x_train, y_train)  # train
    models.append(base_model)
    
    y_train_pred = base_model.predict(x_train)
    y_train_pred = y_train_pred.get()  # move the GPU (CuPy) array back to a CPU NumPy array for the steps below
    y_train = y_train.get()
    train_r2_score = r2_score(y_train, y_train_pred)
    
    y_valid_pred = base_model.predict(x_valid)
    y_valid_pred = y_valid_pred.get()
    y_valid = y_valid.get()
    valid_r2_score = r2_score(y_valid, y_valid_pred)
    
    if valid_r2_score > best_valid_r2_score:  # keep whichever fold does best on validation; adapt this rule as needed
        best_model_number = k
        best_valid_r2_score = valid_r2_score
        
    all_train_r2_score.append(train_r2_score)
    all_valid_r2_score.append(valid_r2_score)
    print(f'fold {k} | train_r2_score={train_r2_score:.4f} | valid_r2_score={valid_r2_score:.4f}')
    
    
mean_train_r2_score = sum(all_train_r2_score) / fold
mean_valid_r2_score = sum(all_valid_r2_score) / fold
print(f'mean_train_r2_score={mean_train_r2_score:.4f} | mean_valid_r2_score={mean_valid_r2_score:.4f}')

end_time = time.time()  # stop timing
# compute and print the total running time
elapsed_time_minute = (end_time - start_time)/60
print(f"Total running time: {elapsed_time_minute:.2f} minutes")

model_path = 'best_cuml_rf_model.pkl'  # example path; point this wherever the model should be saved
with open(model_path, 'wb') as file:
    pickle.dump(models[best_model_number], file)
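The commented-out BaggingRegressor import above hints at building a bagging ensemble by hand with resample. A minimal sketch, assuming the float32 numpy `train_x`/`train_y` and the hyperparameters from above; `n_bags` and the plain averaging are illustrative choices, not from the original:

import numpy as np
from sklearn.utils import resample
from cuml.ensemble import RandomForestRegressor

n_bags = 5  # number of bootstrap models (illustrative)
bag_models = []
for i in range(n_bags):
    # draw a bootstrap sample (same size, with replacement) of the training data
    x_bag, y_bag = resample(train_x, train_y, random_state=i)
    m = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    m.fit(x_bag, y_bag)
    bag_models.append(m)

def bagging_predict(x):
    # cuML mirrors the input type, so numpy in gives numpy out; average over the bags
    return np.mean([m.predict(x) for m in bag_models], axis=0)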

Once reasonably good hyperparameters have been found, retrain on the entire training set (no validation split this time) and save that as the final model, as sketched below.
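A minimal sketch of that final step, reusing the tuned hyperparameters; the output file name is just an example:

final_model = RandomForestRegressor(n_estimators=n_estimators,
                                    max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    min_samples_leaf=min_samples_leaf)
final_model.fit(train_x, train_y)  # all of the training data, no validation split

with open('final_rf_model.pkl', 'wb') as file:  # hypothetical output path
    pickle.dump(final_model, file)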
