Higher Weights for Samples in the Low-Remaining Range
- Giving higher weights to samples in the low-Remaining range makes the model pay more attention to them, improving prediction accuracy on exactly those samples. This is especially useful on imbalanced datasets, where a model otherwise tends to favor the majority samples and neglect the minority.
- In some cases, low-Remaining samples carry important information or features. Increasing their weight helps the model learn those features better and reduces bias.
- By focusing on low-Remaining samples, the model adapts better to different data distributions, which improves generalization: it performs better on new data.
- During training, the loss function is scaled by the sample weights. Heavily weighted low-Remaining samples contribute more to the loss, steering the model to fit them better (see the weighting code and the toy illustration below).
import numpy as np

# Samples whose mRNA_remaining_pct falls in [0, 30] get double weight
weight_ls = np.array(
    feats["mRNA_remaining_pct"].apply(lambda x: 2 if 0 <= x <= 30 else 1)
)
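As a toy illustration of the last bullet (made-up numbers, independent of the competition data), a weight of 2 simply doubles a sample's contribution to the squared-error loss that LightGBM minimizes:

import numpy as np

y_true = np.array([10.0, 50.0, 90.0])
y_pred = np.array([20.0, 40.0, 80.0])
weights = np.array([2, 1, 1])  # first sample lies in the low-Remaining range

# Weighted squared error: the low-Remaining residual counts twice
weighted_mse = np.sum(weights * (y_true - y_pred) ** 2) / np.sum(weights)
print(weighted_mse)  # 100.0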
Using the Official Evaluation Metric as the Validation Metric
- Replaces the default root_mean_squared_error metric, so early stopping and model selection optimize the same score as the leaderboard.
from sklearn.metrics import mean_absolute_error

# calculate_metrics implements the official competition score so it can be
# passed to LightGBM as a custom eval function (feval)
def calculate_metrics(preds, data, threshold=30):
    y_pred = preds
    y_true = data.get_label()
    # Overall mean absolute error
    mae = np.mean(np.abs(y_true - y_pred))
    # Binary labels: whether a sample falls in the low-Remaining range [0, threshold]
    y_true_binary = ((y_true <= threshold) & (y_true >= 0)).astype(int)
    y_pred_binary = ((y_pred <= threshold) & (y_pred >= 0)).astype(int)
    # MAE restricted to predictions inside the low-Remaining range; falls back
    # to the worst value (100) when no prediction lands in that range
    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = (
        mean_absolute_error(y_true[mask], y_pred[mask]) if np.sum(mask) > 0 else 100
    )
    # Precision / recall / F1 on the low-Remaining range, guarding against
    # division by zero
    if np.sum(y_pred_binary) > 0:
        precision = (y_pred_binary & y_true_binary).sum() / np.sum(y_pred_binary)
    else:
        precision = 0
    if np.sum(y_true_binary) > 0:
        recall = (y_pred_binary & y_true_binary).sum() / np.sum(y_true_binary)
    else:
        recall = 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    # Official score: half overall accuracy, half range accuracy weighted by F1
    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    return "custom_score", score, True  # True: higher is better
Adaptive Learning Rate
An adaptive learning rate adjusts itself based on feedback during training: the model converges quickly in the early phase, while a decayed rate in the late phase avoids oscillation, improving training efficiency.
# adaptive_learning_rate builds a LightGBM callback that decays the learning
# rate after `patience` rounds without improvement in the validation score
def adaptive_learning_rate(decay_rate=0.8, patience=50):
    best_score = float("-inf")  # higher is better, so start at -inf
    wait = 0

    def callback(env):
        nonlocal best_score, wait
        # Each entry is (dataset_name, metric_name, value, is_higher_better);
        # assume the last entry is the metric of interest
        current_score = env.evaluation_result_list[-1][2]
        if current_score > best_score:
            best_score = current_score
            # wait = 0  # uncomment to count only *consecutive* rounds without improvement
        else:
            wait += 1
        if wait >= patience:
            wait = 0
            new_lr = float(env.model.params.get("learning_rate")) * decay_rate
            # Mutating env.model.params alone does not affect training;
            # reset_parameter() actually applies the new rate
            env.model.reset_parameter({"learning_rate": new_lr})
            print(f"Learning rate adjusted to {new_lr}")

    return callback
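For comparison, LightGBM also ships a built-in reset_parameter callback. Unlike the score-driven version above, it follows a fixed schedule with no feedback from validation (the 0.999 decay factor below is an arbitrary example):

import lightgbm as lgb

# Exponential decay from the initial rate of 0.05, by iteration index
exp_decay = lgb.reset_parameter(
    learning_rate=lambda current_round: 0.05 * (0.999 ** current_round)
)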
Multi-Fold Cross-Training
The dataset is split into multiple folds; each fold serves once as the validation set while the remaining folds form the training set. This effectively reduces the risk of overfitting: training and evaluating the model on different splits helps improve its generalization.
import lightgbm as lgb
from sklearn.model_selection import KFold

# train runs k-fold cross-validation and returns one booster per fold.
# Only the first n_original rows are split (the rest may be augmented data);
# the label is assumed to be the last column of feats.
def train(feats, n_original):
    # Define the k-fold splitter
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    # Run k-fold cross-validation
    gbms = []
    for fold, (train_idx, val_idx) in enumerate(
        kf.split(feats.iloc[:n_original, :]), 1
    ):
        # Prepare the training and validation sets
        X_train, X_val = feats.iloc[train_idx, :-1], feats.iloc[val_idx, :-1]
        y_train, y_val = feats.iloc[train_idx, -1], feats.iloc[val_idx, -1]
        w_train = weight_ls[train_idx]  # per-sample weights from the section above
        # Build the LightGBM datasets
        train_data = lgb.Dataset(X_train, label=y_train, weight=w_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        boost_round = 25000
        early_stop_rounds = int(boost_round * 0.1)
        # Log the metric every 200 rounds and stop early on the custom score
        lgb_log = lgb.log_evaluation(period=200, show_stdv=True)
        lgb_stop = lgb.early_stopping(
            stopping_rounds=early_stop_rounds,
            first_metric_only=True,
            verbose=True,
            min_delta=0.00001,
        )
        # LightGBM parameters; "metric": "None" disables the built-in metrics
        # (e.g. root_mean_squared_error) so only the custom feval is used
        params = {
            "boosting_type": "gbdt",
            "objective": "regression",
            "metric": "None",
            "max_depth": 8,
            "num_leaves": 63,
            "min_data_in_leaf": 2,
            "learning_rate": 0.05,
            "feature_fraction": 0.9,
            "lambda_l1": 0.1,
            "lambda_l2": 0.2,
            "verbose": -1,  # -1 silences LightGBM's own logging
            "num_threads": 8,
        }
        # Attach the adaptive learning-rate callback during training
        adaptive_lr = adaptive_learning_rate(decay_rate=0.9, patience=1000)
        gbm = lgb.train(
            params,
            train_data,
            num_boost_round=boost_round,
            valid_sets=[val_data],
            feval=calculate_metrics,  # custom metric passed as feval
            callbacks=[adaptive_lr, lgb_log, lgb_stop],
        )
        valid_score = gbm.best_score["valid_0"]["custom_score"]
        print(f"fold {fold} best_valid_score: {valid_score}")
        gbms.append(gbm)
    return gbms
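At inference time the fold models are typically averaged. A minimal sketch, assuming X_test is a DataFrame holding the same feature columns used for training (i.e. feats minus the label column):

gbms = train(feats, n_original)
# Average the per-fold predictions, each at its best early-stopped iteration
fold_preds = [
    gbm.predict(X_test, num_iteration=gbm.best_iteration) for gbm in gbms
]
y_pred = np.mean(fold_preds, axis=0)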
Optimization Results
These optimizations improved the competition score by 0.03.