1. LightGBM Optimization
1.1 Higher Weights for Samples in the Low-Remaining Range
The official score (see 1.2) rewards accuracy for samples whose mRNA_remaining_pct falls in [0, 30], so those samples are given double weight; the weights are passed to lgb.Dataset in 1.4 via weight=w_train.

import numpy as np

# Double the weight of samples whose remaining percentage lies in [0, 30]
weight_ls = np.array(feats['mRNA_remaining_pct'].apply(lambda x: 2 if 0 <= x <= 30 else 1))
1.2 Using the Official Evaluation Metric as the Evaluation Function
Rather than LightGBM's built-in metrics, the official competition score is implemented as a custom feval. It combines the overall MAE, the MAE restricted to predictions in [0, threshold], and the F1 of classifying samples into that range:

from sklearn.metrics import mean_absolute_error

def calculate_metrics(preds, data, threshold=30):
    y_pred = preds
    y_true = data.get_label()
    # Overall MAE across all samples
    mae = np.mean(np.abs(y_true - y_pred))
    # Binary labels: does the value fall inside [0, threshold]?
    y_true_binary = ((y_true <= threshold) & (y_true >= 0)).astype(int)
    y_pred_binary = ((y_pred <= threshold) & (y_pred >= 0)).astype(int)
    # MAE restricted to samples predicted in range (penalty of 100 if none)
    mask = (y_pred >= 0) & (y_pred <= threshold)
    range_mae = (
        mean_absolute_error(y_true[mask], y_pred[mask]) if np.sum(mask) > 0 else 100
    )
    # Precision, recall, and F1 of the in-range classification
    if np.sum(y_pred_binary) > 0:
        precision = (y_pred_binary & y_true_binary).sum() / np.sum(y_pred_binary)
    else:
        precision = 0
    if np.sum(y_true_binary) > 0:
        recall = (y_pred_binary & y_true_binary).sum() / np.sum(y_true_binary)
    else:
        recall = 0
    if precision + recall == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    # Final score: equal halves for overall MAE and F1-weighted in-range MAE
    score = (1 - mae / 100) * 0.5 + (1 - range_mae / 100) * f1 * 0.5
    # (name, value, is_higher_better) as expected by LightGBM's feval
    return "custom_score", score, True
1.3 Adaptive Learning Rate
A custom callback decays the learning rate once the validation score has not improved for `patience` consecutive rounds. Mutating env.model.params alone does not affect a booster that is already training, so the callback also calls Booster.reset_parameter():

def adaptive_learning_rate(decay_rate=0.8, patience=50):
    best_score = float("-inf")
    wait = 0

    def callback(env):
        nonlocal best_score, wait
        # Each entry is (dataset_name, metric_name, value, is_higher_better)
        current_score = env.evaluation_result_list[-1][2]
        current_lr = env.model.params.get('learning_rate')
        if current_score > best_score:
            best_score = current_score
            wait = 0  # reset the counter on improvement
        else:
            wait += 1
            if wait >= patience:
                new_lr = float(current_lr) * decay_rate
                wait = 0
                env.model.params['learning_rate'] = new_lr
                env.model.reset_parameter({'learning_rate': new_lr})
                print(f"Learning rate adjusted to {new_lr}")

    return callback
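
If a fixed decay schedule is enough, LightGBM's built-in reset_parameter callback can be used instead of the plateau-based one above; a minimal sketch (assumes lightgbm is imported as lgb, and the 0.99 decay factor is illustrative):

# Deterministic per-iteration schedule via the built-in callback
lr_schedule = lgb.reset_parameter(
    learning_rate=lambda iteration: 0.05 * (0.99 ** iteration)
)
# pass it through callbacks=[lr_schedule] in lgb.train()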
1.4 K-Fold Cross-Validation
Ten folds are trained over the first n_original rows; each fold gets its own booster, combining the sample weights from 1.1, the custom metric from 1.2, and the adaptive-learning-rate callback from 1.3:

import lightgbm as lgb
from sklearn.model_selection import KFold

def train(feats, n_original):
    n_splits = 10
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    gbms = []
    # Split only the first n_original rows; the last column of feats is the label
    for fold, (train_idx, val_idx) in enumerate(
        kf.split(feats.iloc[:n_original, :]), 1
    ):
        X_train, X_val = feats.iloc[train_idx, :-1], feats.iloc[val_idx, :-1]
        y_train, y_val = feats.iloc[train_idx, -1], feats.iloc[val_idx, -1]
        w_train = weight_ls[train_idx]
        train_data = lgb.Dataset(X_train, label=y_train, weight=w_train)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
        boost_round = 25000
        early_stop_rounds = int(boost_round * 0.1)
        lgb_log = lgb.log_evaluation(period=200, show_stdv=True)
        # Early stopping is handled by this callback, so it is not duplicated in params
        lgb_stop = lgb.early_stopping(
            stopping_rounds=early_stop_rounds,
            first_metric_only=True,
            verbose=True,
            min_delta=0.00001,
        )
        params = {
            "boosting_type": "gbdt",
            "objective": "regression",
            "metric": "None",  # disable built-in metrics; rely on the custom feval
            "max_depth": 8,
            "num_leaves": 63,
            "min_data_in_leaf": 2,
            "learning_rate": 0.05,
            "feature_fraction": 0.9,
            "lambda_l1": 0.1,
            "lambda_l2": 0.2,
            "verbose": -1,
            "num_threads": 8,
        }
        adaptive_lr = adaptive_learning_rate(decay_rate=0.9, patience=1000)
        gbm = lgb.train(
            params,
            train_data,
            num_boost_round=boost_round,
            valid_sets=[val_data],
            feval=calculate_metrics,
            callbacks=[adaptive_lr, lgb_log, lgb_stop],
        )
        valid_score = gbm.best_score["valid_0"]["custom_score"]
        print(f"fold {fold} best_valid_score: {valid_score}")
        gbms.append(gbm)
    return gbms
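
At inference time the fold models are averaged; a minimal sketch, where X_test stands in for the prepared test features:

# Average the per-fold predictions (each truncated at its best iteration)
gbms = train(feats, n_original)
preds = np.mean(
    [gbm.predict(X_test, num_iteration=gbm.best_iteration) for gbm in gbms],
    axis=0,
)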
2. Hyperparameter Optimization
2.1 Grid Search
Grid search exhaustively evaluates every parameter combination (here 4 × 4 × 4 × 4 = 256 candidates, each fit 5 times for cross-validation):

from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

param_grid = {
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'n_estimators': [100, 500, 1000, 2000],
    'min_child_samples': [20, 30, 50, 100],
}
lgbm_model = LGBMRegressor(objective='regression', metric='rmse')
grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
print("Best parameters found:", grid_search.best_params_)
2.2 Random Search
Random search samples a fixed number of combinations (n_iter=100) from the parameter distributions, which is far cheaper than the full grid:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

param_dist = {
    'max_depth': sp_randint(3, 10),
    'learning_rate': [0.01, 0.02, 0.05, 0.1],
    'n_estimators': sp_randint(100, 2000),
    'min_child_samples': sp_randint(20, 100),
}
random_search = RandomizedSearchCV(estimator=lgbm_model, param_distributions=param_dist, n_iter=100, cv=5, n_jobs=-1, verbose=2)
random_search.fit(X_train, y_train)
print("Best parameters found:", random_search.best_params_)
2.3 Bayesian Optimization
Optuna builds a probabilistic model of the objective and proposes promising parameters trial by trial:

import optuna

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # Sample on a log scale (suggest_loguniform is deprecated in recent Optuna)
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 100),
    }
    model = LGBMRegressor(**params)
    model.fit(X_train, y_train)
    # R^2 on the held-out validation set; the study maximizes this value
    return model.score(X_val, y_val)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Best trial:')
trial = study.best_trial
print(f'Value: {trial.value}')
print('Params:')
for key, value in trial.params.items():
    print(f'  {key}: {value}')
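
The winning parameters can then be used to fit the final model; a short sketch, reusing X_train/y_train from above:

# Retrain once with the best parameters found by the study
final_model = LGBMRegressor(**study.best_params)
final_model.fit(X_train, y_train)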