评估函数:逻辑较简单,无需计算梯度,可以先行实现。
损失函数:一阶和二阶梯度的计算需要一些额外逻辑,较为复杂;下面提供两种实现方式。
方式一
import lightgbm as lgb
from sklearn.model_selection import KFold
from scipy.misc import derivative
# LightGBM model parameters. No 'objective' here: the training call supplies
# a custom objective via fobj instead.
params = {
    'num_leaves': 31,         # maximum leaves per tree
    'learning_rate': 0.05,
    'feature_fraction': 0.9,  # column subsampling per tree
    'bagging_fraction': 0.8,  # row subsampling per iteration
    'n_jobs': 8,              # parallel threads
}
def weighted_mae(y_true, y_pred, datetime_hour):
    """Weighted mean absolute error.

    Each sample is weighted by (hour + 1) / 13, so later hours of the day
    contribute more to the error.
    """
    weights = (datetime_hour + 1) / 13
    abs_errors = np.abs(y_true - y_pred)
    return np.mean(weights * abs_errors)
# Custom evaluation function wrapper for LightGBM's feval interface.
def custom_metric(y_pred, data, t):
    """Return ('WMAE', score, is_higher_better) for LightGBM.

    `t` is the datetime-hour array matching the rows of `data`;
    False marks the metric as "lower is better".
    """
    labels = data.get_label()
    score = weighted_mae(labels, y_pred, t)
    return 'WMAE', score, False
def custom_loss(y_pred, data, t):
    """Custom objective (fobj) for weighted MAE.

    Gradient of w*|y_pred - y_true| is w*sign(y_pred - y_true); the true
    Hessian of MAE is zero, so the weight array is returned as a surrogate
    Hessian to keep LightGBM's leaf updates bounded.
    """
    labels = data.get_label()
    weights = (t + 1) / 13
    grad = weights * np.sign(y_pred - labels)
    return grad, weights
# Number of cross-validation folds.
n_splits = 5
# KFold splitter (shuffled with a fixed seed for reproducibility).
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
# Training feature columns and the label column.
train_features = [c for c in train_data.columns if c != "roi"]
train_labels = train_data["roi"]
# Per-fold training and validation errors.
train_scores, valid_scores = [], []
# Accumulated (summed here, averaged after the loop) feature importances.
feature_importance = pd.DataFrame({'feature': train_features, 'importance': 0})
# Per-fold predictions on the test set.
test_predictions = []
# The test feature matrix does not change per fold; build it once.
test_features = test_data.drop('uuid', axis=1)
for fold, (train_index, valid_index) in enumerate(kf.split(train_data)):
    # Split this fold into training and validation sets.
    X_train, y_train = train_data[train_features].iloc[train_index], train_labels.iloc[train_index]
    X_valid, y_valid = train_data[train_features].iloc[valid_index], train_labels.iloc[valid_index]
    # Hour-of-day arrays drive the per-sample weights of the custom loss/metric.
    t_train = np.array(X_train['datetime_hour'])
    t_valid = np.array(X_valid['datetime_hour'])
    # Convert to LightGBM Dataset format.
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_valid, label=y_valid)
    # Train with the custom weighted-MAE objective (fobj) and metric (feval).
    # The fold's t arrays are bound as lambda defaults so the closures cannot
    # pick up a later iteration's values.
    model = lgb.train(params,
                      train_set,
                      num_boost_round=1000,
                      valid_sets=[valid_set],
                      early_stopping_rounds=100,
                      verbose_eval=100,
                      feval=lambda y_pred, data, t=t_valid: custom_metric(y_pred, data, t),
                      fobj=lambda y_pred, data, t=t_train: custom_loss(y_pred, data, t))
    # Record the training-set error. BUG FIX: train_scores was never appended
    # to before, so the "overall training mae" printout always reported 0.
    train_pred = model.predict(X_train, num_iteration=model.best_iteration)
    train_scores.append(weighted_mae(np.array(y_train), train_pred, t_train))
    # Record the validation error at the best iteration.
    valid_scores.append(model.best_score['valid_0']['WMAE'])
    # Accumulate this fold's gain-based feature importances.
    fold_importance = pd.DataFrame({'feature': train_features, 'importance': model.feature_importance()})
    feature_importance['importance'] += fold_importance['importance']
    # Predict the test set with this fold's model and store the result.
    test_predictions.append(model.predict(test_features))
    print(f"Fold {fold + 1} - training mae: {train_scores[-1]:.4f}, validation mae: {valid_scores[-1]:.4f}")
# Report the average training and validation errors across all folds.
print(f"Overall training mae: {sum(train_scores)/n_splits:.4f}, overall validation mae: {sum(valid_scores)/n_splits:.4f}")
# Average the feature importances over the folds and report them.
feature_importance['importance'] /= n_splits
feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)
print(feature_importance)
方式二
import lightgbm as lgb
from scipy.misc import derivative
# Focal loss objective for LightGBM (binary formulation).
# alpha (default 0.25): down-weights the loss of the easy/abundant class.
# gamma (default 2): focuses training on hard examples (here the y=1 samples).
def focal_loss_lgb_sk(y_pred, dtrain, alpha=0.25, gamma=2):
    """LightGBM fobj returning (grad, hess) of the focal loss.

    Derivatives are computed by central finite differences. BUG FIX:
    `scipy.misc.derivative` was deprecated and removed in SciPy 1.12; this
    inlines the same central-difference formulas (same dx), removing the
    dependency while preserving the numerical behavior.

    NOTE(review): this is a *binary* focal loss (labels expected in {0, 1});
    it is inconsistent with the 4-class objective used below — confirm which
    is intended.
    """
    label = dtrain.get_label()
    a, g = alpha, gamma

    def fl(x, t):
        # Pointwise focal loss on raw score x against binary target t.
        p = 1 / (1 + np.exp(-x))
        return -(a * t + (1 - a) * (1 - t)) * ((1 - (t * p + (1 - t) * (1 - p))) ** g) * (
            t * np.log(p) + (1 - t) * np.log(1 - p))

    dx = 1e-6  # same step size scipy.misc.derivative was called with
    f_plus = fl(y_pred + dx, label)
    f_minus = fl(y_pred - dx, label)
    f_center = fl(y_pred, label)
    grad = (f_plus - f_minus) / (2 * dx)                  # first derivative
    # NOTE: dx**2 = 1e-12 makes the second difference prone to cancellation
    # error (as the original scipy call was too); kept for behavior parity.
    hess = (f_plus - 2 * f_center + f_minus) / (dx * dx)  # second derivative
    return grad, hess
# Custom macro-F1 evaluation metric for LightGBM multiclass models.
def f1_score_vali(preds, data_vali):
    """LightGBM feval: ('f1_score', macro F1, is_higher_better=True).

    LightGBM passes multiclass predictions as one flat array grouped by class
    ([class0 scores..., class1 scores..., ...]). The class count is inferred
    from the array lengths instead of being hard-coded to 4, so the metric
    works for any num_class.
    """
    labels = data_vali.get_label()
    num_class = len(preds) // len(labels)
    pred_labels = np.argmax(np.reshape(preds, (num_class, -1)), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=pred_labels, average='macro')
    return 'f1_score', score_vali, True
# LightGBM parameters for the multiclass (4-way) focal-loss experiment.
# 'metric' is disabled because a custom feval (macro F1) is used instead.
params = {
    "boosting": 'gbdt',
    "objective": "multiclass",
    "num_class": 4,
    "learning_rate": 0.1,
    "num_leaves": 128,
    "max_depth": -1,          # no depth limit
    "lambda_l2": 0.1,         # L2 regularization
    "bagging_fraction": 0.8,  # row subsampling
    "feature_fraction": 0.8,  # column subsampling
    "metric": None,
    "nthread": 10,
    "verbose": -1,            # silence LightGBM's internal logging
}
# Train with the custom focal-loss objective and the macro-F1 metric.
# BUG FIX: `valid_sets` was passed twice (valid_matrix and lgb_eval), which is
# a SyntaxError (repeated keyword argument); a single valid_sets is kept.
gbm = lgb.train(params,
                train_set=train_matrix,
                valid_sets=valid_matrix,
                num_boost_round=2000,
                verbose_eval=100,
                early_stopping_rounds=200,
                fobj=focal_loss_lgb_sk,  # custom loss function
                feval=f1_score_vali)     # custom evaluation metric
# Next steps: predict, then compute metrics on the validation set.