LightGBM ships with relatively few built-in objectives: for binary classification there is only binary, at most combined with class_weight to re-weight the loss across classes. However, we can supply a custom loss function, as long as it is twice differentiable. This post follows the approach of [1] (reproducing logloss, then implementing focal loss) and [2] (borrowing loss functions from deep learning), implemented with LGBMClassifier on our own dataset.
1. Reproducing logloss, and the init score
Data: maxhalford.github.io/files/datasets/creditcardfraud.zip
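The splits X_fit / y_fit, X_val / y_val, X_test / y_test used throughout are assumed to be prepared roughly as follows (a hypothetical sketch: the file name creditcard.csv and the target column 'Class' follow the usual credit-card-fraud schema and are assumptions, not part of the original post):

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv('creditcard.csv')  # hypothetical file name after unzipping
X, y = df.drop(columns='Class'), df['Class']  # 'Class' is an assumed column name
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2021)
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.25, stratify=y_train, random_state=2021)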
1.1 Baseline version
# sklearn-style API
import lightgbm as lgb
import numpy as np
from scipy import special, optimize
from sklearn.metrics import roc_auc_score, log_loss

bst_params = {'learning_rate': 0.01, 'n_estimators': 10000}
# lgb.log_evaluation replaces the deprecated verbose argument of fit()
callbacks = [lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=100)]
model = lgb.LGBMClassifier(objective='binary', random_state=2021, verbose=-1, **bst_params)
model.fit(X_fit, y_fit,
          callbacks=callbacks,
          eval_names=['fit', 'val'],
          eval_set=[(X_fit, y_fit), (X_val, y_val)])
y_pred = model.predict_proba(X_test)[:,1]
print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")
Training until validation scores don't improve for 20 rounds
[100] fit's binary_logloss: 0.0018981 val's binary_logloss: 0.0035569
[200] fit's binary_logloss: 0.00080822 val's binary_logloss: 0.00283644
[300] fit's binary_logloss: 0.000396519 val's binary_logloss: 0.00264941
Early stopping, best iteration is:
[352] fit's binary_logloss: 0.000281286 val's binary_logloss: 0.00261413
Test's ROC AUC: 0.97772
Test's logloss: 0.00237
1.2 Custom loss function with init score
def logloss_objective_sklearn(labels, preds):
    # gradient and hessian of binary logloss w.r.t. the raw score
    p = special.expit(preds)
    grad = p - labels
    hess = p * (1 - p)
    return grad, hess
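These expressions follow from differentiating the logloss with respect to the raw score $z$, where $p = \sigma(z)$:

$$\ell(y, z) = -\big[y \log p + (1 - y)\log(1 - p)\big], \qquad \frac{\partial \ell}{\partial z} = p - y, \qquad \frac{\partial^2 \ell}{\partial z^2} = p(1 - p)$$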
def logloss_metric_sklearn(labels, preds):
    # average negative log-likelihood, reported as a custom eval metric
    p = special.expit(preds)
    ll = np.empty_like(p)
    pos = labels == 1
    ll[pos] = np.log(p[pos])
    ll[~pos] = np.log(1 - p[~pos])
    is_higher_better = False
    return 'logloss', -ll.mean(), is_higher_better
def logloss_init_score(y):
    # logloss is minimized by the constant prediction p = mean(y);
    # the raw init score is the corresponding log-odds
    p = y.mean()
    p = np.clip(p, 1e-15, 1 - 1e-15)  # never hurts
    log_odds = np.log(p / (1 - p))
    return log_odds
initScore_fit = np.full_like(y_fit, logloss_init_score(y_fit), dtype=float)
initScore_val = np.full_like(y_val, logloss_init_score(y_val), dtype=float)
# sklearn-style API with the custom objective
bst_params = {'learning_rate': 0.01, 'n_estimators': 10000}
callbacks = [lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=100)]
model = lgb.LGBMClassifier(objective=logloss_objective_sklearn, random_state=42, verbose=-1, **bst_params)
model.fit(X_fit, y_fit,
          callbacks=callbacks,
          eval_metric=logloss_metric_sklearn,
          init_score=initScore_fit,
          eval_init_score=[initScore_fit, initScore_val],
          eval_names=['fit', 'val'],
          eval_set=[(X_fit, y_fit), (X_val, y_val)])
# with a custom objective, predict() returns raw scores: add back the
# init score and apply the sigmoid to recover probabilities
y_pred = special.expit(logloss_init_score(y_fit) + model.predict(X_test))
print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")
Training until validation scores don't improve for 20 rounds
[100] fit's binary_logloss: -2.08799 fit's logloss: 0.0018981 val's binary_logloss: -2.08022 val's logloss: 0.00355788
[200] fit's binary_logloss: -2.20016 fit's logloss: 0.00080822 val's binary_logloss: -2.18523 val's logloss: 0.00283437
[300] fit's binary_logloss: -2.29581 fit's logloss: 0.000396519 val's binary_logloss: -2.27789 val's logloss: 0.00264595
Early stopping, best iteration is:
[352] fit's binary_logloss: -2.33998 fit's logloss: 0.000281286 val's binary_logloss: -2.32222 val's logloss: 0.00261033
Test's ROC AUC: 0.97772
Test's logloss: 0.00237
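The test AUC and logloss match the built-in binary objective exactly, confirming that the custom logloss reproduces it. Note that the built-in binary_logloss column in the log above is now negative and meaningless: with a custom objective LightGBM evaluates it directly on raw scores, so only our custom logloss column should be read.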
2. Defining focal loss
The gradient and Hessian of the focal loss below are derived by hand; they could equally be generated with an autodiff package such as autograd or jax (a sketch follows the class definition).
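For reference, the focal loss of Lin et al. is

$$\mathrm{FL}(p_t) = -\alpha_t\,(1 - p_t)^{\gamma}\,\log(p_t),\qquad p_t = \begin{cases} p, & y = 1\\ 1 - p, & y = 0\end{cases},\qquad \alpha_t = \begin{cases} \alpha, & y = 1\\ 1 - \alpha, & y = 0\end{cases}$$

which reduces to plain logloss when $\gamma = 0$ and $\alpha$ is unset.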
def fl_pos(pt, alpha, gamma):
    # focal loss term for a positive sample, shown for reference
    return -alpha * (1 - pt) ** gamma * np.log(pt)
class FocalLoss:
    def __init__(self, gamma, alpha=None):
        # only these two parameters are needed; alpha=None effectively
        # gives every sample weight 1
        self.alpha = alpha
        self.gamma = gamma
    def at(self, y):
        # class weight from the FL definition: alpha for positives,
        # 1 - alpha for negatives
if self.alpha is None:
return np.ones_like(y)
return np.where(y, self.alpha, 1 - self.alpha)
    def pt(self, y, p):
        # pt as in the FL definition: p for positives, 1 - p for negatives
p = np.clip(p, 1e-15, 1 - 1e-15)
return np.where(y, p, 1 - p)
    def __call__(self, y_true, y_pred):
        # the FL formula itself
at = self.at(y_true)
pt = self.pt(y_true, y_pred)
return -at * (1 - pt) ** self.gamma * np.log(pt)
    def grad(self, y_true, y_pred):
        # first derivative of FL w.r.t. the raw score
y = 2 * y_true - 1 # {0, 1} -> {-1, 1}
at = self.at(y_true)
pt = self.pt(y_true, y_pred)
g = self.gamma
return at * y * (1 - pt) ** g * (g * pt * np.log(pt) + pt - 1)
    def hess(self, y_true, y_pred):
        # second derivative of FL w.r.t. the raw score (product rule on u*v)
y = 2 * y_true - 1 # {0, 1} -> {-1, 1}
at = self.at(y_true)
pt = self.pt(y_true, y_pred)
g = self.gamma
u = at * y * (1 - pt) ** g
du = -at * y * g * (1 - pt) ** (g - 1)
v = g * pt * np.log(pt) + pt - 1
dv = g * np.log(pt) + g + 1
return (du * v + u * dv) * y * (pt * (1 - pt))
    def init_score(self, y_true):
        # search for the constant probability that minimizes the focal
        # loss, then convert it to a raw log-odds score
res = optimize.minimize_scalar(
lambda p: self(y_true, p).sum(),
bounds=(0, 1),
method='bounded'
)
p = res.x
log_odds = np.log(p / (1 - p))
return log_odds
    def lgb_obj(self, preds, train_data):
        # objective for the native lgb.train API: signature (preds, dataset)
y = train_data.get_label()
p = special.expit(preds)
return self.grad(y, p), self.hess(y, p)
def lgb_eval(self, preds, train_data):
y = train_data.get_label()
p = special.expit(preds)
is_higher_better = False
return 'focal_loss', self(y, p).mean(), is_higher_better
    def lgb_obj_sklearn(self, labels, preds):
        # objective for the sklearn API: labels come first
p = special.expit(preds)
return self.grad(labels, p), self.hess(labels, p)
def lgb_eval_sklearn(self, labels,preds):
p = special.expit(preds)
is_higher_better = False
return 'focal_loss', self(labels, p).mean(), is_higher_better
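As mentioned above, the derivatives can also be generated automatically instead of by hand. A minimal jax sketch (assumes jax is installed; fl_scalar is a hypothetical helper mirroring FocalLoss.__call__ for a single sample):

import jax
import jax.numpy as jnp

def fl_scalar(z, y, alpha=0.9, gamma=0.05):
    # focal loss of one sample as a function of its raw score z
    p = jax.nn.sigmoid(z)
    pt = jnp.where(y == 1, p, 1 - p)
    at = jnp.where(y == 1, alpha, 1 - alpha)
    return -at * (1 - pt) ** gamma * jnp.log(pt)

# vectorized first and second derivatives w.r.t. the raw score
fl_grad = jax.vmap(jax.grad(fl_scalar), in_axes=(0, 0))
fl_hess = jax.vmap(jax.grad(jax.grad(fl_scalar)), in_axes=(0, 0))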
fl = FocalLoss(alpha=0.9, gamma=0.05)
bst_params = {'learning_rate': 0.01, 'n_estimators': 10000}
callbacks = [lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=100)]
initScore_fit = np.full_like(y_fit, fl.init_score(y_fit), dtype=float)
initScore_val = np.full_like(y_val, fl.init_score(y_val), dtype=float)
model = lgb.LGBMClassifier(objective=fl.lgb_obj_sklearn, random_state=2021, verbose=-1, **bst_params)
model.fit(X_fit, y_fit,
          callbacks=callbacks,
          eval_metric=fl.lgb_eval_sklearn,
          init_score=initScore_fit,
          eval_init_score=[initScore_fit, initScore_val],
          eval_set=[(X_fit, y_fit), (X_val, y_val)])
y_pred = special.expit(fl.init_score(y_fit) + model.predict(X_test))
print()
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")
Training until validation scores don't improve for 20 rounds
[100] training's binary_logloss: -1.75704 training's focal_loss: 0.00151758 valid_1's binary_logloss: -1.74779 valid_1's focal_loss: 0.00236481
[200] training's binary_logloss: -1.8983 training's focal_loss: 0.000622405 valid_1's binary_logloss: -1.88522 valid_1's focal_loss: 0.00169771
[300] training's binary_logloss: -2.02261 training's focal_loss: 0.000298705 valid_1's binary_logloss: -2.00838 valid_1's focal_loss: 0.00153115
Early stopping, best iteration is:
[307] training's binary_logloss: -2.03088 training's focal_loss: 0.0002844 valid_1's binary_logloss: -2.01628 valid_1's focal_loss: 0.00152971
Test's ROC AUC: 0.97781
Test's logloss: 0.00330
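Compared with section 1, focal loss nudges test AUC up slightly (0.97781 vs 0.97772) while test logloss worsens (0.00330 vs 0.00237), which is expected: the model no longer optimizes logloss, so its outputs are not calibrated to it.
The class also defines lgb_obj / lgb_eval for the native API. A minimal sketch of that route (parameter passing assumes LightGBM >= 4.0, where a callable objective is accepted in params; older versions passed it via the fobj argument):

fl = FocalLoss(alpha=0.9, gamma=0.05)
init = fl.init_score(y_fit)
fit_set = lgb.Dataset(X_fit, y_fit, init_score=np.full_like(y_fit, init, dtype=float))
val_set = lgb.Dataset(X_val, y_val, init_score=np.full_like(y_val, init, dtype=float), reference=fit_set)
booster = lgb.train({'objective': fl.lgb_obj, 'learning_rate': 0.01, 'verbose': -1},
                    fit_set,
                    num_boost_round=10000,
                    valid_sets=[fit_set, val_set],
                    valid_names=['fit', 'val'],
                    feval=fl.lgb_eval,
                    callbacks=[lgb.early_stopping(20), lgb.log_evaluation(100)])
y_pred = special.expit(init + booster.predict(X_test))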
3. Bringing in a torch loss function
Here we use torch.nn.SmoothL1Loss(reduction='mean'), letting torch autograd supply the gradient and Hessian.
import torch
from scipy import special

def torch_loss_objective(labels, preds):
    # grad/hess come from torch autograd instead of manual derivation;
    # note the derivatives here are taken w.r.t. the probability p rather
    # than the raw score, and reduction='mean' scales them by 1/n
    preds = torch.from_numpy(preds)
    p = torch.sigmoid(preds)
    p.requires_grad = True
    labels = torch.from_numpy(labels).to(p.dtype)
    labels.requires_grad = False
    loss = torch.nn.SmoothL1Loss(reduction='mean')(p, labels)
    dy_dx = torch.autograd.grad(loss, p, create_graph=True, retain_graph=True)[0]
    dy_dx2 = torch.autograd.grad(dy_dx, p,
                                 grad_outputs=torch.ones(p.shape),
                                 create_graph=False)[0]
    return dy_dx.detach_().numpy(), dy_dx2.numpy()
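One caveat: LightGBM expects derivatives w.r.t. the raw score, while the function above differentiates w.r.t. the probability p, dropping the sigmoid's chain-rule factor; together with reduction='mean' this likely explains the slow convergence below. A hedged variant (hypothetical, not the run shown) that differentiates w.r.t. the raw score and keeps gradients per-sample:

def torch_loss_objective_raw(labels, preds):
    # differentiate w.r.t. the raw score z itself; reduction='sum' keeps
    # the gradients per-sample instead of scaling them by 1/n
    z = torch.from_numpy(preds).requires_grad_(True)
    y = torch.from_numpy(labels).to(z.dtype)
    loss = torch.nn.SmoothL1Loss(reduction='sum')(torch.sigmoid(z), y)
    grad = torch.autograd.grad(loss, z, create_graph=True)[0]
    hess = torch.autograd.grad(grad, z, grad_outputs=torch.ones_like(z))[0]
    return grad.detach().numpy(), hess.detach().numpy()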
bst_params = {'learning_rate': 0.01, 'n_estimators': 1000}
callbacks = [lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(period=100)]
model = lgb.LGBMClassifier(objective=torch_loss_objective, random_state=42, verbose=-1, **bst_params)
model.fit(X_fit, y_fit,
          callbacks=callbacks,
          eval_set=[(X_fit, y_fit), (X_val, y_val)])
y_pred = special.expit(model.predict(X_test))
print(f"Test's ROC AUC: {roc_auc_score(y_test, y_pred):.5f}")
print(f"Test's logloss: {log_loss(y_test, y_pred):.5f}")
Training until validation scores don't improve for 20 rounds
[100] training's binary_logloss: -0.347727 valid_1's binary_logloss: -0.347017
[200] training's binary_logloss: -0.564626 valid_1's binary_logloss: -0.563895
[300] training's binary_logloss: -0.710497 valid_1's binary_logloss: -0.709754
[400] training's binary_logloss: -0.816488 valid_1's binary_logloss: -0.815743
[500] training's binary_logloss: -0.893713 valid_1's binary_logloss: -0.892971
[600] training's binary_logloss: -0.959138 valid_1's binary_logloss: -0.958395
[700] training's binary_logloss: -1.01306 valid_1's binary_logloss: -1.01231
[800] training's binary_logloss: -1.05835 valid_1's binary_logloss: -1.05778
[900] training's binary_logloss: -1.09786 valid_1's binary_logloss: -1.09805
[1000] training's binary_logloss: -1.13217 valid_1's binary_logloss: -1.13223
Did not meet early stopping. Best iteration is:
[1000] training's binary_logloss: -1.13217 valid_1's binary_logloss: -1.13223
Test's ROC AUC: 0.96652
Test's logloss: 0.10877
That covers custom loss functions in LightGBM. Several details still need work: converting the model to PMML for deployment (without it the model cannot be served on a Java platform), and support for more torch loss functions, both of which call for further hands-on exploration.
References:
[1] https://zhuanlan.zhihu.com/p/358771434
[2] https://zhuanlan.zhihu.com/p/360247941
[3] https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier.predict