以二分类的交叉熵损失函数为例
1、直接使用XGB中定义好的交叉熵损失函数
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random
import os
seed = 1
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
# 加载数据集
data = load_breast_cancer()
X = data.data
y = data.target
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 转换数据格式为DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
# 设置模型参数
params = {
'objective': 'binary:logistic', # 二分类逻辑回归
'eval_metric': 'error', # 评估指标为错误率
'seed': seed
}
# 训练模型
num_rounds = 100 # 迭代次数
model = xgb.train(params, dtrain, num_rounds)
# 预测
y_pred = model.predict(dtest)
y_pred_binary = [1 if p > 0.1 else 0 for p in y_pred] # 将概率转换为类别
# 计算准确率
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)
最终的运行结果为
Accuracy: 0.9210526315789473
2、使用自定义的交叉熵损失函数
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import random
import os
seed = 1
random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
# 加载数据集
data = load_breast_cancer()
X = data.data
y = data.target
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 转换数据格式为DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
def custom_loss(preds, dtrain):
labels = dtrain.get_label()
preds = 1.0 / (1.0 + np.exp(-preds))
def binary_cross_entropy_gradient(y_pred, y_true):
eps = 1e-15 # 避免除零错误的常数
# 预测概率取值范围限制在 [eps, 1-eps] 内
y_pred = np.clip(y_pred, eps, 1 - eps)
# 二分类交叉熵损失函数的一阶导数(梯度)
gradient = - y_true / y_pred + (1 - y_true) / (1 - y_pred)
return gradient
def binary_cross_entropy_hessian(y_pred, y_true):
eps = 1e-15 # 避免除零错误的常数
# 预测概率取值范围限制在 [eps, 1-eps] 内
y_pred = np.clip(y_pred, eps, 1 - eps)
# 二分类交叉熵损失函数的二阶导数(海森矩阵)
hessian = y_true / (y_pred ** 2) + (1 - y_true) / ((1 - y_pred) ** 2)
return hessian
gradient = binary_cross_entropy_gradient(preds, labels)
hessian = binary_cross_entropy_hessian(preds, labels)
# 防止梯度爆炸做剪裁
l2 = np.linalg.norm(gradient)
max_norm = 0.43
if l2>max_norm:
gradient = gradient*(max_norm/l2)
l2 = np.linalg.norm(hessian)
if l2>max_norm:
hessian = hessian*(max_norm/l2)
return gradient, hessian
# 设置模型参数
params = {
'eval_metric': 'error', # 评估指标为错误率
'seed': seed
}
# 训练模型
num_rounds = 100 # 迭代次数
model = xgb.train(params, dtrain, num_rounds,obj=custom_loss)
# 预测
y_pred = model.predict(dtest)
y_pred_binary = [1 if p > 0.1 else 0 for p in y_pred] # 将概率转换为类别
# 计算准确率
accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:", accuracy)
Accuracy: 0.9122807017543859
3、AUC不一致的原因猜测
可能是对梯度爆炸或者消失时的处理机制不一致造成的