切片树模型|Slice tree model
当 XGBoost 中的 `booster` 参数设置为 `gbtree` 或 `dart` 时,算法构建了一个由多棵树组成的树模型。这个树模型可以被切片成多个子模型,每个子模型包含原始模型中的一部分树。这个切片过程允许创建更小、更专业的模型,专注于原始模型性能或行为的特定方面。
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_classification
num_classes = 3
X, y = make_classification(n_samples=1000, n_informative=5, n_classes=num_classes)
dtrain = xgb.DMatrix(data=X, label=y)
num_parallel_tree = 4
num_boost_round = 16
# Total number of built trees is num_parallel_tree * num_classes * num_boost_round.
# We build a boosted random forest for classification here.
# The parameters reference the variables above (instead of repeating the
# literals 4 and 3) so the tree-count formula stays in sync with the model.
booster = xgb.train(
    {
        "num_parallel_tree": num_parallel_tree,
        "subsample": 0.5,
        "num_class": num_classes,
    },
    num_boost_round=num_boost_round,
    dtrain=dtrain,
)
# This is the sliced model, containing the forests of rounds [3, 7).
# A step is also supported, with some limitations (e.g. a negative step is invalid).
slice_begin, slice_end = 3, 7
sliced: xgb.Booster = booster[slice_begin:slice_end]
# Access the individual tree layers: iterating a booster yields one Booster
# per boosting round.
trees = list(booster)
assert len(trees) == num_boost_round
sliced_trees = list(sliced)
# The slice length is derived from its bounds rather than hard-coded.
assert len(sliced_trees) == slice_end - slice_begin
print(trees)
print(sliced_trees)
[<xgboost.core.Booster at 0x7f04a6e20820>, <xgboost.core.Booster at 0x7f04a6d89250>, <xgboost.core.Booster at 0x7f04a6fac550>, <xgboost.core.Booster at 0x7f048b1eaee0>, <xgboost.core.Booster at 0x7f048b1ea520>, <xgboost.core.Booster at 0x7f048b1ea850>, <xgboost.core.Booster at 0x7f048b1ea700>, <xgboost.core.Booster at 0x7f048b1eabe0>, <xgboost.core.Booster at 0x7f04a6e3ddf0>, <xgboost.core.Booster at 0x7f04a6e3d850>, <xgboost.core.Booster at 0x7f048b1e72e0>, <xgboost.core.Booster at 0x7f048b1e7970>, <xgboost.core.Booster at 0x7f048b1e75b0>, <xgboost.core.Booster at 0x7f048b1e79d0>, <xgboost.core.Booster at 0x7f048b1e7f10>, <xgboost.core.Booster at 0x7f048b1e74c0>]
[<xgboost.core.Booster at 0x7f04a6b82f40>, <xgboost.core.Booster at 0x7f04736dfeb0>, <xgboost.core.Booster at 0x7f04736dfbe0>, <xgboost.core.Booster at 0x7f04736dfd00>]
切片模型是所选树的副本,这意味着切片过程不会修改原模型本身(模型在切片过程中是不可变的)。这个特性是早停回调中 `save_best` 选项的基础。
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.datasets import make_classification
from scipy.special import logit
from sklearn.datasets import load_svmlight_file
def individual_tree() -> None:
    """Chain single-round predictions and check they match the full model.

    Trains a small ``reg:logistic`` booster, then walks over the boosting
    rounds one at a time using ``iteration_range``. Each round's raw margin is
    fed to the next round as ``base_margin``; the last round is predicted with
    ``output_margin=False``. The accumulated result is compared with a
    prediction made by the whole booster.

    NOTE(review): ``train`` and ``test`` are read as module-level names and
    passed to ``load_svmlight_file``; they are not defined in this snippet and
    must exist before this function is called.
    """
    features_train, labels_train = load_svmlight_file(train)
    features_test, _ = load_svmlight_file(test)
    train_matrix = xgb.QuantileDMatrix(features_train, labels_train)

    n_rounds = 4
    # Pin the base score; otherwise xgboost estimates one from the training data.
    base_score = 0.5
    train_params = {
        "max_depth": 2,
        "eta": 1,
        "objective": "reg:logistic",
        "tree_method": "hist",
        "base_score": base_score,
    }
    booster = xgb.train(train_params, train_matrix, num_boost_round=n_rounds)

    # Map the base score back to a raw leaf value (margin) with logit.
    running = np.full((features_test.shape[0],), logit(base_score))
    final_round = n_rounds - 1
    for round_idx in range(n_rounds):
        # - The previous prediction is used as the base margin for the model.
        # - iteration_range restricts the prediction to a single round.
        # - Every round but the last requests the raw leaf value (margin) for
        #   accumulation; the last round returns the transformed prediction.
        test_matrix = xgb.DMatrix(features_test, base_margin=running)
        running = booster.predict(
            test_matrix,
            iteration_range=(round_idx, round_idx + 1),
            output_margin=round_idx != final_round,
        )

    expected = booster.predict(xgb.DMatrix(features_test), output_margin=False)
    np.testing.assert_allclose(running, expected)
def model_slices() -> None:
    """Run inference round by round using model slices.

    Trains a small ``reg:logistic`` booster, takes one slice ``booster[k]`` per
    boosting round, and predicts with each slice in turn. Each slice's raw
    margin is fed to the next slice as ``base_margin``; the last slice is
    predicted with ``output_margin=False``. The accumulated result is compared
    with a prediction made by the whole booster.

    NOTE(review): ``train`` and ``test`` are read as module-level names and
    passed to ``load_svmlight_file``; they are not defined in this snippet and
    must exist before this function is called.
    """
    features_train, labels_train = load_svmlight_file(train)
    features_test, _ = load_svmlight_file(test)
    train_matrix = xgb.QuantileDMatrix(features_train, labels_train)

    n_rounds = 4
    # Pin the base score; otherwise xgboost estimates one from the training data.
    base_score = 0.5
    train_params = {
        "max_depth": 2,
        "eta": 1,
        "objective": "reg:logistic",
        "tree_method": "hist",
        "base_score": base_score,
    }
    booster = xgb.train(train_params, train_matrix, num_boost_round=n_rounds)

    # One sliced model per boosting round.
    per_round = [booster[k] for k in range(n_rounds)]

    # Map the base score back to a raw leaf value (margin) with logit.
    running = np.full((features_test.shape[0],), logit(base_score))
    final_round = n_rounds - 1
    for position, round_model in enumerate(per_round):
        # Feed the previous scores in as the base margin. Every slice but the
        # last requests the raw leaf value (margin) for accumulation; the last
        # one returns the transformed prediction.
        test_matrix = xgb.DMatrix(features_test, base_margin=running)
        running = round_model.predict(
            test_matrix, output_margin=position != final_round
        )

    expected = booster.predict(xgb.DMatrix(features_test), output_margin=False)
    np.testing.assert_allclose(running, expected)
# Run both demos only when executed as a script (or notebook cell), so that
# importing this module does not trigger model training as a side effect.
if __name__ == "__main__":
    individual_tree()
    model_slices()
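上面的两个函数演示了如何使用 XGBoost 库获取每棵单独树的预测,并将它们结合起来得到最终的预测结果。注意:代码中的 `train` 和 `test` 在本片段中并未定义,运行前需要先将它们定义为可供 `load_svmlight_file` 读取的训练/测试数据文件(svmlight/libsvm 格式)。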
- `individual_tree` 函数:
  - 使用 `scipy.special.logit` 函数将基础分数(base score)反变换回原始叶值(margin)。
  - 加载训练和测试数据集。
  - 定义 XGBoost 参数,并使用 `xgb.train` 方法来训练模型。
  - 使用 `booster.predict` 方法来获取每棵树的预测。对于最后一轮,获取转换后的预测;对于其他轮,获取原始叶值以便累加。
  - 使用 `np.testing.assert_allclose` 方法来验证单独树预测的结合结果与使用整个模型得到的预测结果是否相近。
- `model_slices` 函数:
  - 与 `individual_tree` 函数类似,也是加载训练和测试数据集,定义 XGBoost 参数,并使用 `xgb.train` 方法来训练模型。
  - 使用 `booster[t]` 来获取每棵树,并将其存储在一个列表中。
  - 使用每棵树的 `predict` 方法来获取预测。对于最后一轮,获取转换后的预测;对于其他轮,获取原始叶值以便累加。
  - 同样使用 `np.testing.assert_allclose` 方法来验证单独树预测的结合结果与使用整个模型得到的预测结果是否相近。
- 与