# Walkthrough of the example code from the official XGBoost documentation
# (xgboost官网代码调试).
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#
# https://github.com/reader-sword/xgboost
import xgboost as xgb
# Load the training and test sets from text files on disk.
# CSV files can be loaded too: append the format and the index of the column
# that holds the true label to the path, for example
#   xgb.DMatrix('train.csv?format=csv&label_column=0')
#   xgb.DMatrix('test.csv?format=csv&label_column=0')
dtrain = xgb.DMatrix('C:/data/xgboost_data/agaricus_train.txt')
dtest = xgb.DMatrix('C:/data/xgboost_data/agaricus_test.txt')

# Booster parameters are passed as a plain mapping.
param = dict(max_depth=2, eta=1, objective='binary:logistic')
num_round = 2  # number of boosting rounds

# Fit the model, then score the test set and show the predictions.
bst = xgb.train(param, dtrain, num_boost_round=num_round)
preds = bst.predict(dtest)
print(preds)
# Output:
# [15:30:44] 6513x127 matrix with 143286 entries loaded from C:/data/xgboost_data/agaricus_train.txt
# [15:30:44] 1611x127 matrix with 35442 entries loaded from C:/data/xgboost_data/agaricus_test.txt
# [0.28583017 0.9239239 0.28583017 ... 0.9239239 0.05169873 0.9239239 ]
# Load data from an in-memory NumPy matrix
import pandas as pd
import numpy as np
# Random feature matrix: 5 rows (entities) by 10 columns (features).
data = np.random.rand(5, 10)
# One random binary target (0 or 1) per row.
label = np.random.randint(low=0, high=2, size=5)
print(label)
# Wrap the array and its labels in a DMatrix.
dtrain = xgb.DMatrix(data=data, label=label)
# Output of print(label) (values are random and differ between runs):
# [0 0 1 0 0]
# Load data from pandas objects.
# Features: a 4x3 frame holding 0..11 with columns 'a', 'b', 'c'.
data = pd.DataFrame(np.arange(12).reshape((4, 3)), columns=['a', 'b', 'c'])
# Labels: one random binary value (0 or 1) per row, as a single-column frame.
label = pd.DataFrame(np.random.randint(2, size=4))
# To preview the frames interactively, evaluate data.head() / label.head()
# in a REPL or notebook cell; as bare statements in a script they do nothing.
dtrain = xgb.DMatrix(data, label=label)
print(dtrain)
# Output:
# <xgboost.core.DMatrix object at 0x000001AC4A9F25C0>
# Saving a DMatrix into an XGBoost binary file will make later loads faster.
dtrain = xgb.DMatrix('train.svm.txt')
dtrain.save_binary('train.buffer')

# Booster parameters.
param = {
    'max_depth': 2,
    'eta': 1,
    'objective': 'binary:logistic',
    'nthread': 4,
    # A single metric can be given as a string (e.g. 'auc'); a list requests
    # several metrics at once.
    'eval_metric': ['auc', 'ams@0'],
}
# Alternatively, extra metrics can be appended to a list of (key, value) pairs:
#   plst = list(param.items()) + [('eval_metric', 'ams@0')]

# Validation sets to watch during training, as (DMatrix, name) pairs.
# NOTE(review): dtest is whatever DMatrix an earlier section bound to that
# name; confirm it has the same features as 'train.svm.txt'.
evallist = [(dtest, 'eval'), (dtrain, 'train')]

# Train for num_round boosting rounds.
num_round = 10
bst = xgb.train(param, dtrain, num_round, evallist)
# After training, the model can be saved to disk.
bst.save_model('0001.model')

# The model can also be dumped to a text file, optionally with a feature map.
bst.dump_model('dump.raw.txt')
# NOTE(review): this writes to the same path as the plain dump above, so it
# replaces that file; it presumably needs 'featmap.txt' to exist - confirm.
bst.dump_model('dump.raw.txt', 'featmap.txt')

# Load the model saved above into a fresh Booster. The path must match the
# one passed to save_model, otherwise there is no file to load.
bst = xgb.Booster({'nthread': 4})  # init model
bst.load_model('0001.model')  # load the saved model
# Prediction: a trained or loaded model can score new data sets.
# Build a random matrix of 7 rows (entities) with 10 features each.
data = np.random.rand(7, 10)
dtest = xgb.DMatrix(data=data)
# NOTE(review): presumably the model must have been trained on 10 features
# for this call to be meaningful - confirm against the training data.
ypred = bst.predict(data=dtest)

# Plot feature importance.
# NOTE(review): plotting presumably requires matplotlib to be installed - confirm.
xgb.plot_importance(booster=bst)