Implementing LightGBM in Python (Advanced)

1. Module preparation

import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

try:
    import cPickle as pickle
except ImportError:
    import pickle

2. Loading and building the dataset

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)
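
The binary.train / binary.test files are the example data shipped with the LightGBM repository. If they are not available locally, a purely synthetic stand-in of roughly the same shape (hypothetical data, only so that the remaining snippets run) can be substituted for the read_csv calls above:

# hypothetical replacement data, reusing the numpy/pandas imports from step 1
rng = np.random.RandomState(42)
num_rows, num_cols = 7000, 28
X_train = pd.DataFrame(rng.rand(num_rows, num_cols))
y_train = pd.Series(rng.randint(0, 2, num_rows))
X_test = pd.DataFrame(rng.rand(500, num_cols))
y_test = pd.Series(rng.randint(0, 2, 500))
W_train = pd.Series(np.ones(num_rows))  # uniform sample weights
W_test = pd.Series(np.ones(500))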

3. Configuring parameters

This step covers parameter configuration, feature naming, and categorical feature settings.

  • LightGBM can handle categorical features directly, so one-hot encoding is not required (a pandas category-dtype sketch follows this section's code).
# parameter configuration
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names: feature_0, feature_1, ...
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # validation set(s)
                feature_name=feature_name,  # feature names
                categorical_feature=[21]  # treat feature 21 as categorical
                )

print('Finished first 10 rounds...')
# check the feature names
print('7th feature name is:', lgb_train.feature_name[6])
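
As an alternative to passing categorical_feature=[21] explicitly, LightGBM also recognizes pandas category columns when categorical_feature is left at its default 'auto'. A minimal sketch of that approach (assuming, as above, that the 22nd feature is the categorical one):

# mark the column as categorical in pandas; the Dataset then picks it up automatically
X_train_cat = X_train.copy()
cat_col = X_train_cat.columns[21]
X_train_cat[cat_col] = X_train_cat[cat_col].astype('category')

lgb_train_cat = lgb.Dataset(X_train_cat, y_train, free_raw_data=False)
gbm_cat = lgb.train(params, lgb_train_cat, num_boost_round=10)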

4. Saving and loading the model (JSON)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
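
The dumped model is a plain Python dict, so it can be inspected directly; the key names below are those produced by dump_model() to the best of my knowledge:

# peek at the structure of the dumped model
print('Number of trees:', len(model_json['tree_info']))
print('Feature names in the dump:', model_json['feature_names'][:5])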

5. Feature importance

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))
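
feature_importance() counts splits by default; gain-based importance is often more informative. A small sketch pairing names with both importance types via pandas:

imp = pd.DataFrame({
    'feature': gbm.feature_name(),
    'split': gbm.feature_importance(importance_type='split'),
    'gain': gbm.feature_importance(importance_type='gain'),
}).sort_values('gain', ascending=False)
print(imp.head(10))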

6. Model prediction

print('Loading model to predict...')
# load the model from file
bst = lgb.Booster(model_file='model.txt')
# make predictions
y_pred = bst.predict(X_test)
# evaluate with RMSE
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

7. Saving and loading the model (pickle)

print('Dumping and loading model with pickle...')
# save the model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load the model with pickle
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# predict using only the first 7 iterations
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# evaluate with RMSE
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

8. Training (resuming from a saved model / learning-rate decay / changing parameters during training)

# continue training from a saved model file
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',  # load the saved model and continue training from it
                valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')

# learning-rate decay
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),  # decay the learning rate each iteration
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')
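
Note that recent LightGBM releases removed the learning_rates argument of lgb.train. If your version rejects it, the same decay schedule can be expressed with the reset_parameter callback instead:

# equivalent learning-rate decay via a callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(
                    learning_rate=lambda iter: 0.05 * (0.99 ** iter))])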

# change parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')

9. Custom objective function / evaluation metric

# custom objective function: binary log loss written by hand.
# preds are raw scores, so the sigmoid is applied first; grad and hess are
# the first and second derivatives of the loss w.r.t. the raw score.
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
# custom evaluation metric: binary error rate
# returns (name, value, is_higher_better); False because a lower error is better
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,  # custom objective function
                feval=binary_error,  # custom evaluation metric
                valid_sets=lgb_eval)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
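
One practical consequence of a custom objective: the booster no longer knows it is a binary classifier, so predict() returns raw scores rather than probabilities and the sigmoid has to be applied manually:

# raw scores -> probabilities when a custom objective was used
raw_pred = gbm.predict(X_test)
prob_pred = 1. / (1. + np.exp(-raw_pred))
print('Error rate with custom objective:', np.mean((prob_pred > 0.5) != y_test))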

# custom evaluation metric: accuracy (True because higher is better)
def accuracy(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'accuracy', np.mean(labels == (preds > 0.5)), True

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=[binary_error, accuracy],  # multiple evaluation metrics
                valid_sets=lgb_eval)

print('Finished 50 - 60 rounds with self-defined objective function '
      'and multiple self-defined eval metrics...')
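
As with learning_rates, recent LightGBM releases removed the fobj argument from lgb.train; there the custom objective is passed through params instead, while feval remains a train() argument. A version-dependent sketch (treat the exact form as an assumption for your install):

# newer-API variant: the objective callable goes into params
params_custom = dict(params, objective=loglikelihood)
gbm = lgb.train(params_custom,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                feval=[binary_error, accuracy],
                valid_sets=lgb_eval)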

10. Callback functions

print('Starting a new training job...')
# a custom callback: at the 5th iteration it registers an additional validation set
def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
    callback.before_iteration = True  # run before each boosting iteration
    callback.order = 0
    return callback

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')
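
Besides hand-written callbacks like the one above, LightGBM ships several built-in ones. A short sketch combining early stopping, periodic logging, and recording of evaluation results (available in recent versions):

evals_result = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5),
                           lgb.log_evaluation(period=10),
                           lgb.record_evaluation(evals_result)])
print('Best iteration:', gbm.best_iteration)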