Implementing LightGBM in Python (Advanced)

1. Module preparation

import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

try:
    import cPickle as pickle
except ImportError:
    import pickle

2. Loading and building the dataset

print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('../binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('../binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('../binary_classification/binary.test.weight', header=None)[0]

y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

num_train, num_feature = X_train.shape

# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)
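
The binary.train / binary.test files are the example data shipped with the LightGBM repository. If they are not available locally, a purely synthetic stand-in of roughly the same shape (hypothetical data, only so that the remaining snippets run) can be substituted for the read_csv calls above:

# hypothetical replacement data, reusing the numpy/pandas imports from step 1
rng = np.random.RandomState(42)
num_rows, num_cols = 7000, 28
X_train = pd.DataFrame(rng.rand(num_rows, num_cols))
y_train = pd.Series(rng.randint(0, 2, num_rows))
X_test = pd.DataFrame(rng.rand(500, num_cols))
y_test = pd.Series(rng.randint(0, 2, 500))
W_train = pd.Series(np.ones(num_rows))  # uniform sample weights
W_test = pd.Series(np.ones(500))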

3. Configuring parameters

This step covers parameter configuration, feature naming, and categorical feature settings.

  • LightGBM can handle categorical features directly, so one-hot encoding is not required (a pandas category-dtype sketch follows this section's code).
# parameter configuration
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# generate feature names: feature_0, feature_1, ...
feature_name = ['feature_' + str(col) for col in range(num_feature)]

print('Starting training...')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # validation set(s)
                feature_name=feature_name,  # feature names
                categorical_feature=[21]  # treat feature 21 as categorical
                )

print('Finished first 10 rounds...')
# check the feature names
print('7th feature name is:', lgb_train.feature_name[6])
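
As an alternative to passing categorical_feature=[21] explicitly, LightGBM also recognizes pandas category columns when categorical_feature is left at its default 'auto'. A minimal sketch of that approach (assuming, as above, that the 22nd feature is the categorical one):

# mark the column as categorical in pandas; the Dataset then picks it up automatically
X_train_cat = X_train.copy()
cat_col = X_train_cat.columns[21]
X_train_cat[cat_col] = X_train_cat[cat_col].astype('category')

lgb_train_cat = lgb.Dataset(X_train_cat, y_train, free_raw_data=False)
gbm_cat = lgb.train(params, lgb_train_cat, num_boost_round=10)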

4. Saving and loading the model (JSON)

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Dumping model to JSON...')
# dump model to JSON (and save to file)
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)
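
The dumped model is a plain Python dict, so it can be inspected directly; the key names below are those produced by dump_model() to the best of my knowledge:

# peek at the structure of the dumped model
print('Number of trees:', len(model_json['tree_info']))
print('Feature names in the dump:', model_json['feature_names'][:5])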

5. Feature importance

# feature names
print('Feature names:', gbm.feature_name())

# feature importances
print('Feature importances:', list(gbm.feature_importance()))
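
feature_importance() counts splits by default; gain-based importance is often more informative. A small sketch pairing names with both importance types via pandas:

imp = pd.DataFrame({
    'feature': gbm.feature_name(),
    'split': gbm.feature_importance(importance_type='split'),
    'gain': gbm.feature_importance(importance_type='gain'),
}).sort_values('gain', ascending=False)
print(imp.head(10))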

6. Model prediction

print('Loading model to predict...')
# load the model from file
bst = lgb.Booster(model_file='model.txt')
# make predictions
y_pred = bst.predict(X_test)
# evaluate with RMSE
print("The rmse of loaded model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

7. Saving and loading the model (pickle)

print('Dumping and loading model with pickle...')
# save the model with pickle
with open('model.pkl', 'wb') as fout:
    pickle.dump(gbm, fout)
# load the model with pickle
with open('model.pkl', 'rb') as fin:
    pkl_bst = pickle.load(fin)
# predict using only the first 7 iterations
y_pred = pkl_bst.predict(X_test, num_iteration=7)
# evaluate with RMSE
print("The rmse of pickled model's prediction is:", mean_squared_error(y_test, y_pred) ** 0.5)

8. Training (resuming from a saved model / learning-rate decay / changing parameters during training)

# continue training from a saved model file
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',  # load the saved model and continue training from it
                valid_sets=lgb_eval)

print('Finished 10 - 20 rounds with model file...')

# learning-rate decay
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),  # decay the learning rate each iteration
                valid_sets=lgb_eval)

print('Finished 20 - 30 rounds with decay learning rates...')
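
Note that recent LightGBM releases removed the learning_rates argument of lgb.train. If your version rejects it, the same decay schedule can be expressed with the reset_parameter callback instead:

# equivalent learning-rate decay via a callback
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(
                    learning_rate=lambda iter: 0.05 * (0.99 ** iter))])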

# change parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finished 30 - 40 rounds with changing bagging_fraction...')

9. Custom objective function / evaluation metric

# custom objective function: binary log loss written by hand.
# preds are raw scores, so the sigmoid is applied first; grad and hess are
# the first and second derivatives of the loss w.r.t. the raw score.
def loglikelihood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
# custom evaluation metric: binary error rate
# returns (name, value, is_higher_better); False because a lower error is better
def binary_error(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'error', np.mean(labels != (preds > 0.5)), False

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,  # custom objective function
                feval=binary_error,  # custom evaluation metric
                valid_sets=lgb_eval)

print('Finished 40 - 50 rounds with self-defined objective function and eval metric...')
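
One practical consequence of a custom objective: the booster no longer knows it is a binary classifier, so predict() returns raw scores rather than probabilities and the sigmoid has to be applied manually:

# raw scores -> probabilities when a custom objective was used
raw_pred = gbm.predict(X_test)
prob_pred = 1. / (1. + np.exp(-raw_pred))
print('Error rate with custom objective:', np.mean((prob_pred > 0.5) != y_test))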

# custom evaluation metric: accuracy (True because higher is better)
def accuracy(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    return 'accuracy', np.mean(labels == (preds > 0.5)), True

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                fobj=loglikelihood,
                feval=[binary_error, accuracy],  # multiple evaluation metrics
                valid_sets=lgb_eval)

print('Finished 50 - 60 rounds with self-defined objective function '
      'and multiple self-defined eval metrics...')
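
As with learning_rates, recent LightGBM releases removed the fobj argument from lgb.train; there the custom objective is passed through params instead, while feval remains a train() argument. A version-dependent sketch (treat the exact form as an assumption for your install):

# newer-API variant: the objective callable goes into params
params_custom = dict(params, objective=loglikelihood)
gbm = lgb.train(params_custom,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                feval=[binary_error, accuracy],
                valid_sets=lgb_eval)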

10. Callback functions

print('Starting a new training job...')
# a custom callback: at the 5th iteration it registers an additional validation set
def reset_metrics():
    def callback(env):
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new_valid')
    callback.before_iteration = True  # run before each boosting iteration
    callback.order = 0
    return callback

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finished first 10 rounds with callback function...')
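
Besides hand-written callbacks like the one above, LightGBM ships several built-in ones. A short sketch combining early stopping, periodic logging, and recording of evaluation results (available in recent versions):

evals_result = {}
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=100,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5),
                           lgb.log_evaluation(period=10),
                           lgb.record_evaluation(evals_result)])
print('Best iteration:', gbm.best_iteration)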