Implementing LightGBM training and model prediction in Python

1. Training a LightGBM model

1.1 Parameter definition

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import datetime
import matplotlib.pyplot as plt

# model features
feature_list = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
# categorical features
category_feature_list = ['feature1', 'feature2']
# model parameters
params = {
    'boosting_type': 'gbdt',
    #'boosting': 'dart',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    # 'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 10,
    'max_depth': 5,
    'max_bin': 10,
    'min_data_in_leaf': 8,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0,
    'lambda_l2': 75,
    'min_split_gain': 0,
    'boost_from_average': False,
    'is_unbalance': True,
    # 'num_trees': 1,  # caution: aliases num_iterations and overrides num_boost_round
    'verbose': 0
}
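As a quick sanity check on these settings, `lgb.cv` can estimate out-of-fold performance before committing to a full training run. A minimal sketch, assuming the `lgb_train` Dataset built in section 1.2 below (the 200-round budget is illustrative):

```python
# cross-validated estimate of the metrics declared in `params`
cv_results = lgb.cv(params, lgb_train, num_boost_round=200,
                    nfold=5, stratified=True, seed=0)
# lgb.cv returns a dict of per-round metric lists; the AUC key is
# 'auc-mean' in older versions and 'valid auc-mean' in LightGBM >= 4.0
auc_key = [k for k in cv_results if k.endswith('auc-mean')][0]
best_round = max(range(len(cv_results[auc_key])), key=lambda i: cv_results[auc_key][i])
print('best mean AUC %.4f at round %d' % (cv_results[auc_key][best_round], best_round + 1))
```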

1.2 Data preprocessing

data = pd.read_csv('data.tsv', sep='\t').round(decimals=4)
# keep X as a DataFrame: categorical_feature below is given by column
# name, which LightGBM can only resolve against DataFrame column labels
X, y = data.loc[:, feature_list], data.loc[:, 'label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0, stratify=y_test)
# convert to LightGBM Dataset objects
lgb_train = lgb.Dataset(data=X_train, label=y_train, categorical_feature=category_feature_list)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train, categorical_feature=category_feature_list)
# without categorical features:
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)
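An alternative the LightGBM docs recommend is to cast the categorical columns to pandas' `category` dtype before splitting, after which LightGBM detects them automatically; a minimal sketch:

```python
# cast categorical columns to the 'category' dtype (before the
# train/test split), so categorical_feature can be left at 'auto'
for col in category_feature_list:
    data[col] = data[col].astype('category')
```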

1.3 Model training

gbm = lgb.train(params,
    train_set=lgb_train,
    valid_sets=lgb_eval,
    # valid_sets=[lgb_train, lgb_eval],
    # categorical_feature=category_feature_list,
    num_boost_round=100,
    early_stopping_rounds=50)  # LightGBM < 4.0 API; see the callback equivalent below
print(gbm.best_iteration)  # best iteration found by early stopping
gbm.save_model('../models/gbm.model')
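Note that the `early_stopping_rounds` keyword was removed from `lgb.train` in LightGBM 4.0; under the newer API the same behaviour is configured through callbacks. The equivalent call:

```python
# LightGBM >= 4.0: early stopping and per-round logging via callbacks
gbm = lgb.train(params,
                train_set=lgb_train,
                valid_sets=[lgb_eval],
                num_boost_round=100,
                callbacks=[lgb.early_stopping(stopping_rounds=50),
                           lgb.log_evaluation(period=10)])
```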

def lgb_test(train_x, train_y, test_x, test_y):
    """Sklearn-API alternative: fit an LGBMClassifier with early stopping."""
    from multiprocessing import cpu_count
    model = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50,
        random_state=None, n_jobs=cpu_count() - 1)
    model.fit(train_x, train_y,
              eval_set=[(train_x, train_y), (test_x, test_y)],
              eval_metric='auc', early_stopping_rounds=100)
    print(model.n_features_)                    # number of features used
    print(model.best_score_['valid_1']['auc'])  # AUC on the held-out set
    return model
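For reference, the helper could be called with the splits from section 1.2:

```python
# example call using the splits created in section 1.2
sk_model = lgb_test(X_train, y_train, X_val, y_val)
```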

1.4 Other topics

1. Adding sample weights

2. Computing WOE (weight of evidence)

3. Visualizing decision trees

Sketches for all three follow below.
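A minimal combined sketch, assuming the `gbm` booster, the splits, and `category_feature_list` from the sections above; the weighting factor and the `compute_woe` helper are illustrative, and the plotting call needs the `graphviz` package installed:

```python
import numpy as np

# 1. sample weights: lgb.Dataset accepts a per-row weight vector,
#    here up-weighting positives by a factor of 3 (illustrative)
weights = np.where(y_train == 1, 3.0, 1.0)
lgb_train_weighted = lgb.Dataset(X_train, y_train, weight=weights,
                                 categorical_feature=category_feature_list)

# 2. WOE per bin of one feature: ln(%good / %bad), with a small
#    epsilon guarding against empty bins; label is assumed binary 0/1
def compute_woe(df, feature, label, eps=1e-6):
    grouped = df.groupby(feature)[label].agg(['sum', 'count'])
    bad = grouped['sum'] + eps                      # positives per bin
    good = grouped['count'] - grouped['sum'] + eps  # negatives per bin
    return np.log((good / good.sum()) / (bad / bad.sum()))

# woe = compute_woe(data, 'feature1', 'label')

# 3. tree visualization: plot one tree of the trained booster
#    (lgb.plot_tree and lgb.create_tree_digraph both require graphviz)
lgb.plot_tree(gbm, tree_index=0, figsize=(20, 8))
plt.show()
```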

2. Scoring the test set

import pandas as pd
import lightgbm as lgb
import sys

THRESHOLD = 0.28  # score cutoff threshold
data_input_path = sys.argv[1]   # input data path
data_output_path = sys.argv[2]  # output data path
model_file_path = sys.argv[3]   # model file path

# 1. load the test data
test_data = pd.read_csv(data_input_path, sep='\t')
# 2. load the trained model
lgb_model = lgb.Booster(model_file=model_file_path)
# 3. select the feature columns
feature_names = [item for item in test_data.columns if 'open_' in item] + ['up_cnt']
# 4. predict
test_data['score'] = lgb_model.predict(test_data.loc[:, feature_names])
# y_preds = lgb_model.predict(test_data.loc[:, feature_names], num_iteration=gbm.best_iteration)
# y_test = y_test.astype(np.float64)
# print('log_loss', metrics.log_loss(y_test, y_preds))
# 5. columns to keep after scoring
keep_cols = ['phone', 'score', 'if_333']
# 6. keep only samples at or above the cutoff
result = test_data.loc[test_data['score'] >= THRESHOLD, keep_cols]
# 7. shuffle the samples
result = result.sample(frac=1, random_state=1024)
# 8. filter on a flag column
result = result[result['if_333'] == 0]
# print(result.score.min())
# 9. save the scores to a tab-separated file (index=False drops the index, header=False the header row)
result.to_csv(data_output_path, index=False, header=False, sep='\t', float_format='%.4f')
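When the input file also carries labels, the commented-out lines above can be fleshed out into a small evaluation step. A sketch, assuming a `label` column (which this script's input may not have):

```python
import numpy as np
from sklearn import metrics

# optional evaluation, only meaningful if the test file is labelled
if 'label' in test_data.columns:
    y_true = test_data['label'].astype(np.float64)
    print('log_loss:', metrics.log_loss(y_true, test_data['score']))
    print('auc:', metrics.roc_auc_score(y_true, test_data['score']))
```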

Below is example code for StratifiedKFold cross-validation of a LightGBM model in Python:

```python
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# assume the feature matrix is X and the labels are y

# model parameters
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 2021
}

# stratified k-fold cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)

# out-of-fold predictions
oof_preds = np.zeros(X.shape[0])

for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
    print("Fold", fold + 1)
    X_train, X_valid = X[train_idx], X[valid_idx]
    y_train, y_valid = y[train_idx], y[valid_idx]

    # build the training and validation Datasets
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid)

    # train the model
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_valid],
                      num_boost_round=10000,
                      early_stopping_rounds=100,
                      verbose_eval=100)

    # predict on the validation fold
    valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
    oof_preds[valid_idx] = valid_preds
    print("-" * 50)

# overall cross-validated result
print("Overall AUC:", roc_auc_score(y, oof_preds))
```

In this example, LightGBM is the model and sklearn's StratifiedKFold provides the cross-validation splits; the parameters shown can be adjusted for the task at hand.
