A demo of cross-validation with LightGBM

1. Using LightGBM for regression

import pandas as pd
import numpy as np

#final_data = new_boston
#labels = boston.target

# In practice the dataset is usually a DataFrame. In that case the arrays can be
# built with
# final_data = all_data_df[feature_names].values
# labels = all_data_df['label'].values
# where feature_names is the list of feature columns, e.g. ['age', 'salary', 'height']
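Alternatively, to make the demo runnable end-to-end, the arrays can come from a scikit-learn dataset. A minimal sketch, assuming the California housing data as a stand-in (the Boston dataset was removed in scikit-learn 1.2, so the printed shapes will then be (20640, 8) and (20640,) rather than the Boston shapes shown in the output comment below):

from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
final_data = housing.data    # ndarray, shape (20640, 8)
labels = housing.target      # ndarray, shape (20640,)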

print(type(final_data))
print(final_data.shape)

print(type(labels))
print(labels.shape)

# Types and shapes of final_data and labels:
# Output:
# <class 'numpy.ndarray'>
# (506, 13)
# <class 'numpy.ndarray'>
# (506,)




import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc  # for freeing memory
params = {'num_leaves': 100,  # strongly affects accuracy; larger is usually better, but too large overfits
          'min_data_in_leaf': 100,
          'objective': 'regression',  # objective (loss) function
          'max_depth': -1,
          'learning_rate': 0.05,
          'min_sum_hessian_in_leaf': 6,
          'boosting': 'gbdt',
          'feature_fraction': 0.8,  # fraction of features sampled per tree
          'bagging_freq': 1,
          'bagging_fraction': 0.85,
          'bagging_seed': 11,
          'lambda_l1': 0.01,   # L1 regularization
          'lambda_l2': 0.001,  # L2 regularization
          'verbosity': -1,
          'nthread': -1,  # number of threads; -1 uses all available cores
          'metric': {'mse'},  # evaluation metric ('binary_error', 'binary_logloss', etc. for classification)
          'random_state': 2021,  # random seed, so repeated runs give identical results
          # 'device': 'gpu',  # speeds up training if the GPU build of LightGBM is installed
          }
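As an aside, LightGBM also ships a scikit-learn-style wrapper whose constructor exposes the same knobs under different names (min_data_in_leaf -> min_child_samples, feature_fraction -> colsample_bytree, bagging_fraction -> subsample, and so on). A rough sketch of the equivalent configuration, offered as an assumption rather than a drop-in replacement for the code below:

model = lgb.LGBMRegressor(num_leaves=100,
                          min_child_samples=100,   # min_data_in_leaf
                          min_child_weight=6,      # min_sum_hessian_in_leaf
                          learning_rate=0.05,
                          colsample_bytree=0.8,    # feature_fraction
                          subsample=0.85,          # bagging_fraction
                          subsample_freq=1,        # bagging_freq
                          reg_alpha=0.01,          # lambda_l1
                          reg_lambda=0.001,        # lambda_l2
                          n_estimators=1500,
                          random_state=2021,
                          n_jobs=-1)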

train_x_flat = final_data
train_y = labels
#features = final_data.columns
print(train_x_flat.shape, train_y.shape)

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x_flat.shape[0], ))
# test_pred_prob = np.zeros((test.shape[0], ))


## train and predict
num_round = 1500
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x_flat)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x_flat[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x_flat[val_idx], label=train_y[val_idx])


    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    # verbose_eval / early_stopping_rounds were removed in LightGBM 4.0;
                    # the callback equivalents below also work on 3.3+
                    callbacks=[lgb.log_evaluation(period=30),
                               lgb.early_stopping(stopping_rounds=100)])
    prob_oof[val_idx] = clf.predict(train_x_flat[val_idx], num_iteration=clf.best_iteration)

#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["Feature"] = features
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
from sklearn import metrics
print(metrics.mean_squared_error(train_y, prob_oof))
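
matplotlib and seaborn are imported above but never used; one natural use is a quick out-of-fold diagnostic. A minimal sketch (RMSE plus a predicted-vs-actual scatter; the styling choices are assumptions):

rmse = np.sqrt(metrics.mean_squared_error(train_y, prob_oof))
print('OOF RMSE: {:.4f}'.format(rmse))

plt.figure(figsize=(6, 6))
sns.scatterplot(x=train_y, y=prob_oof, s=10)  # OOF prediction vs. actual value
plt.xlabel('actual')
plt.ylabel('OOF prediction')
plt.show()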




2. Using LightGBM for binary classification

The overall framework is the same; only the objective and the metric change (see the two lines below).
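
Concretely, the only two entries of params that differ from the regression demo are:

'objective': 'binary',  # instead of 'regression'
'metric': {'auc'},      # instead of {'mse'}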

import pandas as pd
import numpy as np

# final_data = ...  # feature matrix (ndarray)
# labels = ...      # labels must be binary (0/1); note that boston.target from the
#                   # regression demo is continuous and would not work here
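For a self-contained run with genuinely binary labels, a minimal sketch using scikit-learn's breast-cancer data (an assumption; any dataset with 0/1 labels works, and the shapes will be (569, 30) and (569,) rather than those shown in the output comment below):

from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
final_data = cancer.data    # ndarray, shape (569, 30)
labels = cancer.target      # ndarray of 0/1 labels, shape (569,)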

print(type(final_data))
print(final_data.shape)

print(type(labels))
print(labels.shape)

# Types and shapes of final_data and labels:
# Output (copied from the regression demo; actual shapes depend on your dataset):
# <class 'numpy.ndarray'>
# (506, 13)
# <class 'numpy.ndarray'>
# (506,)




import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import gc  # for freeing memory
params = {'num_leaves': 100,  # strongly affects accuracy; larger is usually better, but too large overfits
          'min_data_in_leaf': 100,
          'objective': 'binary',  # objective (loss) function
          'max_depth': -1,
          'learning_rate': 0.05,
          'min_sum_hessian_in_leaf': 6,
          'boosting': 'gbdt',
          'feature_fraction': 0.8,  # fraction of features sampled per tree
          'bagging_freq': 1,
          'bagging_fraction': 0.85,
          'bagging_seed': 11,
          'lambda_l1': 0.01,   # L1 regularization
          'lambda_l2': 0.001,  # L2 regularization
          'verbosity': -1,
          'nthread': -1,  # number of threads; -1 uses all available cores
          'metric': {'auc'},  # evaluation metric ('binary_error', 'binary_logloss', etc. also work)
          'random_state': 2021,  # random seed, so repeated runs give identical results
          # 'device': 'gpu',  # speeds up training if the GPU build of LightGBM is installed
          }

train_x_flat = final_data
train_y = labels
#features = final_data.columns
print(train_x_flat.shape, train_y.shape)

folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x_flat.shape[0], ))
# test_pred_prob = np.zeros((test.shape[0], ))


## train and predict
num_round = 1500
feature_importance_df = pd.DataFrame()
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x_flat)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x_flat[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x_flat[val_idx], label=train_y[val_idx])


    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    # verbose_eval / early_stopping_rounds were removed in LightGBM 4.0;
                    # the callback equivalents below also work on 3.3+
                    callbacks=[lgb.log_evaluation(period=30),
                               lgb.early_stopping(stopping_rounds=100)])
    prob_oof[val_idx] = clf.predict(train_x_flat[val_idx], num_iteration=clf.best_iteration)

#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["Feature"] = features
#     fold_importance_df["importance"] = clf.feature_importance()
#     fold_importance_df["fold"] = fold_ + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
from sklearn import metrics
print(metrics.roc_auc_score(train_y, prob_oof))
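
prob_oof holds predicted probabilities, so AUC can be computed directly. To report label-based metrics as well, threshold the probabilities; the 0.5 cutoff below is an assumption and can be tuned:

pred_labels = (prob_oof > 0.5).astype(int)
print(metrics.accuracy_score(train_y, pred_labels))
print(metrics.f1_score(train_y, pred_labels))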
