Model Tuning

Modeling and Parameter Tuning

Import the relevant libraries and settings

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

import os
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

Read the data

The reduce_mem_usage function reduces the memory a DataFrame occupies by downcasting each column's dtype.

def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
# Read the data
data = pd.read_csv('data/train.csv')
# Simple preprocessing: expand the comma-separated signal string into float columns
data_list = []
for items in data.values:
    data_list.append([items[0]] + [float(i) for i in items[1].split(',')] + [items[2]])

data = pd.DataFrame(np.array(data_list))
data.columns = ['id'] + ['s_'+str(i) for i in range(len(data_list[0])-2)] + ['label']

data = reduce_mem_usage(data)
Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%

Simple Modeling

Given how tree-based models work, outlier and missing-value handling can be skipped; that said, readers who know the business well can handle missing and abnormal values themselves, which may well outperform leaving them to the model.
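
As a minimal sketch of the missing-value point, on hypothetical toy data (not this competition's dataset): LightGBM trains directly on inputs containing NaNs, routing missing values down its trees natively (use_missing is on by default).

import numpy as np
import lightgbm as lgb

# Hypothetical toy data with NaNs injected into one feature
X_toy = np.random.rand(100, 5)
X_toy[::7, 2] = np.nan
y_toy = np.random.randint(0, 4, 100)  # 4 classes, as in this task

# No imputation step: LightGBM handles the NaNs internally
lgb.LGBMClassifier(n_estimators=10).fit(X_toy, y_toy)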

Note: the dataset used for the modeling below has no engineered features; the raw features are used directly, as the main task of this section is modeling and parameter tuning.

Setup before modeling

from sklearn.model_selection import KFold
# Separate features and labels for cross-validation
X_train = data.drop(['id','label'], axis=1)
y_train = data['label']

# 5-fold cross-validation
folds = 5
seed = 2021
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

Because tree models have no built-in f1-score metric, we define a custom evaluation function that reports the validation f1-score as the model iterates.

def f1_score_vali(preds, data_vali):
    labels = data_vali.get_label()
    # Multiclass raw scores arrive as one flat, class-major array,
    # so reshape to (num_class, num_samples) before taking the argmax
    preds = np.argmax(preds.reshape(4, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=preds, average='macro')
    # (name, value, is_higher_better), as LightGBM's feval expects
    return 'f1_score', score_vali, True
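
A quick sanity check of the reshape above, assuming (as the older LightGBM API used here does) that multiclass raw scores reach feval class-major: all class-0 scores first, then class-1, and so on.

import numpy as np

# 3 samples x 4 classes, flattened class-major
flat_preds = np.array([0.9, 0.1, 0.2,    # class 0 scores for samples 0..2
                       0.05, 0.7, 0.1,   # class 1
                       0.03, 0.1, 0.6,   # class 2
                       0.02, 0.1, 0.1])  # class 3
print(np.argmax(flat_preds.reshape(4, -1), axis=0))  # -> [0 1 2]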

Modeling with LightGBM

"""对训练集数据进行划分,分成训练集和验证集,并进行相应的操作"""
from sklearn.model_selection import train_test_split
import lightgbm as lgb
# Train/validation split
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2)
train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
valid_matrix = lgb.Dataset(X_val, label=y_val)

params = {
    "learning_rate": 0.1,
    "boosting": 'gbdt',  
    "lambda_l2": 0.1,
    "max_depth": -1,
    "num_leaves": 128,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "metric": None,
    "objective": "multiclass",
    "num_class": 4,
    "nthread": 10,
    "verbose": -1,
}

"""使用训练集数据进行模型训练"""
model = lgb.train(params, 
                  train_set=train_matrix, 
                  valid_sets=valid_matrix, 
                  num_boost_round=2000, 
                  verbose_eval=50, 
                  early_stopping_rounds=200,
                  feval=f1_score_vali)
Training until validation scores don't improve for 200 rounds
[50]	valid_0's multi_logloss: 0.0535465	valid_0's f1_score: 0.953675
[100]	valid_0's multi_logloss: 0.0484882	valid_0's f1_score: 0.961373
[150]	valid_0's multi_logloss: 0.0507799	valid_0's f1_score: 0.962653
[200]	valid_0's multi_logloss: 0.0531035	valid_0's f1_score: 0.963224
[250]	valid_0's multi_logloss: 0.0547945	valid_0's f1_score: 0.963721
Early stopping, best iteration is:
[88]	valid_0's multi_logloss: 0.0482441	valid_0's f1_score: 0.959676

Predict on the validation set

val_pre_lgb = model.predict(X_val, num_iteration=model.best_iteration)
preds = np.argmax(val_pre_lgb, axis=1)
score = f1_score(y_true=y_val, y_pred=preds, average='macro')
print('f1 of the untuned LightGBM model on the validation set: {}'.format(score))
f1 of the untuned LightGBM model on the validation set: 0.9596756568138634

Going a step further, evaluate model performance with 5-fold cross-validation:

"""使用lightgbm 5折交叉验证进行建模预测"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
                "learning_rate": 0.1,
                "boosting": 'gbdt',  
                "lambda_l2": 0.1,
                "max_depth": -1,
                "num_leaves": 128,
                "bagging_fraction": 0.8,
                "feature_fraction": 0.8,
                "metric": None,
                "objective": "multiclass",
                "num_class": 4,
                "nthread": 10,
                "verbose": -1,
            }
    
    model = lgb.train(params, 
                      train_set=train_matrix, 
                      valid_sets=valid_matrix, 
                      num_boost_round=2000, 
                      verbose_eval=100, 
                      early_stopping_rounds=200,
                      feval=f1_score_vali)
    
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    
    val_pred = np.argmax(val_pred, axis=1)
    cv_scores.append(f1_score(y_true=y_val, y_pred=val_pred, average='macro'))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))
...
lgb_score_list:[0.9674515729721614, 0.9656700872844327, 0.9700043639844769, 0.9655663272378014, 0.9631137190307674]
lgb_score_mean:0.9663612141019279
lgb_score_std:0.0022854824074775683

Model Tuning

  • 1. Greedy tuning

    First tune the parameter that currently affects the model the most, optimizing the model under that parameter; then tune the next most influential parameter, and so on, until every parameter has been adjusted.

    The drawback of this method is that it may land on a local rather than the global optimum, but it only requires optimizing parameters one step at a time and is easy to understand.

    What matters is the order in which tree-model parameters are tuned, i.e. how strongly each parameter affects the model. Here is a commonly used set of parameters and tuning order:

    • ①: max_depth, num_leaves
    • ②: min_data_in_leaf, min_child_weight
    • ③: bagging_fraction, feature_fraction, bagging_freq
    • ④: reg_lambda, reg_alpha
    • ⑤: min_split_gain
    from sklearn.model_selection import cross_val_score
    from lightgbm import LGBMClassifier
    
    # Candidate values to search for each parameter (illustrative ranges)
    objective = ['multiclass']
    num_leaves = [31, 63, 127]
    max_depth = [-1, 3, 5, 7]
    
    # Tune objective first
    best_obj = dict()
    for obj in objective:
        model = LGBMClassifier(objective=obj)
        """Cross-validate and record the macro f1 score"""
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean()
        best_obj[obj] = score
    
    # num_leaves, with objective fixed at its best value so far
    best_leaves = dict()
    for leaves in num_leaves:
        model = LGBMClassifier(objective=max(best_obj.items(), key=lambda x: x[1])[0], num_leaves=leaves)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean()
        best_leaves[leaves] = score
    
    # max_depth, with the previous two parameters fixed
    best_depth = dict()
    for depth in max_depth:
        model = LGBMClassifier(objective=max(best_obj.items(), key=lambda x: x[1])[0],
                               num_leaves=max(best_leaves.items(), key=lambda x: x[1])[0],
                               max_depth=depth)
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro').mean()
        best_depth[depth] = score
    

    Each of the model's parameters can be tuned in turn this way, visualizing the model's score under each best parameter found.

  • 2. Grid search

    sklearn provides GridSearchCV for grid search: pass in the model's parameter grid and it returns the best result and parameters. Grid search tends to beat greedy tuning, but it only suits small datasets; once the data volume grows, it becomes very hard to get results.

    Again using the LightGBM algorithm as the example, tune with grid search:

    """通过网格搜索确定最优参数"""
    from sklearn.model_selection import GridSearchCV
    
    def get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=31, max_depth=-1, bagging_fraction=1.0, 
                           feature_fraction=1.0, bagging_freq=0, min_data_in_leaf=20, min_child_weight=0.001, 
                           min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=None):
        # Set up 5-fold cross-validation
        cv_fold = KFold(n_splits=5, shuffle=True, random_state=2021)
    
        model_lgb = lgb.LGBMClassifier(learning_rate=learning_rate,
                                       n_estimators=n_estimators,
                                       num_leaves=num_leaves,
                                       max_depth=max_depth,
                                       bagging_fraction=bagging_fraction,
                                       feature_fraction=feature_fraction,
                                       bagging_freq=bagging_freq,
                                       min_data_in_leaf=min_data_in_leaf,
                                       min_child_weight=min_child_weight,
                                       min_split_gain=min_split_gain,
                                       reg_lambda=reg_lambda,
                                       reg_alpha=reg_alpha,
                                       n_jobs= 8
                                      )
    
        f1 = make_scorer(f1_score, average='micro')
        grid_search = GridSearchCV(estimator=model_lgb, 
                                   cv=cv_fold,
                                   param_grid=param_grid,
                                   scoring=f1
    
                                  )
        grid_search.fit(X_train, y_train)
    
        print('Current best parameters: {}'.format(grid_search.best_params_))
        print('Current best score: {}'.format(grid_search.best_score_))
    
    """以下代码未运行,耗时较长,请谨慎运行,且每一步的最优参数需要在下一步进行手动更新,请注意"""
    
    """
    需要注意一下的是,除了获取上面的获取num_boost_round时候用的是原生的lightgbm(因为要用自带的cv)
    下面配合GridSearchCV时必须使用sklearn接口的lightgbm。
    """
    """设置n_estimators 为581,调整num_leaves和max_depth,这里选择先粗调再细调"""
    lgb_params = {'num_leaves': range(10, 80, 5), 'max_depth': range(3,10,2)}
    get_best_cv_params(learning_rate=0.1, n_estimators=581, num_leaves=None, max_depth=None, min_data_in_leaf=20, 
                       min_child_weight=0.001,bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0, 
                       min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
    
    """num_leaves为30,max_depth为7,进一步细调num_leaves和max_depth"""
    lgb_params = {'num_leaves': range(25, 35, 1), 'max_depth': range(5,9,1)}
    get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=None, max_depth=None, min_data_in_leaf=20, 
                       min_child_weight=0.001,bagging_fraction=1.0, feature_fraction=1.0, bagging_freq=0, 
                       min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
    
    """
    确定min_data_in_leaf为45,min_child_weight为0.001 ,下面进行bagging_fraction、feature_fraction和bagging_freq的调参
    """
    lgb_params = {'bagging_fraction': [i/10 for i in range(5,10,1)], 
                  'feature_fraction': [i/10 for i in range(5,10,1)],
                  'bagging_freq': range(0,81,10)
                 }
    get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45, 
                       min_child_weight=0.001,bagging_fraction=None, feature_fraction=None, bagging_freq=None, 
                       min_split_gain=0, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
    
    """
    确定bagging_fraction为0.4、feature_fraction为0.6、bagging_freq为 ,下面进行reg_lambda、reg_alpha的调参
    """
    lgb_params = {'reg_lambda': [0,0.001,0.01,0.03,0.08,0.3,0.5], 'reg_alpha': [0,0.001,0.01,0.03,0.08,0.3,0.5]}
    get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45, 
                       min_child_weight=0.001,bagging_fraction=0.9, feature_fraction=0.9, bagging_freq=40, 
                       min_split_gain=0, reg_lambda=None, reg_alpha=None, param_grid=lgb_params)
    
    """
    确定reg_lambda、reg_alpha都为0,下面进行min_split_gain的调参
    """
    lgb_params = {'min_split_gain': [i/10 for i in range(0,11,1)]}
    get_best_cv_params(learning_rate=0.1, n_estimators=85, num_leaves=29, max_depth=7, min_data_in_leaf=45, 
                       min_child_weight=0.001,bagging_fraction=0.9, feature_fraction=0.9, bagging_freq=40, 
                       min_split_gain=None, reg_lambda=0, reg_alpha=0, param_grid=lgb_params)
    
    """
    参数确定好了以后,我们设置一个比较小的learning_rate 0.005,来确定最终的num_boost_round
    """
    # 设置5折交叉验证
    # cv_fold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True, )
    final_params = {
                    'boosting_type': 'gbdt',
                    'learning_rate': 0.01,
                    'num_leaves': 29,
                    'max_depth': 7,
                    'objective': 'multiclass',
                    'num_class': 4,
                    'min_data_in_leaf':45,
                    'min_child_weight':0.001,
                    'bagging_fraction': 0.9,
                    'feature_fraction': 0.9,
                    'bagging_freq': 40,
                    'min_split_gain': 0,
                    'reg_lambda':0,
                    'reg_alpha':0,
                    'nthread': 6
                   }
    
    lgb_train = lgb.Dataset(X_train, label=y_train)  # cv runs on the full training data
    cv_result = lgb.cv(train_set=lgb_train,
                       early_stopping_rounds=20,
                       num_boost_round=5000,
                       nfold=5,
                       stratified=True,
                       shuffle=True,
                       params=final_params,
                       feval=f1_score_vali,
                       seed=0,
                      )
    

    In practice, first set a relatively large learning rate (0.1 in the example above) and determine the number of trees with LightGBM's native cv function, then tune the remaining parameters with code like the example above.

    Finally, fix the best parameters, set a smaller learning rate (e.g. 0.05), and use the cv function again to determine the number of trees and settle the final parameters.

    Note that on large datasets, each layer of parameter tuning above can take quite a long time.

  • 3. Bayesian optimization

    Before using it, install the bayesian-optimization package:

    pip install bayesian-optimization
    

    The main idea of Bayesian optimization is this: given an objective function to optimize (a function in the broad sense: only its inputs and outputs need to be specified, with no knowledge of its internal structure or mathematical properties), keep adding sample points to update the posterior distribution of the objective (a Gaussian process) until the posterior essentially matches the true distribution. Put simply, it uses the information from previous trials to choose the next parameters better.
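
    As a minimal illustration of the black-box idea, here is a toy objective (hypothetical, not part of the tuning pipeline) optimized with bayes_opt:

    from bayes_opt import BayesianOptimization

    # A toy black box: the optimizer only sees inputs and the returned score
    def black_box(x, y):
        return -x ** 2 - (y - 1) ** 2 + 1  # true maximum of 1 at (0, 1)

    toy_opt = BayesianOptimization(f=black_box, pbounds={'x': (-2, 2), 'y': (-3, 3)}, random_state=1)
    toy_opt.maximize(init_points=2, n_iter=5)  # each new point is chosen using the posterior
    print(toy_opt.max)  # best point found so far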

    The steps of Bayesian optimization are as follows:

    • Define the function to optimize (rf_cv)
    • Build the model
    • Define the parameters to optimize
    • Obtain the optimization result and return the score metric being optimized
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import make_scorer
    
    """Define the function to optimize"""
    def rf_cv_lgb(num_leaves, max_depth, bagging_fraction, feature_fraction, bagging_freq, min_data_in_leaf, 
                  min_child_weight, min_split_gain, reg_lambda, reg_alpha):
        # Build the model
        model_lgb = lgb.LGBMClassifier(boosting_type='gbdt', objective='multiclass', num_class=4,
                                       learning_rate=0.1, n_estimators=5000,
                                       num_leaves=int(num_leaves), max_depth=int(max_depth), 
                                       bagging_fraction=round(bagging_fraction, 2), feature_fraction=round(feature_fraction, 2),
                                       bagging_freq=int(bagging_freq), min_data_in_leaf=int(min_data_in_leaf),
                                       min_child_weight=min_child_weight, min_split_gain=min_split_gain,
                                       reg_lambda=reg_lambda, reg_alpha=reg_alpha,
                                       n_jobs= 8
                                      )
        f1 = make_scorer(f1_score, average='micro')
        val = cross_val_score(model_lgb, X_train_split, y_train_split, cv=5, scoring=f1).mean()
    
        return val
    
    from bayes_opt import BayesianOptimization
    """定义优化参数"""
    bayes_lgb = BayesianOptimization(
        rf_cv_lgb, 
        {
            'num_leaves':(10, 200),
            'max_depth':(3, 20),
            'bagging_fraction':(0.5, 1.0),
            'feature_fraction':(0.5, 1.0),
            'bagging_freq':(0, 100),
            'min_data_in_leaf':(10,100),
            'min_child_weight':(0, 10),
            'min_split_gain':(0.0, 1.0),
            'reg_alpha':(0.0, 10),
            'reg_lambda':(0.0, 10),
        }
    )
    
    """开始优化"""
    bayes_lgb.maximize(n_iter=10)
    
    |   iter    |  target   | baggin... | baggin... | featur... | max_depth | min_ch... | min_da... | min_sp... | num_le... | reg_alpha | reg_la... |
    |  1        |  0.9785   |  0.5174   |  10.78    |  0.8746   |  10.15    |  4.288    |  48.97    |  0.2337   |  42.83    |  6.551    |  9.015    |
    |  2        |  0.9778   |  0.6777   |  41.77    |  0.5291   |  12.15    |  4.16     |  26.39    |  0.2461   |  55.78    |  6.528    |  0.6003   |
    |  3        |  0.9745   |  0.5825   |  68.77    |  0.5932   |  8.36     |  9.296    |  77.74    |  0.7946   |  79.12    |  3.045    |  5.593    |
    |  4        |  0.9802   |  0.9669   |  78.34    |  0.77     |  19.68    |  9.886    |  66.34    |  0.255    |  161.1    |  4.727    |  8.18     |
    |  5        |  0.9836   |  0.9897   |  51.9     |  0.9737   |  16.82    |  2.001    |  42.1     |  0.03563  |  134.2    |  3.437    |  1.368    |
    |  6        |  0.9749   |  0.5575   |  46.2     |  0.6518   |  15.9     |  7.817    |  34.12    |  0.341    |  153.2    |  7.144    |  7.899    |
    |  7        |  0.9793   |  0.9644   |  55.08    |  0.9795   |  18.5     |  2.085    |  41.22    |  0.7031   |  129.9    |  3.369    |  2.717    |
    |  8        |  0.9819   |  0.5926   |  58.23    |  0.6149   |  16.81    |  2.911    |  39.91    |  0.1699   |  137.3    |  2.685    |  2.891    |
    |  9        |  0.983    |  0.7796   |  50.38    |  0.7261   |  17.87    |  3.499    |  37.59    |  0.1404   |  136.1    |  2.442    |  6.621    |
    |  10       |  0.9843   |  0.638    |  49.32    |  0.9282   |  11.33    |  6.504    |  43.21    |  0.288    |  137.7    |  0.2083   |  6.966    |
    |  11       |  0.9798   |  0.8196   |  47.05    |  0.5845   |  9.075    |  2.965    |  46.16    |  0.3984   |  131.6    |  3.634    |  2.601    |
    |  12       |  0.9726   |  0.7688   |  37.57    |  0.9811   |  10.26    |  1.239    |  17.54    |  0.9651   |  46.5     |  8.834    |  6.276    |
    |  13       |  0.9836   |  0.5214   |  48.3     |  0.8203   |  19.13    |  3.129    |  35.47    |  0.08455  |  138.2    |  2.345    |  9.691    |
    |  14       |  0.9738   |  0.5617   |  45.75    |  0.8648   |  18.88    |  4.383    |  46.88    |  0.9315   |  141.8    |  4.968    |  5.563    |
    |  15       |  0.9807   |  0.8046   |  47.05    |  0.6449   |  12.38    |  0.3744   |  41.13    |  0.6808   |  138.7    |  0.8521   |  9.461    |
    =================================================================================================================================================
    
    """显示优化结果"""
    bayes_lgb.max
    
    {'target': 0.9842625,
     'params': {'bagging_fraction': 0.6379596054685973,
      'bagging_freq': 49.319589248277715,
      'feature_fraction': 0.9282486828608231,
      'max_depth': 11.32826513626976,
      'min_child_weight': 6.5044214037514845,
      'min_data_in_leaf': 43.211716584925405,
      'min_split_gain': 0.28802399981965143,
      'num_leaves': 137.7332804262704,
      'reg_alpha': 0.2082701560002398,
      'reg_lambda': 6.966270735649479}}
    

    With the parameters optimized, we can build a new model from them, lower the learning rate, and search for the best number of boosting iterations.

    """调整一个较小的学习率,并通过cv函数确定当前最优的迭代次数"""
    base_params_lgb = {
                        'boosting_type': 'gbdt',
                        'objective': 'multiclass',
                        'num_class': 4,
                        'learning_rate': 0.01,
                        'num_leaves': 138,
                        'max_depth': 11,
                        'min_data_in_leaf': 43,
                        'min_child_weight':6.5,
                        'bagging_fraction': 0.64,
                        'feature_fraction': 0.93,
                        'bagging_freq': 49,
                        'reg_lambda': 7,
                        'reg_alpha': 0.21,
                        'min_split_gain': 0.288,
                        'nthread': 10,
                        'verbose': -1,
    }
    
    cv_result_lgb = lgb.cv(
        train_set=train_matrix,
        early_stopping_rounds=1000, 
        num_boost_round=20000,
        nfold=5,
        stratified=True,
        shuffle=True,
        params=base_params_lgb,
        feval=f1_score_vali,
        seed=0
    )
    print('Number of iterations: {}'.format(len(cv_result_lgb['f1_score-mean'])))
    print('Final model f1: {}'.format(max(cv_result_lgb['f1_score-mean'])))
    
    Number of iterations: 4833
    Final model f1: 0.961641452120875
    

    With the model parameters settled, build the final model and validate it on the validation folds:

    import lightgbm as lgb
    """使用lightgbm 5折交叉验证进行建模预测"""
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(X_train, y_train)):
        print('************************************ {} ************************************'.format(str(i+1)))
        X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    
        train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
        valid_matrix = lgb.Dataset(X_val, label=y_val)
    
        params = {
                    'boosting_type': 'gbdt',
                    'objective': 'multiclass',
                    'num_class': 4,
                    'learning_rate': 0.01,
                    'num_leaves': 138,
                    'max_depth': 11,
                    'min_data_in_leaf': 43,
                    'min_child_weight':6.5,
                    'bagging_fraction': 0.64,
                    'feature_fraction': 0.93,
                    'bagging_freq': 49,
                    'reg_lambda': 7,
                    'reg_alpha': 0.21,
                    'min_split_gain': 0.288,
                    'nthread': 10,
                    'verbose': -1,
        }
    
        model = lgb.train(params, train_set=train_matrix, num_boost_round=4833, valid_sets=valid_matrix, 
                          verbose_eval=1000, early_stopping_rounds=200, feval=f1_score_vali)
        val_pred = model.predict(X_val, num_iteration=model.best_iteration)
        val_pred = np.argmax(val_pred, axis=1)
        cv_scores.append(f1_score(y_true=y_val, y_pred=val_pred, average='macro'))
        print(cv_scores)
    
    print("lgb_scotrainre_list:{}".format(cv_scores))
    print("lgb_score_mean:{}".format(np.mean(cv_scores)))
    print("lgb_score_std:{}".format(np.std(cv_scores)))
    
    ...
    lgb_score_list:[0.9615056903324599, 0.9597829114711733, 0.9644760387635415, 0.9622009947666585, 0.9607941521618003]
    lgb_score_mean:0.9617519574991267
    lgb_score_std:0.0015797109890455313
    
    
  • A short summary on model tuning

    • The cv function built into boosting libraries can tune a single parameter fairly quickly; it is usually used first to fix the number of boosting iterations for a tree model

    • When the data volume is large (as in this project), grid search becomes extremely slow and is not recommended

    • Some parameter names differ between the native libraries and their sklearn wrappers; be careful, and refer to the official xgb and lgb API docs (a few common aliases are sketched after this list)

      xgb native API / sklearn-style xgb API

      lgb native API / sklearn-style lgb API
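
    A minimal reference sketch of a few such aliases, assuming current LightGBM naming (check the docs for your version):

    # Native LightGBM parameter name -> sklearn-wrapper (LGBMClassifier) alias.
    # Both spellings are accepted by the native API; the sklearn wrapper
    # exposes the right-hand names as constructor arguments.
    LGB_PARAM_ALIASES = {
        'num_iterations':   'n_estimators',
        'bagging_fraction': 'subsample',
        'feature_fraction': 'colsample_bytree',
        'min_data_in_leaf': 'min_child_samples',
        'lambda_l1':        'reg_alpha',
        'lambda_l2':        'reg_lambda',
    }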
