Machine-Learning-for-Algorithmic-Trading-Second-Edition/ gradient boosting

Author: He Baisheng, Harbin Institute of Technology (Weihai), School of Economics and Management, Quantitative Finance track

The MLAT series documents my in-class coursework in blog form. My assigned notebook is 12, gradient_boosting; this post follows the previous one (create dataset) and applies several boosting methods to that dataset.

Imports and Settings

import sys, os
import warnings
from time import time
from itertools import product
import joblib
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
# needed for HistGradientBoostingClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.inspection import partial_dependence, plot_partial_dependence
from sklearn.metrics import roc_auc_score

Other settings

results_path = Path(r'E:/machine learning for algorithmic trading','results', 'baseline')

warnings.filterwarnings('ignore')
sns.set_style("whitegrid")
idx = pd.IndexSlice
np.random.seed(42)

DATA_STORE = r'E:/machine learning for algorithmic trading/wiki.h5'

Prepare Data

The dataset used here is the one built in the previous post; in the book's GitHub repository it belongs to Chapter 4.

def get_data(start='2000', end='2018', task='classification', holding_period=1, dropna=False):
    
    idx = pd.IndexSlice
    target = f'target_{holding_period}m'
    
    with pd.HDFStore(DATA_STORE) as store:
        df = store['engineered_features']
        

    if start is not None and end is not None:
        df = df.loc[idx[:, start: end], :]
    if dropna:
        df = df.dropna()
        
    y = (df[target]>0).astype(int)
    #y is 1 when the target return is positive and 0 otherwise, i.e. a directional classification target
    
    X = df.drop([c for c in df.columns if c.startswith('target')], axis=1)
    
    return y, X

1. startswith() checks whether a string begins with 'target'; it also accepts beg and end parameters, defaulting to beg=0 and end=len(string).

Factorize Categories

2. factorize encodes sector as integer categories (e.g. construction → 0, manufacturing → 1). pd.factorize also takes a sort argument for sorted category codes; without it, the first label seen gets 0 and each newly seen label gets the next code. The return value is a tuple, as the small example below shows.
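A minimal illustration (hypothetical sector labels, not the project data) of what pd.factorize returns:

codes, uniques = pd.factorize(pd.Series(['construction', 'manufacturing', 'construction']))
# codes   -> array([0, 1, 0])      the first label seen gets 0, the next new label gets 1, ...
# uniques -> Index(['construction', 'manufacturing'], dtype='object')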

cat_cols = ['year', 'month', 'age', 'msize', 'sector']
def factorize_cats(df, cats=['sector']):
    cat_cols = ['year', 'month', 'age', 'msize'] + cats
    for cat in cats:
        df[cat] = pd.factorize(df[cat])[0]
        
    df.loc[:, cat_cols] = df.loc[:, cat_cols].fillna(-1).astype(int)
    return df

One-Hot Encoding

3. One-hot encoding converts categorical variables into a form that machine-learning algorithms can use easily; get_dummies returns a DataFrame of indicator columns (a sparse-style matrix).

Notes (a small sketch follows):

#when using get_dummies, watch out for multicollinearity: if, say, a gender column is one-hot encoded, one resulting column must be dropped, because knowing one column determines the other
#besides one-hot encoding and factorize, the mapping function map() can also be used to encode categories
#when encoding, consider whether the categories carry an ordering: red vs. yellow does not, but age does
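A minimal sketch of the dummy-variable trap, using a hypothetical gender column (not part of this dataset):

demo = pd.DataFrame({'gender': ['male', 'female', 'male']})
pd.get_dummies(demo, columns=['gender'])                   # gender_female and gender_male: knowing one determines the other
pd.get_dummies(demo, columns=['gender'], drop_first=True)  # keeps only gender_male, avoiding perfect collinearity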

def get_one_hot_data(df, cols=cat_cols[:-1]):
    df = pd.get_dummies(df,
                        columns=cols + ['sector'],
                        #the columns argument lists which columns are one-hot encoded
                        prefix=cols + [''],
                        prefix_sep=['_'] * len(cols) + ['']
                        #get_dummies has default prefixes; the empty prefix here keeps the sector dummies from being prefixed with 'sector'
)
    return df.rename(columns={c: c.replace('.0', '') for c in df.columns})

Get Holdout Set

The holdout set is used to estimate generalization error after cross-validation.

def get_holdout_set(target, features, period=6):
    idx = pd.IndexSlice
    label = target.name
    dates = np.sort(target.index.get_level_values('date').unique())  # use the target argument rather than the global y
    cv_start, cv_end = dates[0], dates[-period - 2]
    holdout_start, holdout_end = dates[-period - 1], dates[-1]

    #most of the data is used for cross-validation; the most recent dates are held out as the test set
    
    df = features.join(target.to_frame())
    train = df.loc[idx[:, cv_start: cv_end], :]
    y_train, X_train = train[label], train.drop(label, axis=1)

    test = df.loc[idx[:, holdout_start: holdout_end], :]
    y_test, X_test = test[label], test.drop(label, axis=1)
    return y_train, X_train, y_test, X_test

Load Data

y, features = get_data()
X_dummies = get_one_hot_data(features)
X_factors = factorize_cats(features)

y_clean, features_clean = get_data(dropna=True)
X_dummies_clean = get_one_hot_data(features_clean)
X_factors_clean = factorize_cats(features_clean)
#the *_clean versions drop the rows whose lagged returns are NaN

Cross-Validation Setup

Cross-validation: the logic is simple; 12 splits are used here, each testing on one date with all earlier dates as training data.

class OneStepTimeSeriesSplit:
    """Generates tuples of train_idx, test_idx pairs
    Assumes the index contains a level labeled 'date'"""

    def __init__(self, n_splits=3, test_period_length=1, shuffle=False):
        self.n_splits = n_splits
        self.test_period_length = test_period_length
        self.shuffle = shuffle

    @staticmethod
    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    def split(self, X, y=None, groups=None):
        unique_dates = (X.index
                        .get_level_values('date')
                        .unique()
                        .sort_values(ascending=False)
                        [:self.n_splits*self.test_period_length])

        dates = X.reset_index()[['date']]
        for test_date in self.chunks(unique_dates, self.test_period_length):
            train_idx = dates[dates.date < min(test_date)].index
            test_idx = dates[dates.date.isin(test_date)].index
            if self.shuffle:
                train_idx = np.random.permutation(train_idx)  # randomly permute the training indices
            yield train_idx, test_idx
            
            #yield returns one (train_idx, test_idx) pair per loop iteration

    def get_n_splits(self, X, y, groups=None):
        return self.n_splits

Instantiate the class

cv = OneStepTimeSeriesSplit(n_splits=12, 
                            test_period_length=1, 
                            shuffle=False)

run_time = {}
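A quick sanity check of the splitter on a toy panel (hypothetical tickers and dates, not the project data): each of the most recent dates becomes one test fold, and all strictly earlier dates form the training fold.

toy_dates = pd.date_range('2021-01-31', periods=5, freq='M')
toy_idx = pd.MultiIndex.from_product([['AAA', 'BBB'], toy_dates], names=['ticker', 'date'])
toy_X = pd.DataFrame({'feature': range(10)}, index=toy_idx)

for train_idx, test_idx in OneStepTimeSeriesSplit(n_splits=2).split(toy_X):
    # the first fold tests the latest date, the second fold the one before it;
    # training rows always come from earlier dates only
    print(toy_X.iloc[test_idx].index.get_level_values('date').unique())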

CV Metrics

Evaluation metrics for cross-validation

metrics = {'balanced_accuracy': 'Accuracy' ,
           'roc_auc': 'AUC',
           'neg_log_loss': 'Log Loss',
           'f1_weighted': 'F1',
           'precision_weighted': 'Precision',
           'recall_weighted': 'Recall'
}

def run_cv(clf, X=X_dummies, y=y, metrics=metrics, cv=cv, fit_params=None, n_jobs=-1):
    start = time()
    #scores is a dict whose keys are the metric names and whose values are arrays of per-fold scores
    scores = cross_validate(estimator=clf,
                            X=X, 
                            y=y,
                            scoring=list(metrics.keys()),
                            cv=cv,
                            return_train_score=True,
                            n_jobs=n_jobs,
                            verbose=1,
                            fit_params=fit_params)
    
    duration = time() - start
    return scores, duration 

CV Result Handler Functions

Result-handling functions: one builds the metrics DataFrame, the other plots it.

4. melt() is the inverse of pivot() and another way of reshaping data. It is very handy, though hard to explain in a sentence or two; see the pandas documentation on melt() and the tiny example below.
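A tiny example (hypothetical metric values) of how melt() turns a wide frame with MultiIndex columns into the long (Dataset, Metric, Value) shape that stack_results produces:

wide = pd.DataFrame({('train', 'roc_auc'): [0.9, 0.8],
                     ('test', 'roc_auc'):  [0.6, 0.7]})
wide.columns = pd.MultiIndex.from_tuples(wide.columns, names=['Dataset', 'Metric'])
pd.melt(wide, value_name='Value')
# columns: Dataset, Metric, Value -- one row per original cell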

def stack_results(scores):
    
    #build a MultiIndex from (dataset, metric) tuples
    columns = pd.MultiIndex.from_tuples(
        [tuple(m.split('_', 1)) for m in scores.keys()],
        names=['Dataset', 'Metric'])
    data = np.array(list(scores.values())).T
    df = (pd.DataFrame(data=data,
                       columns=columns)
          .iloc[:, 2:])
    results = pd.melt(df, value_name='Value')
    results.Metric = results.Metric.apply(lambda x: metrics.get(x))
    results.Dataset = results.Dataset.str.capitalize()
    return results


def plot_result(df, model=None, fname=None):
    m = list(metrics.values())
    
    #catplot draws plots of categorical data
    g = sns.catplot(x='Dataset', 
                    y='Value', 
                    hue='Dataset', 
                    col='Metric',
                    data=df, 
                    col_order=m,
                    order=['Train', 'Test'],
                    kind="box", 
                    col_wrap=3,
                    sharey=False,
                    height=4, aspect=1.2)
    #aspect*height = width
    
    df = df.groupby(['Metric', 'Dataset']).Value.mean().unstack().loc[m]
    
    #iterate over the subplots
    for i, ax in enumerate(g.axes.flat):
        s = f"Train: {df.loc[m[i], 'Train'] :>7.4f}\nTest:  {df.loc[m[i], 'Test'] :>7.4f}"
        #limit the number of decimal places shown
        
        #ax.text adds the annotation box to each panel
        ax.text(0.05, 0.85, s, fontsize=10,transform=ax.transAxes,
                bbox=dict(facecolor='white', edgecolor='grey', boxstyle='round,pad=0.5'))
    g.fig.suptitle(model, fontsize=16)
    g.fig.subplots_adjust(top=.9)
    if fname:
        g.savefig(fname, dpi=300);
        
#transform=ax.transAxes interprets the (x, y) position in axes-relative coordinates (0 to 1) rather than data coordinates, which is why omitting it moves the text box dramatically

Baseline Classifier

The workflow from here on is repetitive: fit a model for each algorithm, cross-validate, then plot. Finally the algorithms are compared. They are:

'Baseline', 'Random Forest', 'AdaBoost', 'Gradient Booster', 'XGBoost',
'LightGBM Dummies', 'LightGBM Factors'

5. DummyClassifier is a classifier that predicts using simple rules. It is typically used as a simple baseline against which other (real) classifiers are compared.

6. After training a model we need to persist it to disk; joblib is used here for saving and loading.

baseline

dummy_clf = DummyClassifier(strategy='stratified',
                            random_state=42)

algo = 'dummy_clf'

fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    dummy_cv_result, run_time[algo] = run_cv(dummy_clf)
    joblib.dump(dummy_cv_result, fname)
else:
    dummy_cv_result = joblib.load(fname)


dummy_result = stack_results(dummy_cv_result)
dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(dummy_result, model='Dummy Classifier')

RandomForest

7. Parameter explanations are given in the comments of the code block.

rf_clf = RandomForestClassifier(n_estimators=100, #number of decision trees in the forest
                                criterion='gini',
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,

# min_samples_split is the minimum number of samples required to split a node;
# e.g. with min_samples_split=6, a node containing only 4 samples will not be split (regardless of impurity).

# min_samples_leaf is the minimum number of samples a leaf node must contain;
# e.g. with min_samples_leaf=3, a node with 5 samples cannot be split into leaves of sizes 2 and 3,
# because the smaller leaf would fall below the minimum leaf size.
                                min_weight_fraction_leaf=0.0,
                                max_features='auto',
                                #'auto' means sqrt(n_features): each split considers a random subset of that many features
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                bootstrap=True,
                                #whether to draw bootstrap samples (sampling with replacement) for each tree
                                oob_score=True,
                                #use out-of-bag samples to estimate generalization accuracy
                                n_jobs=-1,
                                random_state=42,

                                #verbosity: 0 is silent, 1 logs occasionally, values above 1 log more frequently
                                verbose=1)

#n_jobs controls parallelism: the default (None or 1) uses a single core, while -1 dispatches work to all CPU cores

Question: the random forest is fit on the clean data; what difference does that make? (scikit-learn's tree ensembles cannot handle NaN inputs, so the rows with missing lagged returns have to be dropped first.)

algo = 'random_forest'
fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    rf_cv_result, run_time[algo] = run_cv(rf_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(rf_cv_result, fname)
else:
    rf_cv_result = joblib.load(fname)

rf_result = stack_results(rf_cv_result)
rf_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

8. Exploring the large gap between train and test scores.

#on the training set the random forest fits the data almost perfectly, creating a large gap between train and test. At first I suspected the test folds were too small, but enlarging the cross-validation test sets changed little and the train scores stayed at 1. A likely contributor: with max_depth=None and min_samples_leaf=1 each tree can grow until its leaves are nearly pure, so near-perfect training scores are expected and the gap largely reflects overfitting.
 

plot_result(rf_result, model='Random Forest')

scikit-learn: AdaBoost

9. Parameter explanations are given in the comments of the code block.

#the base learner is a single-split decision tree (a stump)
base_estimator = DecisionTreeClassifier(criterion='gini', 
                                        splitter='best',
                                        max_depth=1, 
                                        min_samples_split=2, 
                                        min_samples_leaf=20, 
                                        min_weight_fraction_leaf=0.0,
                                        max_features=None, 
                                        random_state=None, 
                                        max_leaf_nodes=None, 
                                        min_impurity_decrease=0.0, 
                                        min_impurity_split=None, 
                                        class_weight=None)

# splitter: either 'best' or 'random'. 'best' searches all candidate split points of a feature for the
# optimal split and suits moderately sized samples; 'random' searches a random subset of split points for a
# locally optimal split and suits very large samples. The default is 'best'.

# min_weight_fraction_leaf: the minimum weighted fraction of the total sample weights required at a leaf,
# 0 by default (sample weights are ignored). If a candidate split would leave a leaf below this value, that
# leaf and its sibling are pruned, i.e. the split is not made. Consider it when many samples have missing
# values or the class distribution is strongly skewed.

10. To keep AdaBoost from overfitting, a regularization term can be added to the model: a shrinkage coefficient ν applied to each weak learner's contribution, i.e. the learning rate, with values in (0, 1]. A larger value reaches the same fit in fewer iterations with fewer weak learners; a smaller value needs more iterations and more weak learners to reach a given misclassification level.
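In symbols (standard stage-wise boosting notation; the notation is mine, not the original post's), the shrinkage enters the update as:

F_m(x) = F_{m-1}(x) + ν · α_m · h_m(x)

where h_m is the m-th weak learner, α_m its boosting weight, and ν the learning_rate above.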

ada_clf = AdaBoostClassifier(base_estimator=base_estimator,
                             #n_estimators effectively controls the number of boosting stages
                             n_estimators=100,
                             
                             learning_rate=1.0,
                             algorithm='SAMME.R',
                             random_state=42)


11. 'SAMME' adjusts the weak-learner weights using the discrete class predictions on the sample, while 'SAMME.R' uses the predicted class probabilities; the default is 'SAMME.R'.

algo = 'adaboost'
fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    ada_cv_result, run_time[algo] = run_cv(ada_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(ada_cv_result, fname)
else:
    ada_cv_result = joblib.load(fname)

ada_result = stack_results(ada_cv_result)
ada_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(ada_result, model='AdaBoost')

HistGradientBoostingClassifier

The following HistGradientBoostingClassifier initialization code illustrates the key tuning parameters that we previously introduced, in addition to those that we are familiar with from looking at standalone decision tree models.

This estimator is much faster than GradientBoostingClassifier for big datasets (n_samples >= 10 000).

This estimator has native support for missing values (NaNs). During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently. If no missing values were encountered for a given feature during training, then samples with missing values are mapped to whichever child has the most samples.
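A minimal sketch (toy arrays, not the project data; imports repeated for self-containedness) showing that the estimator accepts NaN directly, so dropping rows is not strictly required for it:

import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier

X_toy = np.array([[1.0, np.nan], [2.0, 0.5], [np.nan, 1.5], [3.0, 2.0]])
y_toy = np.array([0, 1, 0, 1])
HistGradientBoostingClassifier(max_iter=10).fit(X_toy, y_toy).predict(X_toy)  # runs without any imputation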

gb_clf = HistGradientBoostingClassifier(loss='binary_crossentropy',              
                                        learning_rate=0.1,         
                                        
                                        # in AdaBoost, n_estimators controls the number of boosting stages;
                                        # HistGradientBoostingClassifier uses max_iter for the same purpose
                                        max_iter=100,               
                                        min_samples_leaf=20,
                                        max_depth=None,
                                        random_state=None,
                                        max_leaf_nodes=31,           # opt value depends on feature interaction
                                        warm_start=False,

                                        verbose=0,
                                        tol=0.0001)

Cross-validate after building the model

algo = 'sklearn_gbm'

fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    gb_cv_result, run_time[algo] = run_cv(gb_clf, y=y_clean, X=X_dummies_clean)
    joblib.dump(gb_cv_result, fname)
else:
    gb_cv_result = joblib.load(fname)

gb_result = stack_results(gb_cv_result)
gb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(gb_result, model='Gradient Boosting Classifier')

Partial Dependence Plots (important)

12. Partial dependence is similar in spirit to a marginal effect in statistics: holding the other variables fixed, we vary the target feature and observe how the model's fitted output changes.
Before starting we also need to drop the time columns ('year' and 'month') so that the model does not become overly dependent on time.
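A rough sketch of what partial dependence computes (a hypothetical helper, not the book's code): for each grid value of the chosen feature, overwrite that feature for every row, predict, and average.

def manual_partial_dependence(model, X, feature, grid):
    """Average prediction when `feature` is forced to each value in `grid` (illustrative only)."""
    averages = []
    for value in grid:
        X_mod = X.copy()
        X_mod[feature] = value                                     # set the feature to this grid value for every row
        averages.append(model.predict_proba(X_mod)[:, 1].mean())   # mean positive-class probability over the data
    return averages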

Partial dependence explanation

13. product(A, B) returns tuples from the Cartesian product of A and B; product([0, 1], repeat=2) is equivalent to product([0, 1], [0, 1]).

14. '{:.0%}'.format(y) is one of the flexible string-formatting idioms (a percentage with no decimal places) and is worth knowing.

15. When we care about the relationship between a single feature and the target, the partial dependence plot is 2D; looking at two features at once gives a 3D surface.

X_ = X_factors_clean.drop(['year', 'month'], axis=1)

fname = results_path / f'{algo}_model.joblib'
if not Path(fname).exists():
    gb_clf.fit(y=y_clean, X=X_)
    joblib.dump(gb_clf, fname)
else:
    gb_clf = joblib.load(fname)


gb_clf.score(X=X_, y=y_clean)
>>>0.5889181460403748

y_score = gb_clf.predict_proba(X_)[:, 1]
roc_auc_score(y_score=y_score, y_true=y_clean)
>>>0.6183261924270361

Plotting

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

plot_partial_dependence(
    estimator=gb_clf,
    X=X_,
    features=['return_12m', 'return_6m', 'CMA', ('return_12m', 'return_6m')],
    percentiles=(0.05, 0.95),
    n_jobs=-1,
    n_cols=2,
    response_method='decision_function',
    grid_resolution=250,
    ax=axes)

for i, j in product([0, 1], repeat=2):
    if i!=1 or j!= 0:
        axes[i][j].xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

axes[1][1].yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

axes[0][0].set_ylabel('Partial Dependence')
axes[1][0].set_ylabel('Partial Dependence')
axes[0][0].set_xlabel('12-Months Return')
axes[0][1].set_xlabel('6-Months Return')
axes[1][0].set_xlabel('Conservative Minus Aggressive')

axes[1][1].set_xlabel('12-Month Return')
axes[1][1].set_ylabel('6-Months Return')
fig.suptitle('Partial Dependence Plots', fontsize=16)
fig.tight_layout()
fig.subplots_adjust(top=.95)

 3D

targets = ['return_12m', 'return_6m']
pdp, axes = partial_dependence(estimator=gb_clf,
                               features=targets,
                               X=X_,
                               grid_resolution=100)

XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T

fig = plt.figure(figsize=(14, 8))
ax = Axes3D(fig)
surface = ax.plot_surface(XX, YY, Z,
                          rstride=1,
                          cstride=1,
                          cmap=plt.cm.BuPu,
                          edgecolor='k')
ax.set_xlabel('12-Month Return')
ax.set_ylabel('6-Month Return')
ax.set_zlabel('Partial Dependence')
ax.view_init(elev=22, azim=30)
ax.yaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 
ax.xaxis.set_major_formatter(FuncFormatter(lambda y, _: '{:.0%}'.format(y))) 

# fig.colorbar(surface)
fig.suptitle('Partial Dependence by 6- and 12-month Returns', fontsize=16)
fig.tight_layout()

16. ax.view_init() changes the viewing angle of the 3D plot, i.e. the camera position: azim rotates the view about the z-axis, and elev sets the elevation above the x-y plane.

17. grid_resolution ("number of values to plot on x axis") is the number of grid points per feature axis; larger values put more points on the plot. The default is 100. In general, do not set grid_resolution too large, otherwise the jaggedness of the plot becomes noticeable.

18. meshgrid() takes coordinate vectors and returns the corresponding coordinate matrices; a tiny example follows.
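A minimal illustration (toy vectors) matching how XX and YY are built above:

xv = np.array([1, 2, 3])
yv = np.array([10, 20])
XX_toy, YY_toy = np.meshgrid(xv, yv)
# XX_toy -> [[ 1,  2,  3], [ 1,  2,  3]]      each row repeats xv
# YY_toy -> [[10, 10, 10], [20, 20, 20]]      each column repeats yv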

XGBoost

xgb_clf = XGBClassifier(max_depth=3,                  
                        learning_rate=0.1,            
                        n_estimators=100,             # Number of boosted trees to fit.
                        silent=True,                  # Whether to print messages while running
                        objective='binary:logistic',  # Task and objective or custom objective function
                        booster='gbtree',             # Select booster: gbtree, gblinear or dart

                        n_jobs=-1,                   
                        gamma=0,                      # Min loss reduction for further splits
                        min_child_weight=1,           # Min sum of sample weight(hessian) needed
                        max_delta_step=0,             # Max delta step for each tree's weight estimation
                        subsample=1,                  # Subsample ratio of training samples
                        colsample_bytree=1,           # Subsample ratio of cols for each tree
                        colsample_bylevel=1,          # Subsample ratio of cols for each split
                        reg_alpha=0,                  # L1 regularization term on weights
                        reg_lambda=1,                 # L2 regularization term on weights
                        scale_pos_weight=1,           # Balancing class weights
                        base_score=0.5,               # Initial prediction score; global bias
                        random_state=42)              # random seed

Cross-validate after building the model

algo = 'xgboost'
fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    xgb_cv_result, run_time[algo] = run_cv(xgb_clf)
    joblib.dump(xgb_cv_result, fname)
else:
    xgb_cv_result = joblib.load(fname)

xgb_result = stack_results(xgb_cv_result)
xgb_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(xgb_result, model='XG Boost')

Feature Importance

Feature importance analysis

fi = pd.Series(xgb_clf.feature_importances_, 
               index=X_dummies.columns)

fi.nlargest(25).sort_values().plot.barh(figsize=(10, 5), 
                                        title='Feature Importance')
sns.despine()
plt.tight_layout();

LightGBM

lgb_clf = LGBMClassifier(boosting_type='gbdt',
                         objective='binary',          # learning task
                         metric='auc',
                         num_leaves=31,               # Maximum tree leaves for base learners.
                         max_depth=-1,                # Maximum tree depth for base learners, -1 means no limit.
                         learning_rate=0.1,          # Adaptive lr via callback override in .fit() method  
                         n_estimators=100,            # Number of boosted trees to fit
                         subsample_for_bin=200000,    # Number of samples for constructing bins.
                         class_weight=None,           # dict, 'balanced' or None
                         min_split_gain=0.0,          # Minimum loss reduction for further split
                         min_child_weight=0.001,      # Minimum sum of instance weight(hessian)
                         min_child_samples=20,        # Minimum number of data need in a child(leaf)
                         subsample=1.0,               # Subsample ratio of training samples
                         subsample_freq=0,            # Frequency of subsampling, <=0: disabled
                         colsample_bytree=1.0,        # Subsampling ratio of features
                         reg_alpha=0.0,               
                         reg_lambda=0.0,              
                         random_state=42,             # Random number seed; default: C++ seed
                         n_jobs=-1,                   # Number of parallel threads.
                         silent=False,
                         importance_type='gain',      # default: 'split' or 'gain'
                        )

cv

algo = 'lgb_factors'

fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    lgb_factor_cv_result, run_time[algo] = run_cv(lgb_clf, X=X_factors, fit_params={'categorical_feature': cat_cols})
    joblib.dump(lgb_factor_cv_result, fname)
else:
    lgb_factor_cv_result = joblib.load(fname)

lgb_factor_result = stack_results(lgb_factor_cv_result)
lgb_factor_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(lgb_factor_result, model='Light GBM | Factors')

algo = 'lgb_dummies'
fname = results_path / f'{algo}.joblib'
if not Path(fname).exists():
    lgb_dummy_cv_result, run_time[algo] = run_cv(lgb_clf)
    joblib.dump(lgb_dummy_cv_result, fname)
else:
    lgb_dummy_cv_result = joblib.load(fname)

lgb_dummy_result = stack_results(lgb_dummy_cv_result)
lgb_dummy_result.groupby(['Metric', 'Dataset']).Value.mean().unstack()

plot_result(lgb_dummy_result, model='Light GBM | Dummies')

Compare Results

results = {'Baseline': dummy_result,
           'Random Forest': rf_result,
           'AdaBoost': ada_result,
           'Gradient Booster': gb_result,
           'XGBoost': xgb_result,
           'LightGBM Dummies': lgb_dummy_result,
           'LightGBM Factors': lgb_factor_result}

df = pd.DataFrame()
for model, result in results.items():
    df = pd.concat([df, result.groupby(['Metric', 'Dataset']
                                       ).Value.mean().unstack()['Test'].to_frame(model)], axis=1)

df.T.sort_values('AUC', ascending=False)
