机器学习自动调参小试

1. 安装环境

1.1 安装 Anaconda 环境

Linux-Centos7下安装Anaconda(2019年新版)

 

1.2 安装xgboost

conda install py-xgboost

 

1.3 安装ray

pip install --upgrade pip

pip install ray -i https://pypi.doubanio.com/simple

 

2. 脚本

2.1 第一版

#-*- encoding=utf-8 -*-

def get_dataset(input_path, label_col='label'):
    """Load a CSV file and split it into features and label.

    :param input_path: path to a CSV file containing a label column
    :param label_col: name of the label column (default: 'label',
        matching the original hard-coded behavior)
    :return: tuple (features DataFrame, label Series); feature column
        order is preserved from the input file
    """
    df = pd.read_csv(input_path)
    # drop() keeps the remaining columns in their original order,
    # exactly like the previous list-comprehension filter did.
    data = df.drop(columns=[label_col])
    label = df[label_col]
    return data, label

def objective_function(config, checkpoint_dir=None, path=None):
    """
    Objective function to optimize (first version: loads the CSV via pandas).

    :config: sampled hyperparameter dict from the tuner's search space
    :checkpoint_dir: Ray Tune checkpoint directory (unused in this sketch)
    :path: tuple of (training-set file path, OOT file path)
    """
    train_path, oot_path = path
    # pandas round-trip: slower than loading a DMatrix binary directly,
    # which is why the second version below was written.
    train_data, train_label = get_dataset(train_path)
    train_mat = xgboost.DMatrix(train_data, train_label)
    # Copy so popping/casting params does not mutate the tuner's config.
    param = config.copy()
    ...
    

2.2 第二版

def objective_function(config, checkpoint_dir=None, path=None):
    """
    Objective function to optimize (second version: loads a pre-built
    DMatrix binary file directly, skipping the pandas round-trip).

    :config: sampled hyperparameter dict from the tuner's search space
    :checkpoint_dir: Ray Tune checkpoint directory (unused in this sketch)
    :path: tuple of (training-set file path, OOT file path)
    """
    train_path, oot_path = path
    # train_path is expected to be a DMatrix-loadable file (e.g. .buffer).
    train_mat = xgboost.DMatrix(train_path)
    ...

2.3 完整代码

#-*- encoding=utf-8 -*-
 
import os
import time
import pickle
import numpy as np
import xgboost
import sklearn.metrics as metrics
from ray import tune
from ray.tune.suggest.bohb import TuneBOHB
from ray.tune.schedulers import HyperBandForBOHB
 
 
def get_auc_ks(scores, labels):
    """
    Compute AUC and the KS statistic for a set of scores.

    :param scores: list-like, model scores
    :param labels: list-like of binary labels, or an xgboost.DMatrix —
        in the DMatrix case its stored labels are used and the result is
        returned in xgboost eval-metric format instead of a plain tuple
    :return: (auc, ks) tuple, or [('my_auc', auc), ('KS', ks)] when
        labels was a DMatrix
    """
    is_dmatrix = isinstance(labels, xgboost.DMatrix)
    if is_dmatrix:
        labels = labels.get_label()
    fpr, tpr, _ = metrics.roc_curve(labels, scores, pos_label=1)
    auc_value = metrics.auc(fpr, tpr)
    # KS = maximum vertical gap between the TPR and FPR curves.
    ks_value = np.max(np.abs(tpr - fpr))
    if is_dmatrix:
        return [('my_auc', auc_value), ('KS', ks_value)]
    return auc_value, ks_value
 
def metric_ks(pred, dtrain):
    """
    Custom xgboost eval metric returning the KS statistic.

    :param pred: predicted scores for the rows of dtrain
    :param dtrain: xgboost.DMatrix holding the true labels
    :return: ('ks', ks_value) in xgboost's feval format
    """
    labels = dtrain.get_label()
    fpr, tpr, _ = metrics.roc_curve(labels, pred, pos_label=1)
    # KS = maximum vertical gap between the TPR and FPR curves.
    return 'ks', np.max(np.abs(tpr - fpr))
 
def custom_metric(pred, dtrain):
    """
    xgboost feval computing both AUC and KS for one prediction set.

    :param pred: predicted scores for the rows of dtrain
    :param dtrain: xgboost.DMatrix holding the true labels
    :return: list of (name, value) pairs in xgboost's feval format
    """
    y_true = dtrain.get_label()
    fpr, tpr, _ = metrics.roc_curve(y_true, pred, pos_label=1)
    auc_value = metrics.auc(fpr, tpr)
    ks_value = np.max(np.abs(tpr - fpr))
    return [('auc', auc_value), ('KS', ks_value)]

 
def objective_function(config, checkpoint_dir=None, path=None):
    """
    Objective function optimized by Ray Tune (final version).

    :config: sampled hyperparameter dict; includes max_depth and
        n_estimators as floats (tune.uniform) that are cast to int here
    :checkpoint_dir: Ray Tune checkpoint directory parameter — note it is
        re-bound by the `with tune.checkpoint_dir(...)` block below, so
        the value passed in is never used
    :path: tuple of (training-set file path, OOT file path)
    """
    train_path, oot_path = path
    # Load the pre-built DMatrix binary directly (fast path, see section 4.2).
    train_mat = xgboost.DMatrix(train_path)
    param = config.copy()
    param["max_depth"] = int(param["max_depth"])
    # n_estimators is not an xgboost.cv param; pop it to use as num_boost_round.
    n_estimators = int(param.pop("n_estimators"))
    result = {}
    # record_evaluation fills result with per-iteration metrics, including
    # result["detail_metrics"][dataset][metric] = list (per round) of
    # per-fold value lists, for my_train / my_valid / my_oot.
    cv_results = xgboost.cv(param, dtrain=train_mat, num_boost_round=n_estimators, 
                            nfold=5, metrics='logloss', feval=custom_metric, maximize=True, 
                            callbacks=[record_evaluation(result, oot_path)])
    # [-1] takes the per-fold values of the last boosting round: each
    # *_score is (list of per-fold AUCs, list of per-fold KSs).
    test_score = (result["detail_metrics"]["my_oot"]["auc"][-1], result["detail_metrics"]["my_oot"]["KS"][-1])
    valid_score = (result["detail_metrics"]["my_valid"]["auc"][-1], result["detail_metrics"]["my_valid"]["KS"][-1])
    train_score = (result["detail_metrics"]["my_train"]["auc"][-1], result["detail_metrics"]["my_train"]["KS"][-1])
 
    # Average validation AUC across folds is the metric Tune monitors.
    nfold = len(valid_score[0])
    monitor_metric = sum(valid_score[0]) / nfold
 
    # Persist the full cv history as a checkpoint artifact for later analysis.
    with tune.checkpoint_dir(step=1) as checkpoint_dir:
        path = os.path.join(checkpoint_dir, "cv_result")
        with open(path, 'wb') as f:
            pickle.dump(cv_results, f)
    return tune.report(valid_auc=monitor_metric,
                       test_score=test_score,
                       valid_score=valid_score,
                       train_score=train_score,
                       done=True)
 
def record_evaluation(eval_result, oot_path):
    """
    Build an xgboost.cv callback that records per-iteration metrics for
    train, validation (cv fold test) and OOT sets into eval_result.

    :eval_result: dict used to store the evaluation results (cleared here)
    :oot_path: OOT data file path (DMatrix-loadable)
    :return: the callback function to pass to xgboost.cv(callbacks=[...])
    """
    if not isinstance(eval_result, dict):
        raise TypeError('eval_result has to be a dictionary')
    eval_result.clear()

    # Loaded once and captured by the closure, so the OOT set is not
    # re-read on every boosting round.
    oot_mat = xgboost.DMatrix(oot_path)
    def init(env):
        """internal function"""
        # Keys in env.evaluation_result_list look like 'train-logloss':
        # split at the first '-' into dataset key and metric name.
        for item in env.evaluation_result_list:
            k = item[0]
            pos = k.index('-')
            key = k[:pos]
            metric = k[pos + 1:]
            if key not in eval_result:
                eval_result[key] = {}
            if metric not in eval_result[key]:
                eval_result[key][metric] = []
            if 'detail_metrics' not in eval_result:
                eval_result['detail_metrics'] = {"my_train": {}, "my_valid": {}, "my_oot": {}}
 
    def callback(env):
        """internal function"""
        if not eval_result:
            init(env)
        # Record xgboost's own aggregated metrics for this round.
        for item in env.evaluation_result_list:
            k, v = item[0], item[1]
            pos = k.index('-')
            key = k[:pos]
            metric = k[pos + 1:]
            eval_result[key][metric].append(v)
        # Additionally score each fold's booster on its train/valid split
        # and on the OOT set, collecting per-fold values for this round.
        tmp = {"my_train": {}, "my_valid": {}, "my_oot": {}}
        for cvpack in env.cvfolds:
            bst = cvpack.bst
            pred_train = bst.predict(cvpack.dtrain)
            pred_valid = bst.predict(cvpack.dtest)
            pred_oot = bst.predict(oot_mat)
            metrics_result_train = dict(custom_metric(pred_train, cvpack.dtrain))
            metrics_result_valid = dict(custom_metric(pred_valid, cvpack.dtest))
            metrics_result_oot = dict(custom_metric(pred_oot, oot_mat))
            for k in metrics_result_oot:
                tmp["my_train"][k] = tmp["my_train"].get(k, [])+ [metrics_result_train[k]]
                tmp["my_valid"][k] = tmp["my_valid"].get(k, [])+ [metrics_result_valid[k]]
                tmp["my_oot"][k] = tmp["my_oot"].get(k, [])+ [metrics_result_oot[k]]
        # Append this round's per-fold lists, yielding
        # detail_metrics[dataset][metric] = [round1_folds, round2_folds, ...].
        for k1 in tmp:
            for k2 in tmp[k1]:
                eval_result["detail_metrics"][k1].setdefault(k2, []).append(tmp[k1][k2])
    return callback
 
def hyperopt(param_space, trainpath, testpath, num_eval, name, obj_funcs, log_path='~/ray_results'):
    """
    Run BOHB (Bayesian optimization + HyperBand) hyperparameter search,
    then retrain a final xgboost model with the best configuration and
    print train/test AUC & KS scores.

    :param_space: dict mapping hyperparameter names to tune search spaces
    :trainpath: training-set file path (DMatrix-loadable)
    :testpath: test / OOT set file path (DMatrix-loadable)
    :num_eval: number of parameter combinations to evaluate
    :name: experiment name (subdirectory under log_path)
    :obj_funcs: objective function handed to tune.run
    :log_path: directory where Ray Tune stores its results
    """
    start_time = time.time()
    data_paths = (trainpath, testpath)
    searcher = TuneBOHB(max_concurrent=2)
    scheduler = HyperBandForBOHB(time_attr="training_iteration",
                                 max_t=num_eval)
    analysis = tune.run(tune.with_parameters(obj_funcs, path=data_paths),
                        config=param_space, num_samples=num_eval, local_dir=log_path,
                        metric='valid_auc', mode='max', search_alg=searcher,
                        scheduler=scheduler, resources_per_trial={"cpu": 5}, name=name)

    # Best trial's config: cast the float-sampled integer hyperparameters
    # the same way the objective function does before retraining.
    best_params = analysis.get_best_config(metric="valid_auc", mode="max")
    best_params["max_depth"] = int(best_params["max_depth"])
    n_estimators = int(best_params.pop("n_estimators"))

    train_mat = xgboost.DMatrix(trainpath)
    test_mat = xgboost.DMatrix(testpath)

    final_model = xgboost.train(best_params, train_mat, n_estimators)
    pred_train = final_model.predict(train_mat)
    pred_test = final_model.predict(test_mat)
    print("-----Results-----")
    print("Best model & parameters: {}".format(best_params))
    print("Train Score: {}".format(get_auc_ks(pred_train, train_mat.get_label())))
    print("Test Score: {}".format(get_auc_ks(pred_test, test_mat.get_label())))
    print("Time elapsed: {}".format(time.time() - start_time))
    print("Parameter combinations evaluated: {}".format(num_eval))
    return None
    
if __name__ == "__main__":
    # Pre-converted DMatrix binary files (see the csv -> binary script below).
    trainfile_path = "./train.buffer"
    testfile_path = "./oot.buffer"
    name = 'ppdnew_V2'
    control_overfitting = False  # reserved flag; currently unused

    # Search space: integer-valued hyperparameters (max_depth, n_estimators)
    # are sampled as floats here and cast to int inside the objective function.
    param = {
        'booster': "gbtree",
        'eta': tune.uniform(0.01, 1),
        'seed': 1,
        'max_depth': tune.uniform(3, 5),
        'n_estimators': tune.uniform(50, 500),
        'min_child_weight': tune.uniform(1, 300),
        'colsample_bytree': tune.uniform(0.6, 1.0),
        'subsample': tune.uniform(0.5, 1),
        'lambda': tune.uniform(0.0, 100),
        'alpha': tune.uniform(0.0, 100),
        'scale_pos_weight': tune.uniform(1, 5),
        'n_jobs': 5,
    }

    print("begin tuning")
    hyperopt(param, trainfile_path, testfile_path, 100, name, obj_funcs=objective_function)
 

 

3. 运行结果

3.1 运行过程中的结果

 

3.2 查看最终结果

from ray import tune

# Load the finished experiment and inspect its trials, best first.
path = "~/ray_results/ppdnew_V2"
ana = tune.Analysis(path)
# Sort once by validation AUC, descending; df stays in this order below.
df = ana.dataframe().sort_values("valid_auc", ascending=False)

print("path: ", path)

# Wall-clock span of the whole experiment, converted to hours.
diff_time = df.timestamp.max() - df.timestamp.min()
print("total time(h): ", diff_time / 60 / 60)
print("")
print(df)  # already sorted above; the second sort_values call was redundant
print("average running time, ", df["time_this_iter_s"].mean())
print(df.iloc[0])  # full record of the best trial

 

4. 小结

4.1 环境

配置环境时,踩了很多坑,执行代码时可能会出现“dlopen:cannot load any more object with static TLS:”,找了很多方法尝试无果后,打算配置个conda环境来试试,然后就绕过了这个问题。

 

4.2 代码

(1)刚开始采用xgboost.DMatrix直接加载csv文件,但不清楚如何指定label,导致返回的auc等指标为nan,排查后发现不指定label会默认设置成0;

(2)加载数据采用pandas读取,然后传给xgboost.DMatrix(train_data, train_label)的格式,这种方式速度较慢。

(3)先对csv文件进行处理,转换成xgboost.DMatrix可以加载的二进制文件,代码如下:

#-*- encoding=utf-8 -*-

import sys
import pandas as pd
import xgboost as xgb

if __name__ == "__main__":
    # Usage: python convert.py <input.csv> <output.buffer>
    input_path = sys.argv[1]
    output_path = sys.argv[2]
    df = pd.read_csv(input_path)
    # Put the 'label' column first: the DMatrix load below uses
    # label_column=0, but the source CSV does not guarantee that the
    # label is in column 0 (the training code locates it by name).
    ordered_cols = ['label'] + [c for c in df.columns if c != 'label']
    # Write without header so DMatrix's CSV reader sees pure data rows.
    df[ordered_cols].to_csv("./temp.csv", index=False, header=None)


    # Re-load through DMatrix's text parser, then save the fast binary
    # format that xgboost.DMatrix(path) can load directly.
    mat = xgb.DMatrix("./temp.csv?format=csv&label_column=0")
    mat.save_binary(output_path)

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值