Hyperopt

1. Introduction

Hyperopt is a Python library that, combined with MongoDB, supports distributed hyperparameter tuning and can find reasonably good parameters quickly. The simulated-annealing search is only available if you install the dev version; exhaustive (brute-force) search and random search are supported as well.

2. Installation on Windows 7

  • Software
  • Installation
    • MongoDB: accept the defaults in the installer
    • Hyperopt: unpack the archive, then run python setup.py install (a quick check follows below)
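
To verify the installation, a quick sanity check (a minimal sketch; hyperopt exposes a version string):

python -c "import hyperopt; print(hyperopt.__version__)"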

3. Installation on a Linux Server

  • Software

    • Hyperopt: same as above
    • MongoDB: if you are on Ubuntu or another distribution, download the matching build from the MongoDB official site (https://www.mongodb.com), then adjust the file name in the command below accordingly.

    • Note: when running the example below I hit an error (No module named networkx). Not everyone will see it, but if you do, pip install networkx fixes it.

  • Installation

    • Hyperopt: install as above
    • MongoDB: unpack it first, then run the following command in its directory (PS: it seems the Python client can also be installed directly with pip install pymongo)
(cd bin && { for F in ../mongodb-linux-x86_64-3.2.7/bin/* ; do echo "linking $F" ; ln -s $F ; done } )
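
With both pieces in place, the MongoDB-backed distributed tuning mentioned in the introduction looks roughly like this (a minimal sketch: the database name foo_db, the exp_key, and the port are placeholders to adapt to your setup):

import math
from hyperopt import fmin, tpe, hp
from hyperopt.mongoexp import MongoTrials

# trial results live in MongoDB, so any number of workers can share one search
trials = MongoTrials('mongo://localhost:27017/foo_db/jobs', exp_key='exp1')
best = fmin(math.sin, hp.uniform('x', -2, 2),
            trials=trials, algo=tpe.suggest, max_evals=10)

Each worker machine then pulls and evaluates trials by running:

hyperopt-mongo-worker --mongo=localhost:27017/foo_db --poll-interval=0.1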

4. Basic Usage

# define an objective function
def objective(args):
    case, val = args
    if case == 'case 1':
        return val
    else:
        return val ** 2

# define a search space
from hyperopt import hp
space = hp.choice('a',
    [
        ('case 1', 1 + hp.lognormal('c1', 0, 1)),
        ('case 2', hp.uniform('c2', -10, 10))
    ])

# minimize the objective over the space
from hyperopt import fmin, tpe
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

print(best)
# -> {'a': 1, 'c2': 0.01420615366247227}
import hyperopt
print(hyperopt.space_eval(space, best))
# -> ('case 2', 0.01420615366247227)
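
The search strategy is chosen through the algo argument: random search ships with the library, and simulated annealing (per the introduction, available in the dev version) is selected the same way. A minimal sketch reusing the objective and space above:

from hyperopt import rand, anneal

best_rand = fmin(objective, space, algo=rand.suggest, max_evals=100)      # random search
best_anneal = fmin(objective, space, algo=anneal.suggest, max_evals=100)  # simulated annealing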

5. Using Hyperopt with XGBoost

# coding=UTF-8

import gc
import pickle
from time import time

import pandas as pd
import numpy as np
import xgboost as xgb
from hyperopt import fmin, hp, tpe
import hyperopt
from utils import *  # project-specific helpers: load_data, evalF1, ...

model_name = 'xgb'
n_jobs = 4  # xgboost thread count (assumed value; adjust to your machine)

def xgb_train(dtrain, dtest, param, offline=True, verbose=True, num_boost_round=1000):
    # Train an XGBoost model; when verbose, also write feature-importance scores to CSV.
    if verbose:
        if offline:
            watchlist = [(dtrain, 'train'), (dtest, 'test')]
        else:
            watchlist = [(dtrain, 'train')]
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist)
        feature_score = model.get_fscore()
        feature_score = sorted(feature_score.items(), key=lambda x: x[1], reverse=True)
        fs = []
        for key, value in feature_score:
            fs.append("{0},{1}\n".format(key, value))
        if offline:
            feature_score_file = './feature_score/offline_feature_score' + '.csv'
        else:
            feature_score_file = './feature_score/online_feature_score' + '.csv'
        f = open(feature_score_file, 'w')
        f.writelines("feature,score\n")
        f.writelines(fs)
        f.close()
    else:
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    return model

def xgb_predict(model, dtest):
    # Predict using the best iteration recorded on the model.
    print('model_best_ntree_limit : {0}\n'.format(model.best_ntree_limit))
    pred_y = model.predict(dtest, ntree_limit=model.best_ntree_limit)
    return pred_y

def tune_xgb(dtrain, dtest):
    # Search the space below with TPE and pickle the best parameters found.
    tune_result_file = "./log/tune_" + model_name + ".csv"
    f_w = open(tune_result_file, 'wb')  # binary mode: the best parameters are pickled below
    def objective(args):
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'scale_pos_weight': weight,
            # 'lambda': 1000,
            'nthread': n_jobs,
            'eta': args['learning_rate'],
            # 'gamma': args['gamma'],
            'colsample_bytree': args['colsample_bytree'],
            'max_depth': args['max_depth'],
            'subsample': args['subsample']
        }

        # verbose=False while tuning; set verbose=True to also write the feature-score file (fs)
        model = xgb_train(dtrain, dtest, params, offline=True, verbose=False, num_boost_round=int(args['n_estimators']))

        #model.save_model('xgb.model')
        model.dump_model('dump_model_txt')

        pred_y = xgb_predict(model, dtest)
        pred_y[pred_y>0.5] = 1
        pred_y[pred_y<=0.5] = 0
        test_y = dtest.get_label()
        F1 = evalF1(test_y, pred_y)

        xgb_log.write(str(args))
        xgb_log.write('\n')
        xgb_log.write(str(F1))
        xgb_log.write('\n')
        return -F1  # fmin minimizes, so return the negative F1

    # Searching space
    space = {
        'n_estimators': hp.quniform("n_estimators", 100, 200, 20),  # quniform yields floats; cast to int before use
        # 'reg_lambda': hp.loguniform("reg_lambda", np.log(1), np.log(1500)),
        # 'gamma': hp.loguniform("gamma", np.log(0.1), np.log(100)),
        'learning_rate': hp.uniform("learning_rate", 0.05, 0.15),
        'max_depth': 8,
        'subsample': hp.uniform("subsample", 0.5, 0.9),
        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 0.9),
    }
    best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=150)
    #best_sln = fmin(objective, space, algo=hyperopt.anneal.suggest, max_evals=300)
    pickle.dump(best_sln, f_w)
    best_F1 = objective(best_sln)  # note: this is the negative F1 returned by objective
    xgb_log.write(str(best_F1) + '\n')
    f_w.close()

def test(dtrain, dtest, best_n_estimators):
    # Retrain with the tuned parameters and write the online F1 score.
    final_result = "./log/xgb_online_result.csv"
    f_w = open(final_result, 'w')
    model = xgb_train(dtrain, dtest, init_params, offline, verbose=True, num_boost_round=best_n_estimators)
    pred_y = xgb_predict(model, dtest)
    pred_y[pred_y>0.5] = 1
    pred_y[pred_y<=0.5] = 0
    test_y = dtest.get_label()
    F1 =  evalF1(test_y, pred_y)
    f_w.write(str(F1))
    f_w.close()

if __name__ == '__main__':
    t_start = time()
    offline = False  # True: tune on the offline split; False: train and predict with the tuned parameters
    train_x, train_y, test_x, test_y, TF_id, TG_id = load_data(offline)

    #feature selection
#    fs = list(pd.read_csv('./feature_score/offline_feature_score.csv')['feature'])
#    train_x = train_x[fs]
#    test_x = test_x[fs]

    weight = float(len(train_y[train_y==0]))/len(train_y[train_y==1])
    class_weight = {1:weight,0:1}

    print('Feature Dims : ')
    print(train_x.shape)
    print(test_x.shape)

    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x, label=test_y)

    del train_x, train_y, test_x, test_y
    gc.collect()

    if offline:
        xgb_log = open('./log/xgb_log.txt', 'w')
        tune_xgb(dtrain, dtest)
        xgb_log.close()
    else:
        tune_result_file = "./log/tune_" + model_name + ".csv"
        f_w = open(tune_result_file, 'rb')
        best_params = pickle.load(f_w)  # the tuned parameters saved by tune_xgb
        f_w.close()

        best_n_estimators = int(best_params['n_estimators'])
        best_learning_rate = best_params['learning_rate']
#        best_max_depth = int(best_params['max_depth'])
        best_subsample = best_params['subsample']
        best_colsample_bytree = best_params['colsample_bytree']

        init_params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'scale_pos_weight': weight,
            'max_depth': 8,
            'subsample': best_subsample,
            'nthread': n_jobs,
            'eval_metric': 'auc',
            'colsample_bytree': best_colsample_bytree,
            'eta': best_learning_rate
        }
        test(dtrain, dtest, best_n_estimators)

    t_finish = time()
    print('==============Costs time : %s s==============' % str(t_finish - t_start))
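
The script is meant to be run twice: first with offline = True, which tunes on the offline split and pickles the best parameters to ./log/tune_xgb.csv, then with offline = False, which reloads those parameters and trains the final model for prediction.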

6. More Official Documentation Links
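
Hyperopt on GitHub: https://github.com/hyperopt/hyperopt
Hyperopt documentation: http://hyperopt.github.io/hyperopt/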
