Hyperopt

1. Introduction

Hyperopt is a Python library for hyperparameter optimization. Combined with MongoDB it can run the search in a distributed fashion and quickly find reasonably good parameters. Note that the simulated-annealing search is only available if you install the dev version; brute-force and random search strategies are supported as well.
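For reference, the search strategy is selected through fmin's algo argument. A minimal sketch (assuming the anneal module is available, which may require the dev install mentioned above):

from hyperopt import fmin, hp, tpe, rand, anneal

def objective(x):
    return x ** 2

space = hp.uniform('x', -10, 10)

best_tpe = fmin(objective, space, algo=tpe.suggest, max_evals=50)       # TPE (Bayesian)
best_rand = fmin(objective, space, algo=rand.suggest, max_evals=50)     # random search
best_anneal = fmin(objective, space, algo=anneal.suggest, max_evals=50) # simulated annealing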

2. Installation on Windows 7

  • Software: the MongoDB installer and the Hyperopt source archive
  • Installation
    • MongoDB: run the installer and accept the defaults
    • Hyperopt: unzip the archive, then run python setup.py install (a quick smoke test follows this list)
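To confirm the installation worked, a short smoke test (pymongo is the MongoDB driver and is only needed for MongoDB-backed runs; install it with pip if missing):

import hyperopt
import pymongo  # MongoDB driver; only needed for distributed tuning

print(hyperopt.__version__)
print(pymongo.version)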

3. Installation on a Linux Server

  • Software

    • Hyperopt: same as above
    • MongoDB download: if you are on Ubuntu or another distribution, grab the matching build from the official site and adjust the file name in the command below accordingly. MongoDB official site

    • Note: when running the example below I hit an error (No module named networkx). Not everyone will see it; if you do, pip install networkx fixes it.

  • Installation

    • Hyperopt: install the same way as above
    • MongoDB: unzip the archive first, then run the following command in the extracted directory to link the binaries (PS: pip install pymongo also seems to work, though that installs only the Python driver, not the MongoDB server itself). A distributed-search sketch follows this list.
(cd bin && { for F in ../mongodb-linux-x86_64-3.2.7/bin/* ; do echo "linking $F" ; ln -s $F ; done } )
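With MongoDB running, a search can be distributed across machines: the driver process records trials in MongoDB and worker processes pull jobs from it. A minimal sketch following the official MongoDB integration pattern (the host, port, and foo_db database name are placeholders):

import math
from hyperopt import fmin, hp, tpe
from hyperopt.mongoexp import MongoTrials

# Trials are stored in MongoDB instead of in-process; exp_key separates experiments
trials = MongoTrials('mongo://localhost:27017/foo_db/jobs', exp_key='exp1')
best = fmin(math.sin, hp.uniform('x', -2, 2),
            trials=trials, algo=tpe.suggest, max_evals=10)

Each worker is launched separately and polls the same database for jobs, e.g. hyperopt-mongo-worker --mongo=localhost:27017/foo_db --poll-interval=0.1. The objective must be importable by the workers, so use a named function rather than a lambda.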

4. Basic Usage

# define an objective function
def objective(args):
    case, val = args
    if case == 'case 1':
        return val
    else:
        return val ** 2

# define a search space
from hyperopt import hp
space = hp.choice('a',
    [
        ('case 1', 1 + hp.lognormal('c1', 0, 1)),
        ('case 2', hp.uniform('c2', -10, 10))
    ])

# minimize the objective over the space
from hyperopt import fmin, tpe
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

print(best)
# -> {'a': 1, 'c2': 0.01420615366247227}
import hyperopt
print(hyperopt.space_eval(space, best))
# -> ('case 2', 0.01420615366247227)
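Note that for hp.choice parameters fmin returns the index of the chosen branch ('a': 1 means the second option, 'case 2'); space_eval maps the result back to the actual values.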

5. Using Hyperopt with XGBoost
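The script below puts everything together: the objective trains an XGBoost binary classifier with the sampled parameters and returns the negative F1 score on the validation set (fmin minimizes), and the best parameters found offline are pickled so the online run can reload them. load_data and evalF1 come from a project-local utils module that is not shown here.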

# coding=UTF-8

import gc
import pickle
from time import perf_counter

import numpy as np
import pandas as pd
import xgboost as xgb

import hyperopt
from hyperopt import fmin, hp, tpe
from utils import *  # project-local helpers: load_data, evalF1

model_name = 'xgb'
n_jobs = 4  # thread count for xgboost; adjust to your machine

def xgb_train(dtrain, dtest, param, offline=True, verbose=True, num_boost_round=1000):
    if verbose:
        if offline:
            watchlist = [(dtrain, 'train'), (dtest, 'test')]
        else:
            watchlist = [(dtrain, 'train')]
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist)
        feature_score = model.get_fscore()
        feature_score = sorted(feature_score.items(), key=lambda x: x[1], reverse=True)
        fs = []
        for key, value in feature_score:
            fs.append("{0},{1}\n".format(key, value))
        if offline:
            feature_score_file = './feature_score/offline_feature_score.csv'
        else:
            feature_score_file = './feature_score/online_feature_score.csv'
        with open(feature_score_file, 'w') as f:
            f.write("feature,score\n")
            f.writelines(fs)
    else:
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    return model

def xgb_predict(model, dtest):
    # best_ntree_limit is only set when early stopping is used; 0 means "use all trees"
    ntree_limit = getattr(model, 'best_ntree_limit', 0)
    print('model_best_ntree_limit : {0}\n'.format(ntree_limit))
    pred_y = model.predict(dtest, ntree_limit=ntree_limit)
    return pred_y

def tune_xgb(dtrain, dtest):
    tune_result_file = "./log/tune_" + model_name + ".csv"
    f_w = open(tune_result_file, 'wb')  # pickle requires a binary-mode file
    def objective(args):
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'scale_pos_weight': weight,
            # 'lambda': 1000,
            'nthread': n_jobs,
            'eta': args['learning_rate'],
            # 'gamma': args['gamma'],
            'colsample_bytree': args['colsample_bytree'],
            'max_depth': args['max_depth'],
            'subsample': args['subsample']
        }

        # verbose=False during tuning: skip the watchlist and feature-score dump
        model = xgb_train(dtrain, dtest, params, offline=True, verbose=False,
                          num_boost_round=int(args['n_estimators']))

        #model.save_model('xgb.model')
        model.dump_model('dump_model_txt')

        pred_y = xgb_predict(model, dtest)
        pred_y[pred_y>0.5] = 1
        pred_y[pred_y<=0.5] = 0
        test_y = dtest.get_label()
        F1 = evalF1(test_y, pred_y)

        xgb_log.write(str(args))
        xgb_log.write('\n')
        xgb_log.write(str(F1))
        xgb_log.write('\n')
        return F1 * (-1.0)  # fmin minimizes, so return the negative F1

    # Searching space
    space = {
        'n_estimators': hp.quniform("n_estimators", 100, 200, 20),
        # 'reg_lambda': hp.loguniform("reg_lambda", np.log(1), np.log(1500)),
        # 'gamma': hp.loguniform("gamma", np.log(0.1), np.log(100)),
        'learning_rate': hp.uniform("learning_rate", 0.05, 0.15),
        'max_depth': 8,  # held constant, so fmin will not return it
        'subsample': hp.uniform("subsample", 0.5, 0.9),
        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 0.9),
    }
    best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=150)
    # Simulated annealing requires the dev version of hyperopt:
    #best_sln = fmin(objective, space, algo=hyperopt.anneal.suggest, max_evals=300)
    pickle.dump(best_sln, f_w)
    # fmin returns only the sampled parameters; space_eval re-inserts the
    # constants (e.g. max_depth) so that objective() receives the full dict
    best_F1 = objective(hyperopt.space_eval(space, best_sln))
    xgb_log.write(str(best_F1) + '\n')
    f_w.close()

def test(dtrain, dtest, best_n_estimators):
    # relies on the globals init_params and offline set in __main__
    final_result = "./log/xgb_online_result.csv"
    f_w = open(final_result, 'w')
    model = xgb_train(dtrain, dtest, init_params, offline, verbose=True,
                      num_boost_round=best_n_estimators)
    pred_y = xgb_predict(model, dtest)
    pred_y[pred_y>0.5] = 1
    pred_y[pred_y<=0.5] = 0
    test_y = dtest.get_label()
    F1 =  evalF1(test_y, pred_y)
    f_w.write(str(F1))
    f_w.close()

if __name__ == '__main__':
    t_start = perf_counter()
    offline = False  # True: tune hyperparameters offline; False: train with tuned params
    train_x, train_y, test_x, test_y, TF_id, TG_id = load_data(offline)

    #feature selection
#    fs = list(pd.read_csv('./feature_score/offline_feature_score.csv')['feature'])
#    train_x = train_x[fs]
#    test_x = test_x[fs]

    # up-weight the positive class by the negative/positive ratio
    weight = float(len(train_y[train_y == 0])) / len(train_y[train_y == 1])
    class_weight = {1: weight, 0: 1}

    print('Feature Dims : ')
    print(train_x.shape)
    print(test_x.shape)

    dtrain = xgb.DMatrix(train_x,label=train_y)
    dtest = xgb.DMatrix(test_x,label=test_y)

    del train_x,train_y,test_x,test_y
    gc.collect()

    if offline:
        xgb_log = open('./log/xgb_log.txt', 'w')
        tune_xgb(dtrain, dtest)
        xgb_log.close()
    else:
        tune_result_file = "./log/tune_" + model_name + ".csv"
        with open(tune_result_file, 'rb') as f_r:
            best_params = pickle.load(f_r)  # the dict written by tune_xgb

        best_n_estimators = int(best_params['n_estimators'])
        best_learning_rate = best_params['learning_rate']
#        best_max_depth = int(best_params['max_depth'])
        best_subsample = best_params['subsample']
        best_colsample_bytree = best_params['colsample_bytree']

        init_params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'scale_pos_weight': weight,
            'max_depth': 8,
            'subsample': best_subsample,
            'nthread': n_jobs,
            'eval_metric': 'auc',
            'colsample_bytree': best_colsample_bytree,
            'eta': best_learning_rate
        }
        test(dtrain,dtest,best_n_estimators)

    t_finish = perf_counter()
    print('==============Costs time : %s s==============' % str(t_finish - t_start))
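Typical workflow: run once with offline = True so tune_xgb searches the space and pickles the best parameters, then set offline = False to reload them and train the final model.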

6. More Official Documentation Links

Hyperopt is also published on PyPI, so instead of the source install described above it can be installed directly with pip install hyperopt. For the full documentation and more examples, see the official project page: https://github.com/hyperopt/hyperopt