Hyperopt

1. Introduction

Hyperopt is a Python library that, combined with MongoDB, supports distributed hyperparameter search and can quickly find reasonably good parameters. Note that you need to install the dev version to get the simulated-annealing search strategy; brute-force (grid) and random search strategies are also supported.

2. Installation on Windows 7

  • Software
  • Installation
    • MongoDB: accept the installer defaults.
    • Hyperopt: unpack the archive, then run python setup.py install
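
To verify the install, a quick sanity check (a minimal sketch; if your build does not expose __version__, a bare import is enough):

python -c "import hyperopt; print(hyperopt.__version__)"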

3. Installation on a Linux server

  • Software

    • Hyperopt: same as above.
    • MongoDB: if you are on Ubuntu or another distribution, download the matching build from the MongoDB official site, and adjust the file name in the command below accordingly.

    • Note: when running the example below I hit an error (no module named networkx). You may not see it, but if you do, pip install networkx fixes it.

  • Installation

    • Hyperopt: install as above.
    • MongoDB: unpack the archive first, then run the following command in its directory (PS: it seems pip install pymongo also works):
(cd bin && { for F in ../mongodb-linux-x86_64-3.2.7/bin/* ; do echo "linking $F" ; ln -s $F ; done } )
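
With MongoDB running, the distributed tuning mentioned in the introduction goes through MongoTrials. A minimal sketch, assuming a MongoDB instance on localhost:27017; the database name foo_db and the exp_key are illustrative:

# driver process: pushes trials into MongoDB instead of running them locally
from hyperopt import fmin, tpe, hp
from hyperopt.mongoexp import MongoTrials

trials = MongoTrials('mongo://localhost:27017/foo_db/jobs', exp_key='exp1')
best = fmin(fn=lambda x: x ** 2,
            space=hp.uniform('x', -10, 10),
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

Then start one or more workers in separate shells; each worker pulls jobs from the same database:

hyperopt-mongo-worker --mongo=localhost:27017/foo_db --poll-interval=0.1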

4. Basic usage

# define an objective function
def objective(args):
    case, val = args
    if case == 'case 1':
        return val
    else:
        return val ** 2

# define a search space
from hyperopt import hp
space = hp.choice('a',
    [
        ('case 1', 1 + hp.lognormal('c1', 0, 1)),
        ('case 2', hp.uniform('c2', -10, 10))
    ])

# minimize the objective over the space
from hyperopt import fmin, tpe
best = fmin(objective, space, algo=tpe.suggest, max_evals=100)

print(best)
# -> {'a': 1, 'c2': 0.01420615366247227}
import hyperopt
print(hyperopt.space_eval(space, best))
# -> ('case 2', 0.01420615366247227)
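
The introduction mentioned random search and simulated annealing as alternatives to TPE; switching strategies only changes the algo argument. A sketch against the same objective and space as above (anneal requires a build that ships hyperopt.anneal):

from hyperopt import rand, anneal

# random search
best_rand = fmin(objective, space, algo=rand.suggest, max_evals=100)
# simulated annealing
best_anneal = fmin(objective, space, algo=anneal.suggest, max_evals=100)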

5. Using Hyperopt with XGBoost
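
The script below tunes XGBoost with TPE: offline it searches learning_rate, subsample, colsample_bytree and n_estimators (maximizing F1 by minimizing its negative), pickles the best parameters, and online it reloads them to train the final model. load_data, evalF1 and n_jobs come from a local utils helper module.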

# coding=UTF-8

import pandas as pd
import xgboost as xgb
import numpy as np
import pickle
import gc
from hyperopt import fmin, hp, tpe
import hyperopt
from time import time
from utils import *  # local helper module: load_data, evalF1, n_jobs, ...

model_name = 'xgb'

def xgb_train(dtrain, dtest, param, offline=True, verbose=True, num_boost_round=1000):
    if verbose:
        if offline:
            watchlist = [(dtrain, 'train'), (dtest, 'test')]
        else:
            watchlist = [(dtrain, 'train')]
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round, evals=watchlist)
        feature_score = model.get_fscore()
        feature_score = sorted(feature_score.items(), key=lambda x: x[1], reverse=True)
        fs = []
        for key, value in feature_score:
            fs.append("{0},{1}\n".format(key, value))
        if offline:
            feature_score_file = './feature_score/offline_feature_score.csv'
        else:
            feature_score_file = './feature_score/online_feature_score.csv'
        with open(feature_score_file, 'w') as f:
            f.writelines("feature,score\n")
            f.writelines(fs)
    else:
        model = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    return model

def xgb_predict(model, dtest):
    # best_ntree_limit is only set when training used early stopping;
    # fall back to 0 (use all trees) so prediction also works without it
    ntree_limit = getattr(model, 'best_ntree_limit', 0)
    print('model_best_ntree_limit : {0}\n'.format(ntree_limit))
    pred_y = model.predict(dtest, ntree_limit=ntree_limit)
    return pred_y

def tune_xgb(dtrain, dtest):
    tune_result_file = "./log/tune_" + model_name + ".csv"
    f_w = open(tune_result_file, 'wb')  # binary mode: the file stores a pickle
    def objective(args):
        params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'scale_pos_weight': weight,
            # 'lambda': 1000,
            'nthread': n_jobs,
            'eta': args['learning_rate'],
            # 'gamma': args['gamma'],
            'colsample_bytree': args['colsample_bytree'],
            'max_depth': args['max_depth'],
            'subsample': args['subsample']
        }

        # keep verbose=False while tuning; set verbose=True only to dump feature scores
        model = xgb_train(dtrain, dtest, params, offline=True, verbose=False, num_boost_round=int(args['n_estimators']))

        #model.save_model('xgb.model')
        model.dump_model('dump_model_txt')

        pred_y = xgb_predict(model, dtest)
        pred_y[pred_y>0.5] = 1
        pred_y[pred_y<=0.5] = 0
        test_y = dtest.get_label()
        F1 = evalF1(test_y, pred_y)

        xgb_log.write(str(args))
        xgb_log.write('\n')
        xgb_log.write(str(F1))
        xgb_log.write('\n')
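        # fmin minimizes the objective, so return the negated F1 to maximize it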
        return F1*(-1.0)

    # Searching space
    space = {
        'n_estimators': hp.quniform("n_estimators", 100, 200, 20),
        # 'reg_lambda': hp.loguniform("reg_lambda", np.log(1), np.log(1500)),
        # 'gamma': hp.loguniform("gamma", np.log(0.1), np.log(100)),
        'learning_rate': hp.uniform("learning_rate", 0.05, 0.15),
        'max_depth': 8,
        'subsample': hp.uniform("subsample", 0.5, 0.9),
        'colsample_bytree': hp.uniform("colsample_bytree", 0.5, 0.9),
    }
    best_sln = fmin(objective, space, algo=tpe.suggest, max_evals=150)
    #best_sln = fmin(objective, space, algo=hyperopt.anneal.suggest, max_evals=300)
    pickle.dump(best_sln, f_w, True)
    # fmin only returns the sampled nodes of the space; space_eval fills the
    # constant max_depth back in before re-running the objective
    best_F1 = objective(hyperopt.space_eval(space, best_sln))
    xgb_log.write(str(best_F1) + '\n')
    f_w.close()

def test(dtrain, dtest, best_n_estimators):
    # init_params and offline are globals set in __main__
    final_result = "./log/xgb_online_result.csv"
    f_w = open(final_result, 'w')
    model = xgb_train(dtrain, dtest, init_params, offline, verbose=True, num_boost_round=best_n_estimators)
    pred_y = xgb_predict(model, dtest)
    pred_y[pred_y>0.5] = 1
    pred_y[pred_y<=0.5] = 0
    test_y = dtest.get_label()
    F1 = evalF1(test_y, pred_y)
    f_w.write(str(F1))
    f_w.close()

if __name__ == '__main__':
    t_start = time()
    offline = False
    train_x,train_y,test_x,test_y,TF_id,TG_id = load_data(offline)

    #feature selection
#    fs = list(pd.read_csv('./feature_score/offline_feature_score.csv')['feature'])
#    train_x = train_x[fs]
#    test_x = test_x[fs]

    weight = float(len(train_y[train_y==0]))/len(train_y[train_y==1])
    class_weight = {1:weight,0:1}

    print('Feature Dims : ')
    print(train_x.shape)
    print(test_x.shape)

    dtrain = xgb.DMatrix(train_x,label=train_y)
    dtest = xgb.DMatrix(test_x,label=test_y)

    del train_x,train_y,test_x,test_y
    gc.collect()

    if offline:
        xgb_log = open('./log/xgb_log.txt', 'w')
        tune_xgb(dtrain, dtest)
        xgb_log.close()
    else:
        tune_result_file = "./log/tune_" + model_name + ".csv"
        f_w = open(tune_result_file, 'rb')
        best_params = pickle.load(f_w)  # renamed: don't shadow the tune_xgb function
        f_w.close()

        best_n_estimators = int(best_params['n_estimators'])
        best_learning_rate = best_params['learning_rate']
#        best_max_depth = int(best_params['max_depth'])
        best_subsample = best_params['subsample']
        best_colsample_bytree = best_params['colsample_bytree']

        init_params = {
            'booster': 'gbtree',
            'objective': 'binary:logistic',
            'scale_pos_weight': weight,
            'max_depth': 8,
            'subsample': best_subsample,
            'nthread': n_jobs,
            'eval_metric': 'auc',
            'colsample_bytree': best_colsample_bytree,
            'eta': best_learning_rate
        }
        test(dtrain,dtest,best_n_estimators)

    t_finish = time()
    print('==============Costs time : %s s==============' % str(t_finish - t_start))
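
To reproduce: run the script once with offline = True to tune and pickle the best parameters into ./log/tune_xgb.csv, then set offline = False to reload them and train the final model, which writes its F1 to ./log/xgb_online_result.csv.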

6. More links to the official documentation

  • GitHub repository: https://github.com/hyperopt/hyperopt
  • Documentation: https://github.com/hyperopt/hyperopt/wiki
