# References:
# https://pypi.org/project/xgboost/#files
# https://my.oschina.net/u/2293326/blog/1838923
# https://dask-ml.readthedocs.io/en/stable/modules/generated/dask_ml.xgboost.XGBClassifier.html
# https://github.com/dmlc/xgboost/issues/2073
# https://blog.csdn.net/zc02051126/article/details/46711047
# TODO: learn how to reuse a previously trained model and continue training it on new data.
######
'''
https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
'''
######
#coding=utf-8
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics #Additional scklearn functions
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
import pickle
import os
import time
# Figure size for any feature-importance plots drawn later.
rcParams['figure.figsize'] = 12, 4
#train = pd.read_csv('part-format-all.csv', sep='\t', header=0)
# Tab-separated train/test data; first row is the header.
train = pd.read_csv('part-format-all.csv', sep='\t', header=0)
#test = pd.read_csv('test-format.csv', sep='\t', header=0)
test = pd.read_csv('part-format-test.csv', sep='\t', header=0)
#print train
# Name of the binary label column; every other column is used as a predictor.
target = 'label'
#IDcol = 'ID'
#Choose all predictors except target & IDcols
predictors = [x for x in train.columns if x not in [target]]
print "predictors: ", predictors
def create_feature_list(features):
    """Write one feature name per line to 'feature.list'.

    :param features: iterable of feature-name strings.
    """
    # ``with`` guarantees the handle is closed even on error (the original
    # leaked it on exception); the unused ``i`` counter was dropped.
    with open('feature.list', 'w') as outfile:
        for feat in features:
            outfile.write('{0}\n'.format(feat))
def create_feature_map(features):
    """Write an xgboost feature-map file 'feature.map'.

    Each line is ``<index>\t<name>\tq`` ('q' marks a quantitative feature,
    the format expected by xgboost's dump_model featmap argument).

    :param features: iterable of feature-name strings, in model column order.
    """
    # ``with`` closes the file reliably; ``enumerate`` replaces the manual
    # counter used before.
    with open('feature.map', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
def create_feature_info(features):
    """Persist feature metadata: the plain name list and the xgboost map."""
    for writer in (create_feature_list, create_feature_map):
        writer(features)
def report_model_accuracy(alg):
    """Print accuracy and AUC of *alg* on the module-level ``test`` frame,
    plus feature importances sorted by fscore (descending).

    :param alg: a fitted XGBClassifier.
    """
    # Align booster feature names with the sklearn-side column names so the
    # sklearn wrapper and the raw booster agree (xgboost/sklearn mismatch).
    alg.get_booster().feature_names = predictors
    # BUG FIX: ``predict`` has no ``validate_features=<list>`` keyword; passing
    # the predictor list raised a TypeError.  Plain predict is correct here.
    dtest_predictions = alg.predict(test[predictors])
    dtest_predprob = alg.predict_proba(test[predictors])[:, 1]
    print("Model Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(test[target].values, dtest_predictions))
    # BUG FIX: label said "(Train)" but the score is computed on the test set.
    print("AUC Score (Test): %f" % metrics.roc_auc_score(test[target], dtest_predprob))
    #print alg
    # {feature: fscore} sorted by score, highest first.  ``key=`` replaces the
    # Python-2-only cmp-style comparator used before.
    feat_imp = alg.get_booster().get_fscore()
    feat_imp = sorted(feat_imp.items(), key=lambda item: item[1], reverse=True)
    print(feat_imp)
    #feat_imp = pd.Series(feat_imp)
    #feat_imp.plot(kind='bar', title='Feature Importances')
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit *alg* on ``dtrain[predictors]`` against the module-level ``target``
    column, optionally sizing n_estimators via cross-validation first, then
    print an accuracy/importance report on the module-level test set.

    :param alg: an XGBClassifier (modified in place).
    :param dtrain: training DataFrame containing predictors and the target.
    :param predictors: list of predictor column names.
    :param useTrainCV: when True, run xgb.cv with early stopping to pick the
        boosting-round count before the final fit.
    :param cv_folds: number of CV folds for xgb.cv.
    :param early_stopping_rounds: stop CV when AUC has not improved this long.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)#, show_progress=False)
        # cv stops early, so its row count is the chosen number of rounds.
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the xgborithm on the data
    alg.fit(dtrain[predictors], dtrain[target],eval_metric='auc',verbose=True)
    #print model report
    report_model_accuracy(alg)
def train_rough_param():
    """Train a first "rough" XGBoost classifier with sensible starting
    hyper-parameters, write the feature list/map files, and return the model.
    """
    tic_start = time.time()
    alg = XGBClassifier( base_score=0.5,
        booster='gbtree',
        colsample_bylevel=1,
        colsample_bytree=0.8,  # column subsample ratio when constructing each tree
        gamma=0,  # min loss reduction to split a leaf further; larger = more conservative (typically 0.1-0.2)
        learning_rate=0.1,  # learning rate (eta)
        max_delta_step=0,  # max delta step allowed per tree weight estimate; usually 1-10 when nonzero
        max_depth=5,  # tree depth; deeper trees overfit more easily
        min_child_weight=1,
        # Default is 1: minimum sum of instance hessians (h) required in a child.
        # For imbalanced 0/1 classification, if h is around 0.01, a value of 1
        # means a leaf needs roughly 100 samples.  This strongly affects results:
        # it bounds the minimum second-order sum per leaf, and smaller values
        # make overfitting easier.
        missing=None,
        n_estimators=1000,  # number of trees
        nthread=None,  # CPU thread count (deprecated; use n_jobs instead)
        n_jobs=4,
        objective='binary:logistic',  # multi:softmax, multi:softprob
        # num_class=2,  # number of classes; used together with multi:softmax
        seed=None,  # random seed (deprecated; use random_state)
        random_state=27,  # random seed (replacement for the deprecated seed)
        reg_alpha=0,  # L1 regularization term
        reg_lambda=1,  # L2 regularization on weights; larger = model less prone to overfitting
        scale_pos_weight=1,  # >0 helps fast convergence on imbalanced data by reweighting positives
        silent=True,  # True suppresses run-time messages; set False to print them
        subsample=0.8)  # subsample ratio of the training instances
    modelfit(alg, train, predictors)
    create_feature_info(predictors)
    tic_end = time.time()
    print "train rough model time cost: %.2fs" % (tic_end - tic_start)
    return alg
def update_model(alg, param):
    """Apply the hyper-parameter overrides in *param* (a dict) to *alg*.

    BUG FIX: the original built ``alg.set_params(name=str(val))`` source text
    and ran it through ``eval`` — that breaks for string-valued parameters
    (e.g. ``objective='binary:logistic'`` loses its quotes) and executes
    dynamically assembled code needlessly.  ``set_params(**param)`` does the
    same job safely; ``dict.iteritems`` was also Python-2-only.

    :param alg: estimator exposing sklearn's ``set_params``.
    :param param: mapping of parameter name -> value.
    :return: the same *alg*, for chaining.
    """
    alg.set_params(**param)
    return alg
def tune_model(alg, param):
    """Grid-search *param* over *alg* on the module-level train set (ROC-AUC,
    5-fold CV), print the search results, report test-set accuracy, and return
    the refitted best estimator.

    NOTE(review): ``sklearn.grid_search`` and ``grid_scores_`` only exist in
    pre-0.20 scikit-learn; this script targets that era (it is Python 2 code).

    :param alg: starting estimator (its current params seed the search).
    :param param: dict of parameter name -> list of candidate values.
    :return: ``gsearch.best_estimator_``.
    """
    #print xgb
    # silent error of constructor does not seem to set parameter missing
    tic_start = time.time()
    alg.set_params(missing=None)
    gsearch = GridSearchCV(estimator = alg, param_grid = param, scoring='roc_auc', iid=False, cv=5)
    gsearch.fit(train[predictors], train[target])
    print "gsearch.grid_scores_:", gsearch.grid_scores_
    print "gsearch.best_params_:", gsearch.best_params_
    print "gsearch.best_score_:", gsearch.best_score_
    # Continue the tuning pipeline with the refitted best estimator.
    alg = gsearch.best_estimator_
    report_model_accuracy(alg)
    tic_end = time.time()
    print "tune model time cost: %.2fs" % (tic_end - tic_start)
    return alg
def main():
    """End-to-end tuning pipeline: train (or load) a rough model, grid-search
    hyper-parameters stage by stage, then save the final booster for C++ use.
    """
    model_path = "alg1.pickle.dat"
    FORCE_TRAIN_ROUGH = True

    # --- rough model -----------------------------------------------------
    print("\n=== rough model: ===")
    if not FORCE_TRAIN_ROUGH and os.path.exists(model_path):
        with open(model_path, 'rb') as f:
            alg1 = pickle.load(f)
        print("found model, loaded")
    else:
        alg1 = train_rough_param()
        # BUG FIX: the model used to be dumped to "xbg1.pickle.dat" (typo),
        # so the load branch above could never find it; save under model_path
        # and close the handle deterministically with ``with``.
        with open(model_path, 'wb') as f:
            pickle.dump(alg1, f)
    print(alg1)

    # Stage 1: tree structure — max_depth and min_child_weight.
    print("\n=== after tune max_depth and min_child_weight: ===")
    param2 = {'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)}
    alg2 = tune_model(alg1, param2)
    print(alg2)

    # Stage 2: minimum split loss (gamma).
    print("\n=== after tune gamma: ===")
    param3 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    alg3 = tune_model(alg2, param3)
    print(alg3)

    # Stage 3: row and column subsampling.
    print("\n=== after tune subsample and colsample_bytree: ===")
    param4 = {'subsample': [i / 10.0 for i in range(6, 10)],
              'colsample_bytree': [i / 10.0 for i in range(6, 10)]}
    alg4 = tune_model(alg3, param4)
    print(alg4)

    # Stage 4: L1 regularization.
    print("\n=== after tune regularization: ===")
    param5 = {'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]}
    alg5 = tune_model(alg4, param5)
    print(alg5)

    # Stage 5: learning rate.
    print("\n=== after tune learning rate: ===")
    param6 = {'learning_rate': [0.01, 0.05, 0.1, 0.2]}
    alg6 = tune_model(alg5, param6)
    print(alg6)

    alg_final = alg6
    # dump for c++ use
    #alg_final.dump_model('dump.raw.txt','featmap.txt')
    #pickle.dump(alg_final, open("alg_final.pickle.dat", "wb"))
    alg_final._Booster.save_model("alg_final.bin")
    #alg_final._Booster.dump_model('alg_final.raw.txt','featmap.txt')
    print("\n=== all process done OK ===")


if __name__ == "__main__":
    main()