$N_{estimators}$
导入所需工具包
from xgboost import XGBClassifier
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
%matplotlib inline
读取数据
# Directory that holds the data files, relative to the working directory.
dpath = './data/'

# Read the training CSV into a DataFrame and preview its first rows.
train = pd.read_csv("".join((dpath, "Otto_train.csv")))
train.head()
Variable Identification
选择该数据集是因为该数据集的特征单一,我们可以在特征工程方面少做些工作,集中精力放在参数调优上
Target 分布,看看各类样本分布是否均衡
# Bar chart of the number of training rows per target class, used to judge
# whether the classes are balanced.
# The column is passed through the `x` keyword: seaborn 0.11 deprecated
# passing it positionally, and from 0.12 the first positional argument is
# interpreted as `data` instead.
sns.countplot(x=train['target'])
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences');  # trailing ';' keeps the notebook from echoing the return value
每类样本分布不是很均匀,所以交叉验证时也考虑各类样本按比例抽取
# Labels: drop the first 6 characters of every target string and turn the
# remainder into a 0-based integer id (presumably "Class_<k>" -> k - 1;
# confirm against the CSV).
y_train = train['target'].map(lambda label: int(label[6:]) - 1)

# Features: every column that is left once the row id and the label are
# removed, as a plain NumPy array.
train = train.drop(labels=["id", "target"], axis=1)
X_train = np.array(train)
各类样本不均衡,交叉验证时采用StratifiedKFold,在每折采样时各类样本按比例采样
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
# so that the same folds are produced on every run.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=3,
)
默认参数,此时学习率为0.1,比较大,观察弱分类器数目的大致范围
(采用默认参数配置,看看模型是过拟合还是欠拟合)
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=None,
             early_stopping_rounds=50, num_class=9):
    """Choose n_estimators by cross-validation, then fit `alg` on all data.

    When `useTrainCV` is true, `xgb.cv` is run with the estimator's own
    booster parameters, the 'mlogloss' metric and early stopping.  The number
    of rows in the returned history becomes the estimator's `n_estimators`.
    The history is printed, written to 'my_preds_4_1.csv' and plotted to
    'n_estimators.png'.

    In every case the estimator is then fitted on the full training data and
    its log loss on that same data is printed.

    Args:
        alg: XGBoost scikit-learn style estimator.  It is modified in place
            (`set_params` and `fit`).
        X_train: feature matrix passed to `xgb.DMatrix` and `alg.fit`.
        y_train: labels for `X_train` (presumably integer class ids in
            0..num_class-1 -- confirm against the caller).
        useTrainCV: run the cross-validation step before fitting.
        cv_folds: forwarded to `xgb.cv` as `folds`.
        early_stopping_rounds: forwarded to `xgb.cv`.
        num_class: number of target classes given to the booster.  Defaults
            to 9, the value that used to be hard-coded.

    Returns:
        None.  Results are reported through prints and the files above.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = num_class

        xgtrain = xgb.DMatrix(X_train, label=y_train)

        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            folds=cv_folds,
            metrics='mlogloss',
            early_stopping_rounds=early_stopping_rounds,
        )

        # One row per boosting round kept by cv, so the row count is the
        # number of trees to use for the final fit.
        n_estimators = cvresult.shape[0]
        alg.set_params(n_estimators=n_estimators)

        print (cvresult)
        # xgb.cv already returns a DataFrame here, so it can be saved as is.
        cvresult.to_csv('my_preds_4_1.csv', index_label = 'n_estimators')

        # Plot mean log loss per round with its standard deviation as error bars.
        test_means = cvresult['test-mlogloss-mean']
        test_stds = cvresult['test-mlogloss-std']

        train_means = cvresult['train-mlogloss-mean']
        train_stds = cvresult['train-mlogloss-std']

        x_axis = range(0, n_estimators)
        pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
        pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
        pyplot.title("XGBoost n_estimators vs Log Loss")
        pyplot.xlabel( 'n_estimators' )
        pyplot.ylabel( 'Log Loss' )
        # Without a legend the 'Test'/'Train' labels are never drawn.
        pyplot.legend()
        pyplot.savefig( 'n_estimators.png' )

    # Fit the algorithm on the data.
    # `eval_metric` is no longer passed to fit(): that argument was deprecated
    # in XGBoost 1.6 and removed in 2.0, and no evaluation set is supplied
    # here for the metric to be computed on.
    alg.fit(X_train, y_train)

    # Predict on the training set itself (this is a training loss, not a
    # held-out estimate).
    train_predprob = alg.predict_proba(X_train)
    logloss = log_loss(y_train, train_predprob)

    # Print model report.
    print ("logloss of train :" )
    print (logloss)
# Baseline classifier used for the first tuning step.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    # A large value is fine: the cross-validation inside modelfit replaces
    # it with a suitable n_estimators.
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='multi:softprob',
    seed=3,
)

# Cross-validate with the stratified folds, then fit on the full training set.
modelfit(xgb1, X_train, y_train, cv_folds=kfold)
# Reload the saved cross-validation history and redraw the full train/test
# log-loss curves.
# pd.read_csv(..., index_col=0) replaces pd.DataFrame.from_csv, which was
# deprecated in pandas 0.21 and removed in pandas 1.0.
cvresult = pd.read_csv('my_preds_4_1.csv', index_col=0)

# Mean log loss per boosting round, with the standard deviation as error bars.
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']

train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']

x_axis = range(0, cvresult.shape[0])

pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel( 'n_estimators' )
pyplot.ylabel( 'Log Loss' )
# Without a legend the 'Test'/'Train' labels are never drawn.
pyplot.legend()
pyplot.savefig( 'n_estimators4_1.png' )

pyplot.show()
# Detail view of the cross-validation history: skip the first 100 boosting
# rounds so the later part of the curves is not dwarfed by the early drop.
# pd.read_csv(..., index_col=0) replaces pd.DataFrame.from_csv, which was
# deprecated in pandas 0.21 and removed in pandas 1.0.
cvresult = pd.read_csv('my_preds_4_1.csv', index_col=0)
cvresult = cvresult.iloc[100:]

# Mean log loss per boosting round, with the standard deviation as error bars.
test_means = cvresult['test-mlogloss-mean']
test_stds = cvresult['test-mlogloss-std']

train_means = cvresult['train-mlogloss-mean']
train_stds = cvresult['train-mlogloss-std']

# The x axis starts at 100 to match the rows kept by the slice above.
x_axis = range(100, cvresult.shape[0] + 100)

fig = pyplot.figure(figsize=(10, 10), dpi=100)
pyplot.errorbar(x_axis, test_means, yerr=test_stds, label='Test')
pyplot.errorbar(x_axis, train_means, yerr=train_stds, label='Train')
pyplot.title("XGBoost n_estimators vs Log Loss")
pyplot.xlabel( 'n_estimators' )
pyplot.ylabel( 'Log Loss' )
# Without a legend the 'Test'/'Train' labels are never drawn.
pyplot.legend()
pyplot.savefig( 'n_estimators_detail.png' )

pyplot.show()