- Background
Using the dataset generated in "Kaggle case study -- Instacart Market Basket Analysis (1)", this post compares the performance of xgboost and lightGBM.
- Results
Data size: (847466, 20)

| Model    | Training time | F1 score |
|----------|---------------|----------|
| xgboost  | 41 s          | 0.2799   |
| lightgbm | 9 s           | 0.2871   |

lightgbm trains much faster than xgboost (9 s vs 41 s here) with essentially no loss in quality; the scores above are the F1 values computed in the code below.
- Test code
import os
from datetime import datetime
import pandas as pd
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer sklearn

os.chdir(r'd:\pywork\Instacart')
data = pd.read_csv('data.txt')

# Keep the training portion and drop the ID columns, which carry no signal
train = data.loc[data.eval_set == "train", :].copy()
train.drop(['eval_set', 'user_id', 'product_id', 'order_id'], axis=1, inplace=True)
train.loc[:, 'reordered'] = train.reordered.fillna(0)

X_train, X_val, y_train, y_val = train_test_split(train.drop('reordered', axis=1),
                                                  train.reordered,
                                                  test_size=0.9, random_state=42)
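# Quick sanity check on the split (a sketch): with test_size=0.9, only about
# 10% of the ~847k rows (roughly 85k) go to X_train, which keeps both
# training runs below short.
print(X_train.shape, X_val.shape)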
import xgboost

d_train = xgboost.DMatrix(X_train, y_train)
xgb_params = {
    "objective": "reg:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,
    "max_depth": 6,
    "min_child_weight": 10,
    "gamma": 0.70,
    "subsample": 0.76,
    "colsample_bytree": 0.95,
    "alpha": 2e-05,
    "lambda": 10
}
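# Note: "reg:logistic" already outputs probabilities; "binary:logistic" is the
# more conventional objective for binary classification and should give the
# same predictions here (the two share the same logistic loss).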
watchlist = [(d_train, "train")]
xgb_start = datetime.now()
bst = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=80,
                    evals=watchlist, verbose_eval=10)
xgb_end = datetime.now()
print('spent time: ' + str((xgb_end - xgb_start).seconds) + '(s)')

import matplotlib.pyplot as plt
xgboost.plot_importance(bst)
plt.show()  # plot_importance only draws onto an Axes; show() renders the figure
''' train result:
[0] train-logloss:0.625642
[10] train-logloss:0.335753
[20] train-logloss:0.269213
[30] train-logloss:0.252115
[40] train-logloss:0.247442
[50] train-logloss:0.245712
[60] train-logloss:0.244735
[70] train-logloss:0.243973
[79] train-logloss:0.243472
spent time: 41(s)
'''
pre_data = xgboost.DMatrix(X_val, y_val)
predict = bst.predict(pre_data)
X_val['reorder'] = y_val
X_val['pre'] = predict

# Hand-rolled precision/recall/F1 at a 0.5 probability threshold
print("xgboost test score:")
precision = float(len(X_val[(X_val['pre'] > 0.5) & (X_val['reorder'] == 1)])) / \
            float(len(X_val[X_val['pre'] > 0.5]))
recall = float(len(X_val[(X_val['pre'] > 0.5) & (X_val['reorder'] == 1)])) / \
         float(len(X_val[X_val['reorder'] == 1]))
f1_score = 2 * (precision * recall) / (precision + recall)
print('f1_score: ' + str(f1_score))
# f1_score: 0.27986198335189855
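# Cross-check (a minimal sketch): the hand-rolled F1 above should match
# sklearn.metrics.f1_score at the same 0.5 threshold; the import is aliased
# to avoid clobbering the f1_score variable defined above.
from sklearn.metrics import f1_score as sk_f1
print('sklearn f1: ' + str(sk_f1(y_val, (predict > 0.5).astype(int))))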
import numpy as np
import lightgbm as lgb

# Cast labels to integers for lightgbm's binary objective
labels = np.array(y_train, dtype=np.int8)
d_train = lgb.Dataset(X_train, label=labels)
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'binary_logloss'},
'num_leaves': 96,
'max_depth': 10,
'feature_fraction': 0.9,
'bagging_fraction': 0.95,
'bagging_freq': 5
}
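# Note: this is not a strictly controlled comparison with the xgboost run
# above (100 vs 80 boosting rounds, leaf-wise growth with num_leaves=96 vs
# depth-wise growth with max_depth=6), so treat the timing/F1 gap as
# indicative rather than exact.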
ROUNDS = 100
watchlist = [d_train]
lgb_start = datetime.now()
bst = lgb.train(params=params, train_set=d_train, num_boost_round=ROUNDS,
                valid_sets=watchlist, verbose_eval=10)
lgb_end = datetime.now()
print('spent time: ' + str((lgb_end - lgb_start).seconds) + '(s)')
''' lgb train score:
[10] training's binary_logloss: 0.348039
[20] training's binary_logloss: 0.271007
[30] training's binary_logloss: 0.250972
[40] training's binary_logloss: 0.245258
[50] training's binary_logloss: 0.242898
[60] training's binary_logloss: 0.241338
[70] training's binary_logloss: 0.240099
[80] training's binary_logloss: 0.239047
[90] training's binary_logloss: 0.238009
[100] training's binary_logloss: 0.236996
spent time: 9(s)
'''
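# Unlike xgboost, lightgbm's Booster.predict takes the raw feature DataFrame
# directly; no Dataset wrapper is needed at inference time.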
# Drop the columns added during the xgboost evaluation before predicting again
X_val = X_val.drop(['reorder', 'pre'], axis=1)
predict = bst.predict(X_val)
X_val['reorder'] = y_val
X_val['pre'] = predict

print("lightgbm test score:")
precision = float(len(X_val[(X_val['pre'] > 0.5) & (X_val['reorder'] == 1)])) / \
            float(len(X_val[X_val['pre'] > 0.5]))
recall = float(len(X_val[(X_val['pre'] > 0.5) & (X_val['reorder'] == 1)])) / \
         float(len(X_val[X_val['reorder'] == 1]))
f1_score = 2 * (precision * recall) / (precision + recall)
print('F1 score is: ' + str(f1_score))
# F1 score is: 0.28706938762
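# Mirror the xgboost importance plot for the lightgbm model;
# lgb.plot_importance is lightgbm's counterpart to xgboost.plot_importance.
lgb.plot_importance(bst)
plt.show()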