LightGBM
- 更快的训练速度和更高的效率: LightGBM使用基于直方图的算法。例如,它将连续的特征值分桶(buckets)装进离散的箱子(bins),这使得训练过程变得更快。
- 更低的内存占用:使用离散的箱子(bins)保存并替换连续值导致更少的内存占用。
- 更高的准确率(相比于其他任何提升算法) : 它通过leaf-wise分裂方法产生比level-wise分裂方法更复杂的树,这就是实现更高准确率的主要因素。然而,它有时候会导致过拟合,但是我们可以通过设置 max_depth 参数来防止过拟合的发生。
- 大数据处理能力: 相比于XGBoost,由于它在训练时间上的缩减,它同样能够具有处理大数据的能力。
- 支持并行学习
二分类任务
# LightGBM hyper-parameters for the binary-classification task.
params = {
    "objective": "binary",                # training objective
    "boosting": "gbdt",
    "metric": {'binary_logloss', 'auc'},  # evaluation metrics tracked on valid sets
    "learning_rate": 0.02,
    "max_depth": -1,                      # -1 = no depth limit
    "num_leaves": 38,                     # strongest lever on model capacity; too large overfits
    "feature_fraction": 0.9,              # column subsampling per tree
    "bagging_freq": 1,                    # row subsampling every iteration
    "bagging_fraction": 0.7,
    "bagging_seed": 11,
    "min_sum_hessian_in_leaf": 6,
    "min_data_in_leaf": 50,
    "lambda_l1": 0.1,                     # L1 regularization
    # "lambda_l2": 0.001,                 # L2 regularization (disabled)
    "verbosity": -1,                      # silence per-iteration logging
    "nthread": 4,                         # -1 would use all available threads
    "random_state": 2019,                 # fixed seed for reproducible runs
    # "device": "gpu",                    # enable if a GPU build of lightgbm is installed
}
# 5-fold out-of-fold training: accumulate OOF probabilities for train rows and
# fold-averaged probabilities for the test set.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x.shape[0], ))
test_pred_prob = np.zeros((test.shape[0], ))

## train and predict
feature_importance_df = pd.DataFrame()
# Fixed: split the same frame that is indexed below (`train_x`), not `train` —
# a length mismatch between the two would mis-align fold indices.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y[val_idx])
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    # Out-of-fold probabilities, predicted at the best early-stopping iteration.
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    # Record this fold's split-based feature importances.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Average test predictions across all folds.
    test_pred_prob += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits
# Convert the fold-averaged probabilities into hard 0/1 labels.
# Fixed: the original loop rebound `result` on every iteration, so only the
# label of the LAST prediction survived; build the full label list instead.
threshold = 0.5
result = [1 if pred > threshold else 0 for pred in test_pred_prob]
回归
params = {'objective': 'regression',
'boosting': 'gbdt',
'metric': 'mae',
'learning_rate': 0.02,
'max_depth': -1,
'num_leaves': 38,
"feature_fraction": 0.9,
"bagging_freq": 1,
"bagging_fraction": 0.7,
"bagging_seed": 11,
"min_sum_hessian_in_leaf": 6,
'min_data_in_leaf': 50,
"lambda_l1": 0.1,
"verbosity": -1,
"nthread": 4,
"random_state": 2019,
# 'device': 'gpu'
}
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Generalized to accept any array-likes (lists, Series, ndarrays) by
    converting through np.asarray first.

    NOTE: any zero in y_true makes the corresponding term inf/nan, so the
    result is only meaningful for strictly non-zero targets.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
def smape_func(preds, dtrain):
    """Custom SMAPE-style eval for lgb.train.

    Returns the (name, value, is_higher_better) triple LightGBM expects;
    is_higher_better is False since lower SMAPE is better. The denominator
    is floored at 0.5 + epsilon to avoid blow-ups near zero.
    """
    labels = dtrain.get_label().values
    epsilon = 0.1
    denom = np.maximum(0.5 + epsilon, np.abs(labels) + np.abs(preds) + epsilon)
    score = 2 * np.mean(np.abs(labels - preds) / denom)
    return 'smape', float(score), False
# 5-fold out-of-fold regression training; targets are modeled in log1p space.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof = np.zeros(train_x.shape[0])
predictions = np.zeros(test.shape[0])
train_y = np.log1p(train_y)  # data smoothing: compress the target's dynamic range
feature_importance_df = pd.DataFrame()
for fold_n, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_n + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=200,
                    early_stopping_rounds=200)
    # Out-of-fold predictions (still in log1p space) at the best iteration.
    oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    # Fixed: the loop variable here is `fold_n`; the original referenced the
    # undefined name `fold_` (copied from the binary loop), a NameError.
    fold_importance_df["fold"] = fold_n + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Average test predictions across folds.
    predictions += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits
# Metrics are reported in log1p space, matching the transformed train_y/oof.
print('mse %.6f' % mean_squared_error(train_y, oof))
print('mae %.6f' % mean_absolute_error(train_y, oof))
# Fixed: the original then did `result = predictions`, which immediately
# discarded the expm1 inversion; keep only the back-transformed result.
result = np.expm1(predictions)  # invert the log1p target transform
多分类
# LightGBM hyper-parameters for the 33-class classification task.
params = {
    "objective": "multiclass",
    "num_class": 33,                 # number of target classes
    "boosting": "gbdt",
    "metric": "multi_logloss",       # evaluation metric on valid sets
    "learning_rate": 0.02,
    "max_depth": -1,                 # -1 = unlimited depth
    "num_leaves": 38,                # main capacity knob
    "feature_fraction": 0.9,
    "bagging_freq": 1,
    "bagging_fraction": 0.7,
    "bagging_seed": 11,
    "min_sum_hessian_in_leaf": 6,
    "min_data_in_leaf": 50,
    "lambda_l1": 0.1,
    # "lambda_l2": 0.001,            # L2 regularization (disabled)
    "verbosity": -1,
    "nthread": 4,
    "random_state": 2019,            # fixed seed for reproducibility
    # "device": "gpu",               # enable for a GPU build of lightgbm
}
# 5-fold out-of-fold multiclass training: per-row class-probability matrices,
# averaged over folds for the test set, then argmax for the final labels.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
prob_oof = np.zeros((train_x.shape[0], 33))
test_pred_prob = np.zeros((test.shape[0], 33))

## train and predict
feature_importance_df = pd.DataFrame()
# Fixed: split the same frame that is indexed below (`train_x`), not `train` —
# a length mismatch between the two would mis-align fold indices.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x)):
    print("fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(train_x.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_x.iloc[val_idx], label=train_y.iloc[val_idx])
    clf = lgb.train(params,
                    trn_data,
                    num_round,
                    valid_sets=[trn_data, val_data],
                    verbose_eval=20,
                    early_stopping_rounds=60)
    # Out-of-fold class probabilities, shape (n_val, 33), at the best iteration.
    prob_oof[val_idx] = clf.predict(train_x.iloc[val_idx], num_iteration=clf.best_iteration)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Average test class probabilities across folds.
    test_pred_prob += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

# Final predicted class = most probable class per row.
result = np.argmax(test_pred_prob, axis=1)
特征曲线
import matplotlib.pyplot as plt
def plot_feature_importance(train, model):
    """Print and plot the split-count feature importances of a trained booster.

    NOTE(review): skips train's first column (columns[1:]) — presumably an
    id/label column; confirm against the training pipeline.
    """
    names = list(train.columns[1:])
    importances = list(model.feature_importance(importance_type='split', iteration=-1))
    importance_table = pd.DataFrame({'feature_name': names, 'importance': importances})
    print(importance_table)
    positions = range(len(names))
    plt.xticks(positions, names, rotation=45, fontsize=10)
    plt.plot(positions, importances)
    plt.show()