Logistic Regression Model
1 Learning Content
Logistic regression models
Tree models
Ensemble models
    Bagging-based ensemble models
        Random forest model
        Randomized tree model
    Boosting-based ensemble models
    Stacking-based ensemble models
Model comparison and performance evaluation
Model tuning
    Greedy tuning
    Grid search tuning
    Bayesian tuning
2 Code Implementation
2.1 Importing the Data
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
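The later snippets assume a sample_feature DataFrame and a continuous_feature_names list carried over from the earlier feature-engineering step; neither is defined in this note. A minimal loading sketch, purely as an assumption (the file path, separator, and ID column name are placeholders, not the original pipeline):
# Hypothetical reconstruction of the inputs used below; adjust to the actual
# feature-engineering output.
sample_feature = pd.read_csv('data/train.csv', sep=' ')
continuous_feature_names = [col for col in sample_feature.select_dtypes(include=[np.number]).columns
                            if col not in ('SaleID', 'price')]  # assumed ID column; 'price' is the target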
2.2 Five-Fold Cross-Validation
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb

sample_feature['notRepairedDamage'] = sample_feature['notRepairedDamage'].astype(np.float32)
train = sample_feature[continuous_feature_names + ['price']]
train_X = train[continuous_feature_names]
train_y = train['price']

# Custom evaluation function: MAE on the original price scale.
# Labels are assumed to be log1p-transformed, hence the expm1 here.
def myFeval(preds, train_data):
    label = train_data.get_label()
    score = mean_absolute_error(np.expm1(label), np.expm1(preds))
    return 'myFeval', score, False  # False: lower is better

param = {'boosting_type': 'gbdt',
         'num_leaves': 64,
         'max_depth': 10,
         'lambda_l2': 1,            # L2 regularization, against overfitting
         'lambda_l1': 1,            # L1 regularization, against overfitting
         'min_data_in_leaf': 20,    # against overfitting; rarely needs much tuning
         'objective': 'regression_l1',
         'learning_rate': 0.01,
         'min_child_samples': 20,
         'verbosity': -1,
         'feature_fraction': 0.8,
         'bagging_freq': 1,
         'bagging_fraction': 0.8,
         'bagging_seed': 11,
         'metric': 'mae',
         }

# X_data, Y_data and X_test are assumed to come from the feature-engineering
# step (Y_data log1p-transformed), analogous to train_X / train_y above.
folds = KFold(n_splits=5, shuffle=True, random_state=2020)
oof_lgb = np.zeros(len(X_data))                # out-of-fold predictions
predictions_lgb = np.zeros(len(X_test))        # fold-averaged test predictions
predictions_train_lgb = np.zeros(len(X_data))  # fold-averaged train predictions

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data, Y_data)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_data[trn_idx], Y_data[trn_idx])
    val_data = lgb.Dataset(X_data[val_idx], Y_data[val_idx])
    num_round = 50000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=300, early_stopping_rounds=300, feval=myFeval)
    # Dump per-fold feature importances for inspection.
    feature_list_name = clf.feature_name()
    df = pd.DataFrame(feature_list_name, columns=['feature'])
    df['importance'] = list(clf.feature_importance())
    df = df.sort_values(by='importance', ascending=False)
    df.to_csv("feature_importance" + str(fold_) + ".csv", index=False)
    oof_lgb[val_idx] = clf.predict(X_data[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    predictions_train_lgb += clf.predict(X_data, num_iteration=clf.best_iteration) / folds.n_splits

print("lightgbm score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_lgb), np.expm1(Y_data))))
In practice, a model's fit on the training set is usually quite good, but its fit on data outside the training set often fails to satisfy. We therefore do not train on the entire dataset; instead we set a portion aside for evaluation, which gives a relatively objective measure of how well the fitted parameters generalize beyond the training data. This idea is called cross-validation.
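As a minimal, self-contained illustration of the idea (the plain linear model here is only illustrative; it reuses the train_X features and the log1p-transformed price from above):
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Each of the five folds is held out once while the model trains on the
# other four, yielding five out-of-sample MAE estimates.
scores = cross_val_score(LinearRegression(),
                         X=train_X, y=np.log(train_y + 1),
                         cv=5, scoring=make_scorer(mean_absolute_error))
print(scores, scores.mean())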
2.3 Simulating the Real Business Scenario
Cross-validation with random shuffling can leak future information when the data has a time dimension: in a real business setting we predict future prices from past transactions. Assuming the rows are in time order, the data is therefore split by position instead, training on the first four fifths and validating on the last fifth.
sample_feature = sample_feature.reset_index(drop=True)
split_point = len(sample_feature) // 5 * 4

# Note: .loc slicing is inclusive, so the row at split_point lands in both
# frames; with ~1/5 of the data held out this overlap is negligible.
train = sample_feature.loc[:split_point].dropna()
val = sample_feature.loc[split_point:].dropna()

train_X = train[continuous_feature_names]
train_y_ln = np.log(train['price'] + 1)
val_X = val[continuous_feature_names]
val_y_ln = np.log(val['price'] + 1)

# `model` is assumed to be an already-constructed regressor from an earlier
# step, e.g. sklearn's LinearRegression.
model = model.fit(train_X, train_y_ln)
mean_absolute_error(val_y_ln, model.predict(val_X))
# 0.19443858353490887
2.4 Model Tuning
2.4.1 Greedy Tuning
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error

# Candidate values (assumed for illustration; the original lists are not shown in this note).
objective = ['regression', 'regression_l1', 'mape', 'huber', 'fair']
num_leaves = [3, 5, 10, 15, 20, 40, 55]
max_depth = [3, 5, 10, 15, 20, 40, 55]

# Greedy tuning: find the best value for one parameter, fix it, then move on to the next.
best_obj = dict()
for obj in objective:
    model = LGBMRegressor(objective=obj)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_obj[obj] = score

best_leaves = dict()
for leaves in num_leaves:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0], num_leaves=leaves)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_leaves[leaves] = score

best_depth = dict()
for depth in max_depth:
    model = LGBMRegressor(objective=min(best_obj.items(), key=lambda x: x[1])[0],
                          num_leaves=min(best_leaves.items(), key=lambda x: x[1])[0],
                          max_depth=depth)
    score = np.mean(cross_val_score(model, X=train_X, y=train_y_ln, verbose=0, cv=5,
                                    scoring=make_scorer(mean_absolute_error)))
    best_depth[depth] = score
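The learning outline above also lists grid tuning, which has no code in this note; a minimal counterpart using scikit-learn's GridSearchCV over the same candidate values:
from sklearn.model_selection import GridSearchCV

# Unlike the greedy search above, the grid search evaluates every
# combination of the candidate values.
parameters = {'objective': objective, 'num_leaves': num_leaves, 'max_depth': max_depth}
clf = GridSearchCV(LGBMRegressor(), parameters, cv=5)
clf.fit(train_X, train_y_ln)
print(clf.best_params_)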
2.4.2 Bayesian Tuning
from bayes_opt import BayesianOptimization

# bayes_opt maximizes its objective, so return 1 - MAE: a smaller error
# yields a larger target value. Note the log-scale target train_y_ln is
# used here, consistent with the greedy search above.
def rf_cv(num_leaves, max_depth, min_child_samples, bagging_fraction, feature_fraction, bagging_freq):
    val = cross_val_score(
        LGBMRegressor(objective='regression_l1',
                      num_leaves=int(num_leaves),
                      max_depth=int(max_depth),
                      min_child_samples=int(min_child_samples),
                      boosting_type='gbdt',
                      lambda_l2=1,          # L2 regularization, against overfitting
                      lambda_l1=1,          # L1 regularization, against overfitting
                      min_data_in_leaf=20,  # against overfitting; rarely needs much tuning
                      learning_rate=0.1,
                      verbosity=-1,
                      bagging_fraction=round(bagging_fraction, 2),
                      feature_fraction=round(feature_fraction, 2),
                      bagging_freq=int(bagging_freq),
                      bagging_seed=11,
                      metric='mae',
                      ),
        X=train_X, y=train_y_ln, verbose=0, cv=5, scoring=make_scorer(mean_absolute_error)
    ).mean()
    return 1 - val

# Search ranges for each hyperparameter; bayes_opt only proposes floats,
# so integer parameters are cast inside rf_cv.
rf_bo = BayesianOptimization(
    rf_cv,
    {
        'num_leaves': (2, 100),
        'max_depth': (2, 100),
        'min_child_samples': (2, 100),
        'bagging_fraction': (0.5, 1.0),
        'feature_fraction': (0.5, 1.0),
        'bagging_freq': (0, 100),
    }
)
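The snippet stops before actually running the search; a short continuation using bayes_opt's standard API (the init_points / n_iter budget is an arbitrary choice):
rf_bo.maximize(init_points=5, n_iter=25)  # random warm-up probes, then model-guided steps
print(rf_bo.max['params'])       # best hyperparameters found
print(1 - rf_bo.max['target'])   # convert the target back to the cross-validated MAE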
3 Summary
This task covered model building and parameter tuning, training the model toward better performance and validating the result.