由于是比赛,我们需要把效果尽可能的提升,所以可以采用一些集成学习算法,对各个模型的预测结果进一步压榨,进行多模型融合,得到最终的结果。
目录
1. Stacking
之前的每个模型,包括机器学习模型和深度学习模型,都有在训练集上的预测结果(train_samples,classes)和测试集上的预测结果(test_samples,classes)。
我们可以挑选其中的n个模型,将其在训练集和测试集上的预测结果在水平方向上进行拼接(train_samples,classes*n),(test_samples,classes*n)。
然后构造一个新的训练集,输入是n个模型在训练集上的预测结果的拼接(train_samples,classes*n),输出/标签是每个训练样本对应的真实标签(train_samples)。为了使模型更加丰富,我们使用三个深度不同(叶子结点数不同)的LightGBM模型(浅,中,深),每个深度10个随机种子,共30个不同的模型。每个模型都有500个树。基于新的训练集进行训练。
每个模型采用10折交叉验证,每个模型会得到一个在训练集上的预测结果(train_samples,classes)以及10个在测试集上的预测结果,这10个结果直接取平均(test_samples,classes)。所以最终会得到30个在训练集上的预测结果(train_samples,classes)和30个在测试集上的预测结果。
def get_layer1_input():
fasttext_train = np.load('../../oof_pred/fasttext/1/fasttext_trainable_train_0.7588.npy')
fasttext_test = np.load('../../oof_pred/fasttext/1/fasttext_trainable_test_0.7588.npy')
fasttext_char_train1 = np.load('../../oof_pred/fasttext_char/1/fasttext_char_train_0.7156.npy')
fasttext_char_test1 = np.load('../../oof_pred/fasttext_char/1/fasttext_char_test_0.7156.npy')
fasttext_char_train = fasttext_char_train1
fasttext_char_test = fasttext_char_test1
# high_dropout_rnn_train = np.load(
# '../../oof_pred/high_dropout_pooled_bilstm/1/high_dropout_pooled_bilstm_train_0.7611.npy')
# high_dropout_rnn_test = np.load(
# '../../oof_pred/high_dropout_pooled_bilstm/1/high_dropout_pooled_bilstm_test_0.7611.npy')
# pooled_bilstm_train = np.load('../../oof_pred/pooled_bilstm/1/pooled_bilstm_train_0.7623.npy')
# pooled_bilstm_test = np.load('../../oof_pred/pooled_bilstm/1/pooled_bilstm_test_0.7623.npy')
#
# pooled_bilstm_2layer_train = np.load('../../oof_pred/pooled_bilstm_2layer/1/pooled_bilstm_2layer_train_0.7647.npy')
# pooled_bilstm_2layer_test = np.load('../../oof_pred/pooled_bilstm_2layer/1/pooled_bilstm_2layer_test_0.7647.npy')
textcnn_train = np.load('../../oof_pred/textcnn/1/textcnn_train_0.7602.npy')
textcnn_test = np.load('../../oof_pred/textcnn/1/textcnn_test_0.7602.npy')
textcnn_char_train = np.load('../../oof_pred/textcnn_char/1/textcnn_char_train_0.7288.npy')
textcnn_char_test = np.load('../../oof_pred/textcnn_char/1/textcnn_char_test_0.7288.npy')
# textgru_ultimate_train = np.load('../../oof_pred/textgru_ultimate/1/textgru_ultimate_train_0.7665.npy')
# textgru_ultimate_test = np.load('../../oof_pred/textgru_ultimate/1/textgru_ultimate_test_0.7665.npy')
# textgrucnn_train = np.load('../../oof_pred/textgrucnn/1/textgrucnn_train_0.7683.npy')
# textgrucnn_test = np.load('../../oof_pred/textgrucnn/1/textgrucnn_test_0.7683.npy')
lgbm_svd_train = np.load('../../oof_pred/lgbm_svd_train_0.7373.npy')
lgbm_svd_test = np.load('../../oof_pred/lgbm_svd_test_0.7373.npy')
linearsvc_svd_train = np.load('../../oof_pred/linearsvc_svd_train_0.7078.npy')
linearsvc_svd_test = np.load('../../oof_pred/linearsvc_svd_test_0.7078.npy')
linearsvc_train = np.load('../../oof_pred/linearsvc_word_train_0.7772.npy')
linearsvc_test = np.load('../../oof_pred/linearsvc_word_test_0.7772.npy')
lr_train = np.load('../../oof_pred/lr_word_train_0.7720.npy')
lr_test = np.load('../../oof_pred/lr_word_test_0.7720.npy')
multinomialnb_train = np.load('../../oof_pred/multinomialNB_word_train_0.7375.npy')
multinomialnb_test = np.load('../../oof_pred/multinomialNB_word_test_0.7375.npy')
rnn_based_model_train = np.load('../../oof_pred/rnn_based_model_train_0.7817.npy')
rnn_based_model_test = np.load('../../oof_pred/rnn_based_model_test_0.7817.npy')
model_names = ['fasttext', 'fasttext_char', 'rnn_based_model', 'textcnn', 'textcnn_char',
'lgbm_svd', 'linearsvc_svd', 'linear_svc', 'lr', 'mnb']
train_data = (
fasttext_train, fasttext_char_train, rnn_based_model_train, textcnn_train, textcnn_char_train,
lgbm_svd_train,
linearsvc_svd_train, linearsvc_train, lr_train, multinomialnb_train)
train_label = np.load('../../data/label.npy')
test_x = (
fasttext_test, fasttext_char_test, rnn_based_model_test, textcnn_test, textcnn_char_test, lgbm_svd_test,
linearsvc_svd_test, linearsvc_test, lr_test, multinomialnb_test)
train_data = np.hstack(train_data)
test_x = np.hstack(test_x)
return train_data, train_label, test_x
def stacking_layer1_oof_pred(model, model_name, train_data, train_label, test_x, num_fold, layer=1):
fold_len = train_data.shape[0] // num_fold
skf_indices = []
skf = StratifiedKFold(n_splits=num_fold, shuffle=True, random_state=2018)
for i, (train_idx, valid_idx) in enumerate(skf.split(np.ones(train_data.shape[0]), train_label)):
skf_indices.extend(valid_idx.tolist())
train_pred = np.zeros((train_data.shape[0], 19))
test_pred = np.zeros((test_x.shape[0], 19))
for fold in range(num_fold):
print(f'Processing fold {fold}...')
fold_start = fold * fold_len
fold_end = (fold + 1) * fold_len
if fold == num_fold - 1:
fold_end = train_data.shape[0]
train_indices = skf_indices[:fold_start] + skf_indices[fold_end:]
test_indices = skf_indices[fold_start:fold_end]
train_x = train_data[train_indices]
train_y = train_label[train_indices]
cv_test_x = train_data[test_indices]
model.fit(train_x, train_y)
pred = model.predict_proba(cv_test_x)
train_pred[test_indices] = pred
pred = model.predict_proba(test_x)
test_pred += pred / num_fold
y_pred = np.argmax(train_pred, axis=1)
score = f1_score(train_label, y_pred, average='macro')
print(score)
pred_dir = f'../../oof_pred/layer{layer}_{model_name}/{model.random_state}/'
if not os.path.exists(pred_dir):
os.makedirs(pred_dir)
train_path = pred_dir + f'train_{score:.6f}'
test_path = pred_dir + f'test_{score:.6f}'
np.save(train_path, train_pred)
np.save(test_path, test_pred)
if __name__ == '__main__':
train_data, train_label, test_data = get_layer1_input()
# shallow lgbm
for i in range(1, 11):
model = LGBMClassifier(num_leaves=7, learning_rate=0.05, n_estimators=500, subsample=0.8, colsample_bytree=0.8,
random_state=i)
stacking_layer1_oof_pred(model, f'lgbm_7leaves', train_data, train_label, test_data, 10)
# medium lgbm
for i in range(1, 11):
model = LGBMClassifier(num_leaves=31, learning_rate=0.05, n_estimators=500, subsample=0.8, colsample_bytree=0.8,
random_state=i)
stacking_layer1_oof_pred(model, f'lgbm_31leaves', train_data, train_label, test_data, 10)
# deep lgbm
for i in range(1, 11):
model = LGBMClassifier(num_leaves=127, learning_rate=0.05, n_estimators=500, subsample=0.8,
colsample_bytree=0.8, random_state=i)
stacking_layer1_oof_pred(model, f'lgbm_127leaves', train_data, train_label, test_data, 10)
2. HillClimbing
从当前节点开始,和周围节点进行比较,若当前节点值最大,就返回当前节点作为最大值;否则就用最大的邻居节点,替换当前节点,实现向山峰高处攀爬的目的。如此循环直到达到最高点。优点:避免遍历所有节点,缺点:局部最优。
基于30个在训练集上的预测结果(train_samples,classes)及其对应的标签(train_samples),进行迭代,计算每个结果对应的权重;然后用这些权重对30个在测试集上的预测结果作加权和,最终得到一个在测试集上的预测结果(test_samples,classes)。再按行取argmax,得到在测试集上的预测标签,进行提交即可。
def faster_hill_climbing_ensemble(model_names, train_preds, test_preds, n_iter=20, file_name=None):
"""
Run Hill climbing ensemble to find sub-optimal weights for models.
:param model_names: list of model names
:param train_preds: list of out-of-fold predictions of different models on training set
:param test_preds: list of out-of-fold predictions of different models on test set
:param n_iter: number of iteration
:param file_name: save blending result if file_name is provided.
:rtype: None
"""
y_true = np.load('../../data/label.npy')
pred_indices = []
# rt = []
best_f1 = -1
best_pred_indices = None
for iter in range(n_iter):
best_pred_index = -1
current_best_f1 = -1
# Select a best prediction
for i, pred in enumerate(train_preds):
pred_indices.append(i)
coefs = Counter(pred_indices)
y_pred = np.zeros((len(y_true), 19))
total = len(pred_indices)
for idx in coefs.keys():
y_pred += (coefs[idx] / total) * train_preds[idx]
y_pred = y_pred.argmax(axis=1)
f1 = f1_score(y_true, y_pred, average='macro')
if f1 > current_best_f1:
best_pred_index = i
current_best_f1 = f1
pred_indices.pop(-1)
pred_indices.append(best_pred_index)
if current_best_f1 > best_f1:
best_f1 = current_best_f1
best_pred_indices = pred_indices.copy()
print(f'Epoch {iter}: {current_best_f1:.6f}')
counter = Counter(best_pred_indices)
total = len(best_pred_indices)
for i, count in counter.items():
print(f'{model_names[i]} : {count/total:.4f}')
ensemble_train = np.zeros_like(train_preds[0])
ensemble_test = np.zeros_like(test_preds[0])
for i in range(len(model_names)):
ensemble_train += train_preds[i] * (counter[i] / total)
ensemble_test += test_preds[i] * (counter[i] / total)
score = f1_score(y_true, np.argmax(ensemble_train, axis=1), average='macro')
print(f'Hill climb blending of {len(model_names)} models: f1_macro {score:.6f}')
if file_name is not None:
np.save(f'../../oof_pred/{file_name}_train_{score:.6f}', ensemble_train)
np.save(f'../../oof_pred/{file_name}_test_{score:.6f}', ensemble_test)