本次,我尝试了集成学习以及深度学习的方法,进行上分。
在上次我尝试过的所有特征工程方案中,效果最好的依旧是采取历史平移+差分特征+窗口统计的办法:
# Merge train and test so lag/rolling features can be computed over the full timeline.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# NOTE(review): descending sort on ['id', 'dt'] — presumably dt counts backwards
# (dt=1 is the most recent day), so shift(i) looks i days into the past. Confirm
# against the data dictionary.
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features: target shifted 10..35 steps within each id.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features: 1st..3rd order differences of the 10-step lag.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target'].diff(i) if False else data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics on the raw target; closed='left' excludes the
# current row, so each statistic only uses strictly-previous rows.
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values

# Rolling-window statistics on the 10-step lag feature.
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    # Fixed feature name: was 'target_shift710win{win}_std' (typo), now
    # consistent with the other shift10 window features.
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
随后,我们可以尝试融合多个模型的方法,“三个臭皮匠,顶个诸葛亮”,在集成中收获好的结果不失为一种策略。我们将LightGBM、XGBoost、CatBoost三个模型进行平均融合:
def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    """K-fold cross-validation trainer for one of three GBDT libraries.

    Args:
        clf: the model library/class matching ``clf_name`` — the ``lgb``
            module, the ``xgb`` module, or the ``CatBoostRegressor`` class.
        train_x: training features (DataFrame).
        train_y: training labels (positionally indexable, e.g. Series/array).
        test_x: test features (DataFrame).
        clf_name: one of 'lgb', 'xgb', 'cat'.
        seed: random seed for the KFold shuffle.

    Returns:
        (oof, test_predict): out-of-fold predictions for each training row and
        the fold-averaged predictions for the test rows.
    """
    folds = 5  # 5-fold cross-validation
    # shuffle=True randomizes row order before splitting, which reduces bias
    # introduced by any ordering in the data.
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # oof collects each fold's validation predictions at their original row
    # positions; after all folds it holds exactly one prediction per train row.
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[],
                              callbacks=[log_evaluation(period=500), early_stopping(stopping_rounds=500)])
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            # Use clf (the xgboost module passed in) instead of the global
            # ``xgb`` so the function depends only on its arguments.
            callbacks = [clf.callback.EvaluationMonitor(period=200),
                         clf.callback.EarlyStopping(rounds=100)]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist,
                              callbacks=callbacks)
            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)
        if clf_name == "cat":
            # NOTE: the original dict listed 'random_seed' twice (2023, then
            # 11); Python keeps the last value, so 11 is retained here.
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 100, 'random_seed': 11,
                      'allow_writing_files': False}
            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)
            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)
        oof[valid_index] = val_pred
        # Average the fold predictions on the test set.
        test_predict += test_pred / kf.n_splits
        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)
    return oof, test_predict
# Split the merged frame back into train and test.
train = data[data.target.notnull()].reset_index(drop=True) # rows with a non-null 'target' form the training set
test = data[data.target.isnull()].reset_index(drop=True) # rows with a null 'target' form the test set
# Input features: every column except the key and the label ('dt' stays in).
train_cols = [f for f in data.columns if f not in ['id', 'target']]
# Train a LightGBM model.
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# Train an XGBoost model.
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# Train a CatBoost model.
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')
# Simple average ensemble of the three models' test predictions.
final_test = (lgb_test + xgb_test + cat_test) / 3
# Save the submission file.
submission = pd.DataFrame({'id': test['id'], 'dt': test['dt'], 'target': final_test})
submission.to_csv('LXC_submit3.csv', index=False)
经过训练,最终LightGBM的表现最好,每一折上的MAE为:[6.7571203686390335, 6.730582989827899, 6.724678885249694, 6.733685712828951, 6.730582989827899];XGBoost次之,为[6.949069924415079, 6.916349468103253, 6.908121121841548, 6.918552004659792, 6.895434754913011];CatBoost效果最差,为[7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]。我们将三者取平均融合,最终的上分结果为:236.60。整体结果还是有一定收获,算是上了些分,但是不如做好特征工程的LightGBM。
接着,我尝试了使用深度神经网络的做法,这里尝试了LSTM。因为本人入门深度学习是使用的PyTorch,故在这里就写了PyTorch的代码:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# Load the raw competition data.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# 数据预处理
def preprocess_data(df, look_back=100):
    """Build fixed-length training sequences per series id.

    Rows keep their existing order within each id; positional column 3 is
    read as the value series (assumes an id/dt/.../target column layout —
    TODO confirm against the CSV schema).

    Args:
        df: DataFrame with an 'id' column; values are taken from positional
            column 3 of each group.
        look_back: length of each input sequence. Shorter windows are
            zero-padded at the end before being reversed.

    Returns:
        (X, Y, OOT) float64 arrays: X is (n_samples, look_back) model inputs,
        Y is (n_samples, 10) targets, and OOT holds one look_back-long input
        per id for out-of-time prediction.
    """
    series_by_id = {key: group.values for key, group in df.groupby('id')}
    X, Y = [], []
    for values in series_by_id.values():
        # Five training samples per id: the 10 rows before position i become
        # the labels; the look_back rows starting at i become the input.
        for i in range(10, 15):
            window = values[i:(i + look_back), 3]
            # Pad to look_back. The original padded to a hard-coded 100,
            # which produced wrong-length sequences for look_back != 100.
            window = np.append(window, np.array([0] * (look_back - len(window))))
            X.append(window[::-1])
            Y.append(values[i - 10:i, 3][::-1])
    OOT = []
    for values in series_by_id.values():
        window = values[:look_back, 3]
        window = np.append(window, np.array([0] * (look_back - len(window))))
        OOT.append(window[::-1])
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)
class SequenceDataset(Dataset):
    """Wraps paired (X, Y) sequence arrays as float32 tensors.

    A trailing feature axis is appended so each sample has shape
    (seq_len, 1), matching a batch_first LSTM's expected input.
    """

    def __init__(self, X, Y):
        # (n, seq_len) -> (n, seq_len, 1): add the single-feature dimension.
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)
        self.Y = torch.tensor(Y, dtype=torch.float32).unsqueeze(-1)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]
# Model definition
class LSTMModel(nn.Module):
    """Encoder/decoder-style LSTM for multi-step forecasting.

    The first LSTM encodes the input window; its last-step output is tiled
    n_output times and decoded by a second LSTM, whose per-step outputs are
    projected down to one value each.
    """

    def __init__(self, look_back, n_features, n_output):
        super(LSTMModel, self).__init__()
        # Layer creation order matters for reproducible random init.
        self.lstm1 = nn.LSTM(n_features, 50, batch_first=True)
        self.lstm2 = nn.LSTM(50, 50, batch_first=True)
        self.fc = nn.Linear(50, 1)
        self.look_back = look_back  # input window length (stored; unused in forward)
        self.n_output = n_output    # number of future steps to emit

    def forward(self, x):
        encoded, _ = self.lstm1(x)
        # Take the encoder's final time step and repeat it across the horizon.
        context = encoded[:, -1, :].unsqueeze(1).repeat(1, self.n_output, 1)
        decoded, _ = self.lstm2(context)
        return self.fc(decoded)
# ---- Hyper-parameters ----
look_back = 100   # input sequence length
n_features = 1    # one value per time step
n_output = 10     # forecast horizon: 10 future steps

# ---- Data preparation ----
X, Y, OOT = preprocess_data(train, look_back=look_back)
dataset = SequenceDataset(X, Y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# ---- Model, loss, optimizer ----
model = LSTMModel(look_back, n_features, n_output)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ---- Training loop ----
num_epochs = 10
for epoch in range(num_epochs):
    for xb, yb in dataloader:
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# ---- Inference on the out-of-time windows ----
OOT_tensor = torch.tensor(OOT, dtype=torch.float32).unsqueeze(-1)
model.eval()
with torch.no_grad():
    predicted_values = model(OOT_tensor).squeeze(-1)

# Keep only the first predicted step per id.
# NOTE(review): this yields one value per id while `test` may contain several
# rows per id — confirm the lengths line up before relying on this assignment.
predicted_values = predicted_values[:, 0].numpy()
test['target'] = predicted_values
# Write the submission file.
test[['id', 'dt', 'target']].to_csv('submit_stacking.csv', index=False)
效果没有想象的好,结果就不给出了。
最后做一个总结吧,令我感触最深的就是以下两点:
(1)在处理机器学习问题时,集成学习往往是有效的上分策略;
(2)特征工程非常重要,可以直接决定结果的成败;
到这里,第二期夏令营的学习任务也就结束了,上分过程真的很开心。也是非常感谢Datawhale团队!!!