Datawhale AI Summer Camp: Machine Learning Task 3 Notes

This time, I tried ensemble learning and deep learning approaches to push my leaderboard score higher.

Among all the feature engineering schemes I tried last time, the best-performing one is still the combination of historical lag (shift) features, difference features, and rolling-window statistics:

import pandas as pd

# Merge the training and test data
data = pd.concat([train, test], axis=0).reset_index(drop=True)
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Historical lag features
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics
for win in [15, 30, 50, 70]:
    data[f'target_win{win}_mean'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_win{win}_max'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_win{win}_min'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_win{win}_std'] = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left').std().values

# Lag + rolling-window statistics
for win in [7, 14, 28, 35, 50, 70]:
    data[f'target_shift10_win{win}_mean'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').mean().values
    data[f'target_shift10_win{win}_max'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').max().values
    data[f'target_shift10_win{win}_min'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').min().values
    data[f'target_shift10_win{win}_sum'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').sum().values
    data[f'target_shift10_win{win}_std'] = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left').std().values
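
To make the lag and difference mechanics concrete, here is a small toy illustration (my own example, not the competition data) of what the groupby shift and diff calls above produce:

import pandas as pd

toy = pd.DataFrame({'id': ['a'] * 4, 'target': [1, 3, 6, 10]})
toy['shift1'] = toy.groupby('id')['target'].shift(1)  # value one step back
toy['diff1'] = toy.groupby('id')['target'].diff(1)    # current minus previous
print(toy)
#   id  target  shift1  diff1
# 0  a       1     NaN    NaN
# 1  a       3     1.0    2.0
# 2  a       6     3.0    3.0
# 3  a      10     6.0    4.0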

Next, we can try blending several models. As the saying goes, "three cobblers put together can match Zhuge Liang": ensembling is often a good way to gain points. Here we average-blend three models, LightGBM, XGBoost, and CatBoost:

import numpy as np
import lightgbm as lgb
import xgboost as xgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def cv_model(clf, train_x, train_y, test_x, clf_name, seed=2024):
    '''
    clf: the model library or class to use
    train_x: training features
    train_y: training labels
    test_x: test features
    clf_name: name of the selected model
    seed: random seed
    '''
    folds = 5    # 5-fold cross-validation
    # shuffle=True shuffles the rows before splitting, which reduces bias from the original data order
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    # oof starts as a zero array with one entry per training row; in each CV iteration,
    # the model's predictions on the held-out fold are written into the matching positions
    oof = np.zeros(train_x.shape[0])
    test_predict = np.zeros(test_x.shape[0])
    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i + 1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], \
                                     train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            params = {
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'mae',
                'min_child_weight': 6,
                'num_leaves': 2 ** 6,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2023,
                'nthread': 16,
                'verbose': -1,
            }
            model = clf.train(params, train_matrix, 1000, valid_sets=[train_matrix, valid_matrix],
                              categorical_feature=[], callbacks=[log_evaluation(period=500), early_stopping(stopping_rounds=500)])

            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        if clf_name == "xgb":
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'reg:squarederror',
                'eval_metric': 'mae',
                'max_depth': 5,
                'lambda': 10,
                'subsample': 0.7,
                'colsample_bytree': 0.7,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': 520,
                'nthread': 16
            }
            train_matrix = clf.DMatrix(trn_x, label=trn_y)
            valid_matrix = clf.DMatrix(val_x, label=val_y)
            test_matrix = clf.DMatrix(test_x)

            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]

            callbacks = [xgb.callback.EvaluationMonitor(period=200), xgb.callback.EarlyStopping(rounds=100)]
            model = clf.train(xgb_params, train_matrix, num_boost_round=1000, evals=watchlist,
                              callbacks=callbacks)

            val_pred = model.predict(valid_matrix)
            test_pred = model.predict(test_matrix)

        if clf_name == "cat":
            params = {'learning_rate': 0.1, 'depth': 5, 'bootstrap_type': 'Bernoulli', 'random_seed': 2023,
                      'od_type': 'Iter', 'od_wait': 100, 'allow_writing_files': False}

            model = clf(iterations=1000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      metric_period=200,
                      use_best_model=True,
                      cat_features=[],
                      verbose=1)

            val_pred = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof[valid_index] = val_pred
        test_predict += test_pred / kf.n_splits

        score = mean_absolute_error(val_y, val_pred)
        cv_scores.append(score)
        print(cv_scores)

    return oof, test_predict


# Split the data back into train and test
train = data[data.target.notnull()].reset_index(drop=True)    # rows with a non-null 'target' form the training set
test = data[data.target.isnull()].reset_index(drop=True)    # rows with a null 'target' form the test set


# Choose the input features
train_cols = [f for f in data.columns if f not in ['id', 'target']]

# Run the LightGBM model
lgb_oof, lgb_test = cv_model(lgb, train[train_cols], train['target'], test[train_cols], 'lgb')
# Run the XGBoost model
xgb_oof, xgb_test = cv_model(xgb, train[train_cols], train['target'], test[train_cols], 'xgb')
# Run the CatBoost model
cat_oof, cat_test = cv_model(CatBoostRegressor, train[train_cols], train['target'], test[train_cols], 'cat')

# Average the three sets of predictions
final_test = (lgb_test + xgb_test + cat_test) / 3

# Save the submission
submission = pd.DataFrame({'id': test['id'], 'dt': test['dt'], 'target': final_test})
submission.to_csv('LXC_submit3.csv', index=False)

After training, LightGBM performed best, with per-fold MAEs of [6.7571203686390335, 6.730582989827899, 6.724678885249694, 6.733685712828951, 6.730582989827899]; XGBoost came second with [6.949069924415079, 6.916349468103253, 6.908121121841548, 6.918552004659792, 6.895434754913011]; CatBoost did worst with [7.099260258793829, 7.060875208529937, 7.059477679678101, 7.069109887881056, 7.044158168142322]. Averaging the three gave a final online score of 236.60. Overall this was a worthwhile exercise and did gain some points, but it still fell short of a well-feature-engineered LightGBM on its own.
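
As an offline sanity check (my own addition, assuming the oof arrays and the train frame from above are still in scope), the same average blend can be scored against the out-of-fold predictions before submitting, and inverse-MAE weighting is a natural variant to try:

from sklearn.metrics import mean_absolute_error

blend_oof = (lgb_oof + xgb_oof + cat_oof) / 3
print('blend OOF MAE:', mean_absolute_error(train['target'], blend_oof))

# Hypothetical refinement: weight each model by the inverse of its OOF MAE
maes = [mean_absolute_error(train['target'], o) for o in (lgb_oof, xgb_oof, cat_oof)]
weights = [1 / m for m in maes]
weights = [w / sum(weights) for w in weights]
weighted_test = weights[0] * lgb_test + weights[1] * xgb_test + weights[2] * cat_test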

Next, I tried a deep neural network approach, specifically an LSTM. Since I learned deep learning with PyTorch, the code here is written in PyTorch:

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preprocess the data
def preprocess_data(df, look_back=100):
    grouped = df.groupby('id')
    datasets = {}
    for id, group in grouped:
        datasets[id] = group.values

    X, Y = [], []
    for id, data in datasets.items():
        for i in range(10, 15):  # build 5 training sequences per id
            a = data[i:(i + look_back), 3]  # column 3 holds the target
            a = np.append(a, np.array([0] * (look_back - len(a))))  # zero-pad short sequences
            X.append(a[::-1])  # reverse the sequence order
            Y.append(data[i - 10:i, 3][::-1])  # the 10 values to predict

    OOT = []
    for id, data in datasets.items():
        a = data[:look_back, 3]
        a = np.append(a, np.array([0] * (look_back - len(a))))
        OOT.append(a[::-1])

    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)

# Wrap the (X, Y) arrays as a PyTorch Dataset
class SequenceDataset(Dataset):
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype=torch.float32).unsqueeze(-1)
        self.Y = torch.tensor(Y, dtype=torch.float32).unsqueeze(-1)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

# Define the model
class LSTMModel(nn.Module):
    def __init__(self, look_back, n_features, n_output):
        super(LSTMModel, self).__init__()
        self.lstm1 = nn.LSTM(n_features, 50, batch_first=True)
        self.lstm2 = nn.LSTM(50, 50, batch_first=True)
        self.fc = nn.Linear(50, 1)
        self.look_back = look_back
        self.n_output = n_output
    
    def forward(self, x):
        x, _ = self.lstm1(x)                                       # encode the input sequence
        x = x[:, -1, :].unsqueeze(1).repeat(1, self.n_output, 1)   # repeat the last hidden state n_output times
        x, _ = self.lstm2(x)                                       # decode into n_output steps
        x = self.fc(x)
        return x
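
# Quick shape sanity check (my own addition): the forward pass repeats the
# encoder's last hidden state n_output times, so a (2, 100, 1) input batch
# should yield a (2, 10, 1) output, one value per future step.
_demo = LSTMModel(100, 1, 10)
print(_demo(torch.randn(2, 100, 1)).shape)  # torch.Size([2, 10, 1])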

# Hyperparameters
look_back = 100  # input sequence length
n_features = 1   # assume one feature per time step
n_output = 10    # predict the next 10 time steps

# Preprocess the data
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Build the dataset and data loader
dataset = SequenceDataset(X, Y)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Build the model
model = LSTMModel(look_back, n_features, n_output)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for batch_X, batch_Y in dataloader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_Y)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  # loss of the last batch only

# Predict
OOT_tensor = torch.tensor(OOT, dtype=torch.float32).unsqueeze(-1)
model.eval()
with torch.no_grad():
    predicted_values = model(OOT_tensor).squeeze(-1)  # shape: (num_ids, n_output)

# Flatten to one prediction per (id, dt) row; this assumes test is grouped by id
# in the same order as OOT and that dt within each id matches the model's 10 output steps
predicted_values = predicted_values.numpy().reshape(-1)

# Put the predictions into the test dataframe
test['target'] = predicted_values

# Save the results to a CSV file
test[['id', 'dt', 'target']].to_csv('submit_stacking.csv', index=False)

The results were not as good as I had hoped, so I will not include them here.

Finally, a short summary. Two things left the deepest impression on me:

(1) When tackling machine learning problems, ensembling is often an effective way to climb the leaderboard;

(2) Feature engineering matters enormously and can directly make or break the result.

With that, the learning tasks of the second summer camp session come to an end. Climbing the leaderboard was genuinely fun. Many thanks to the Datawhale team!!!
