1. 思路
1.1 特征工程
原始数据仅仅是当下(当前时刻/日/月)的数据,然而,对于连续时间数据来说,当前时间点数据的产生和此前的一些数据是强相关的。特征工程的重点之一就是通过数据之间的相关性来提高数据集的质量,以便更好地进行数据的拟合和预测。
通过历史数据平移、滑动窗口等方法,可以很好地实现特征工程。
1.2 LSTM(Long Short-Term Memory)
长短期记忆网络(Long Short-Term Memory, LSTM)是一种特殊类型的循环神经网络(Recurrent Neural Network, RNN),旨在克服传统RNN在处理序列数据时遇到的梯度消失和梯度爆炸问题。LSTM由Sepp Hochreiter和Jürgen Schmidhuber于1997年提出,通过引入一种称为“记忆单元”的结构,使得网络能够选择性地记住或遗忘信息,从而有效捕捉长期依赖关系。
LSTM的核心机制是其独特的门控架构,包括输入门、输出门和遗忘门。这些门通过Sigmoid激活函数控制信息的流动。具体而言,遗忘门决定哪些信息应从记忆单元中丢弃;输入门负责确定哪些新信息被存储到记忆单元中,并通过tanh函数产生候选值;输出门则控制记忆单元中的信息如何影响网络的当前输出。此外,LSTM的记忆单元使用tanh激活函数来更新和存储状态信息,这有助于保持状态的稳定性。
这种门控机制使LSTM能够在处理时间序列数据时,根据需要调整其记忆和遗忘策略,即使在序列非常长的情况下也能保持良好的性能。因此,LSTM广泛应用于自然语言处理、语音识别、时间序列预测以及生物信息学等多个领域,尤其在处理具有复杂模式和长期依赖性的序列任务上表现出色。
参考资料:LSTM从入门到精通(形象的图解,详细的代码和注释,完美的数学推导过程)_lstm模型-CSDN博客
2. 关键代码
2.1 特征工程
# Feature engineering
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Concatenate train and test so lag/rolling features are built consistently.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# Sort by (id, dt) descending: dt counts down, so shift(i) reaches i steps back in time.
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Lag (historical shift) features: shift is index-aligned, so this is safe.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics over the raw target.
# NOTE(review): groupby emits groups in ascending key order while `data` is sorted
# descending; assigning via `.values` assumes the two row orders match — verify.
for win in [15, 30, 50, 70]:
    roller = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_win{win}_mean'] = roller.mean().values
    data[f'target_win{win}_max'] = roller.max().values
    data[f'target_win{win}_min'] = roller.min().values
    data[f'target_win{win}_std'] = roller.std().values

# Rolling-window statistics over the lagged target.
for win in [7, 14, 28, 35, 50, 70]:
    roller = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_shift10_win{win}_mean'] = roller.mean().values
    data[f'target_shift10_win{win}_max'] = roller.max().values
    data[f'target_shift10_win{win}_min'] = roller.min().values
    data[f'target_shift10_win{win}_sum'] = roller.sum().values
    # BUG FIX: original column name had a typo ('target_shift710win{win}_std');
    # use the same naming scheme as the sibling features.
    data[f'target_shift10_win{win}_std'] = roller.std().values

# Split back into train/test:
# rows with a known target are training rows; the test targets are unknown,
# hence null after the concat.
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
2.2 LSTM
# Data preprocessing
def preprocess_data(df, look_back=100, n_sequences=5, sequence_start=10):
    """Turn a per-id time series frame into supervised seq2seq arrays.

    For each id, builds `n_sequences` training pairs: the input X is the
    `look_back` values starting at offset i (column index 3 of the group),
    and the label Y is the `sequence_start` values immediately before it.
    Sequences are reversed because the frame is sorted with time descending.

    BUG FIX: the original hard-coded 100 in the padding width and the OOT
    slice, silently ignoring `look_back`; both now honour the parameter
    (identical behavior at the default look_back=100).

    Returns:
        (X, Y, OOT) float64 arrays; OOT holds one inference sequence per id.
    """
    X, Y, OOT = [], [], []
    for _, group in df.groupby('id'):
        values = group.values
        # Build the training windows for this id.
        for i in range(sequence_start, sequence_start + n_sequences):
            if i + look_back <= len(values):
                window = values[i:(i + look_back), 3]
                # Zero-pad on the right so every sample has length look_back.
                window = np.pad(window, (0, max(0, look_back - len(window))), mode='constant')
                X.append(window[::-1])
                Y.append(values[i - sequence_start:i, 3][::-1])
        # Inference (out-of-time) window: the most recent look_back values.
        seq = values[:look_back, 3]
        seq = np.pad(seq, (0, max(0, look_back - len(seq))), mode='constant')
        OOT.append(seq[::-1])
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)
# Model definition
def build_model(look_back, n_features, n_output):
    """Assemble a seq2seq LSTM: encode a look_back-long series into a fixed
    vector, repeat it n_output times, and decode one value per future step."""
    model = Sequential()
    model.add(LSTM(50, input_shape=(look_back, n_features)))        # encoder
    model.add(RepeatVector(n_output))                               # bridge to decoder length
    model.add(LSTM(50, return_sequences=True))                      # decoder
    model.add(TimeDistributed(Dense(1)))                            # one output per step
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model
# Build the model and prepare the training arrays
look_back = 100   # length of each input sequence
n_features = 1    # a single feature per time step
n_output = 10     # horizon: predict the next 10 time steps

# Turn the training frame into supervised-learning arrays.
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Instantiate the seq2seq LSTM.
model = build_model(look_back, n_features, n_output)
3. 完整代码
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
from tensorflow.keras.optimizers import Adam
# Feature engineering
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Concatenate train and test so lag/rolling features are built consistently.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# Sort by (id, dt) descending: dt counts down, so shift(i) reaches i steps back in time.
data = data.sort_values(['id', 'dt'], ascending=False).reset_index(drop=True)

# Lag (historical shift) features: shift is index-aligned, so this is safe.
for i in range(10, 36):
    data[f'target_shift{i}'] = data.groupby('id')['target'].shift(i)

# Lag + difference features.
for i in range(1, 4):
    data[f'target_shift10_diff{i}'] = data.groupby('id')['target_shift10'].diff(i)

# Rolling-window statistics over the raw target.
# NOTE(review): groupby emits groups in ascending key order while `data` is sorted
# descending; assigning via `.values` assumes the two row orders match — verify.
for win in [15, 30, 50, 70]:
    roller = data.groupby('id')['target'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_win{win}_mean'] = roller.mean().values
    data[f'target_win{win}_max'] = roller.max().values
    data[f'target_win{win}_min'] = roller.min().values
    data[f'target_win{win}_std'] = roller.std().values

# Rolling-window statistics over the lagged target.
for win in [7, 14, 28, 35, 50, 70]:
    roller = data.groupby('id')['target_shift10'].rolling(window=win, min_periods=3, closed='left')
    data[f'target_shift10_win{win}_mean'] = roller.mean().values
    data[f'target_shift10_win{win}_max'] = roller.max().values
    data[f'target_shift10_win{win}_min'] = roller.min().values
    data[f'target_shift10_win{win}_sum'] = roller.sum().values
    # BUG FIX: original column name had a typo ('target_shift710win{win}_std');
    # use the same naming scheme as the sibling features.
    data[f'target_shift10_win{win}_std'] = roller.std().values

# Split back into train/test:
# rows with a known target are training rows; the test targets are unknown,
# hence null after the concat.
train = data[data.target.notnull()].reset_index(drop=True)
test = data[data.target.isnull()].reset_index(drop=True)
# Data preprocessing
def preprocess_data(df, look_back=100, n_sequences=5, sequence_start=10):
    """Turn a per-id time series frame into supervised seq2seq arrays.

    For each id, builds `n_sequences` training pairs: the input X is the
    `look_back` values starting at offset i (column index 3 of the group),
    and the label Y is the `sequence_start` values immediately before it.
    Sequences are reversed because the frame is sorted with time descending.

    BUG FIX: the original hard-coded 100 in the padding width and the OOT
    slice, silently ignoring `look_back`; both now honour the parameter
    (identical behavior at the default look_back=100).

    Returns:
        (X, Y, OOT) float64 arrays; OOT holds one inference sequence per id.
    """
    X, Y, OOT = [], [], []
    for _, group in df.groupby('id'):
        values = group.values
        # Build the training windows for this id.
        for i in range(sequence_start, sequence_start + n_sequences):
            if i + look_back <= len(values):
                window = values[i:(i + look_back), 3]
                # Zero-pad on the right so every sample has length look_back.
                window = np.pad(window, (0, max(0, look_back - len(window))), mode='constant')
                X.append(window[::-1])
                Y.append(values[i - sequence_start:i, 3][::-1])
        # Inference (out-of-time) window: the most recent look_back values.
        seq = values[:look_back, 3]
        seq = np.pad(seq, (0, max(0, look_back - len(seq))), mode='constant')
        OOT.append(seq[::-1])
    return np.array(X, dtype=np.float64), np.array(Y, dtype=np.float64), np.array(OOT, dtype=np.float64)
# Model definition
def build_model(look_back, n_features, n_output):
    """Assemble a seq2seq LSTM: encode a look_back-long series into a fixed
    vector, repeat it n_output times, and decode one value per future step."""
    model = Sequential()
    model.add(LSTM(50, input_shape=(look_back, n_features)))        # encoder
    model.add(RepeatVector(n_output))                               # bridge to decoder length
    model.add(LSTM(50, return_sequences=True))                      # decoder
    model.add(TimeDistributed(Dense(1)))                            # one output per step
    model.compile(loss='mean_squared_error', optimizer=Adam(0.001))
    return model
# Build the model and prepare the training arrays
look_back = 100   # length of each input sequence
n_features = 1    # a single feature per time step
n_output = 10     # horizon: predict the next 10 time steps

# Turn the training frame into supervised-learning arrays.
X, Y, OOT = preprocess_data(train, look_back=look_back)

# Instantiate the seq2seq LSTM.
model = build_model(look_back, n_features, n_output)
from tensorflow.keras.callbacks import EarlyStopping

# Train the model; stop early once validation loss stops improving and
# roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(X, Y, epochs=100, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Predict the out-of-time sequences (one per id).
predictions = model.predict(OOT)

# Write the submission file.
# BUG FIX: the original referenced an undefined name `predicted_values`
# (NameError); the prediction result is bound to `predictions`.
# NOTE(review): flatten() assumes `test` holds exactly n_output rows per id,
# in the same id order as OOT — confirm against the data.
test['target'] = predictions.flatten()
test[['id', 'dt', 'target']].to_csv('submit.csv', index=None)
4. 结果
由于epoch和batch_size大小的选择对模型的训练结果是直接相关的,因此本部分加入了对不同epoch和batch_size训练结果的讨论。参考资料:batchsize太小的缺点&随着batchsize逐渐增大的优缺点&如何平衡batchsize的大小_batchsize太小会怎样-CSDN博客
1) epoch = 10 batch_size=64
2) epoch = 10 batch_size=32
3) epoch = 10 batch_size=16
4) EarlyStopping