Complete Transformer (encoder + decoder)
Example model code
Like the previous DRSN post, this code was debugged in the same session; I never found the time to publish it, so here it is directly.
# Import the required libraries
import math

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Define the hyperparameters
batch_size = 32   # batch size
seq_len = 50      # sequence length
n_features = 26   # number of input features
n_heads = 2       # number of attention heads
d_model = 26      # model (embedding) dimension
d_ff = 256        # feed-forward dimension
n_layers = 3      # number of encoder and decoder layers
dropout = 0.1     # dropout rate
lr = 0.001        # learning rate
epochs = 100      # number of training epochs
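
# Sanity check: nn.MultiheadAttention requires the embedding dimension to be
# divisible by the number of heads; with d_model = 26 and n_heads = 2 each head
# gets 13 dimensions, so the settings above are valid.
assert d_model % n_heads == 0, "d_model must be divisible by n_heads"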
# Define the Transformer model (encoder + decoder)
class Transformer(nn.Module):
    def __init__(self, n_features, n_heads, d_model, d_ff, n_layers, dropout):
        super(Transformer, self).__init__()
        self.n_features = n_features  # number of input features
        self.n_heads = n_heads        # number of attention heads
        self.d_model = d_model        # model dimension
        self.d_ff = d_ff              # feed-forward dimension
        self.n_layers = n_layers      # number of encoder/decoder layers
        self.dropout = dropout        # dropout rate
        # Encoder layer: multi-head self-attention, feed-forward network,
        # residual connections and layer normalization
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads,
                                                        dim_feedforward=d_ff, dropout=dropout)
        # Encoder: a stack of n_layers encoder layers
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers)
        # Decoder layer: multi-head self-attention, multi-head encoder-decoder attention,
        # feed-forward network, residual connections and layer normalization
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=n_heads,
                                                        dim_feedforward=d_ff, dropout=dropout)
        # Decoder: a stack of n_layers decoder layers
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=n_layers)
        # Linear head that maps the model output to a single value (the prediction)
        self.linear = nn.Linear(d_model, 1)

    def forward(self, src, tgt):
        # src: input sequence,  shape (seq_len, batch_size, n_features)
        # tgt: target sequence, shape (seq_len, batch_size, n_features)
        # returns: predicted sequence, shape (seq_len, batch_size, 1)
        # Scale the inputs by sqrt(d_model); note that no positional encoding
        # is added, so the attention layers see only the scaled raw features
        src = src * math.sqrt(self.d_model)
        tgt = tgt * math.sqrt(self.d_model)
        # Run the encoder over src and feed its memory into the decoder
        output = self.decoder(tgt, self.encoder(src))
        # Map the decoder output to a single predicted value per time step
        output = self.linear(output)
        return output
# Data preparation: split the dataset into train/test sets and build sliding windows
def data_process(data, seq_len, batch_size):
    # data: dataset of shape (n_samples, n_features)
    # seq_len: sequence length
    # batch_size: batch size
    # returns: train source/target sequences and test source/target sequences
    # Split the dataset into training and test sets with an 80:20 ratio
    train_size = int(len(data) * 0.8)
    train_data = data[:train_size]
    test_data = data[train_size:]
    # Build the training source and target sequences
    train_src = []
    train_tgt = []
    # Slide over the training data: seq_len rows as the source window,
    # the same window shifted forward by one step as the target
    for i in range(len(train_data) - seq_len - 1):
        train_src.append(train_data[i:i+seq_len])
        train_tgt.append(train_data[i+1:i+seq_len+1])
    # Convert to tensors and transpose to shape (seq_len, n_windows, n_features)
    train_src = torch.tensor(np.array(train_src)).transpose(0, 1).float()
    train_tgt = torch.tensor(np.array(train_tgt)).transpose(0, 1).float()
    # Build the test source and target sequences in the same way
    test_src = []
    test_tgt = []
    for i in range(len(test_data) - seq_len - 1):
        test_src.append(test_data[i:i+seq_len])
        test_tgt.append(test_data[i+1:i+seq_len+1])
    # Convert to tensors and transpose to shape (seq_len, n_windows, n_features)
    test_src = torch.tensor(np.array(test_src)).transpose(0, 1).float()
    test_tgt = torch.tensor(np.array(test_tgt)).transpose(0, 1).float()
    return train_src, train_tgt, test_src, test_tgt
# Load the CMAPSS dataset; only the first sub-dataset (FD001) is used as an example
data = pd.read_csv('D:/PredictiveMaintenance/Turbo_CMAPSS/dataset/train_FD001.txt', sep=' ', header=None)
data.drop([26, 27], axis=1, inplace=True)  # drop the two trailing all-NaN columns
# Normalize the data to the [0, 1] range
scaler = MinMaxScaler()
data = scaler.fit_transform(data)
# Build the train/test source and target sequences
train_src, train_tgt, test_src, test_tgt = data_process(data, seq_len, batch_size)
# Instantiate the Transformer model and print its structure
model = Transformer(n_features, n_heads, d_model, d_ff, n_layers, dropout)
print(model)
# Define the optimizer and the loss function
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()
# Training loop: train the model and record the loss per epoch
def train(model, optimizer, criterion, epochs):
    # model: the Transformer model
    # optimizer: optimizer
    # criterion: loss function
    # epochs: number of training epochs
    # Put the model in training mode
    model.train()
    # Record the average training loss of each epoch
    losses = []
    for epoch in range(epochs):
        total_loss = 0
        n_batches = 0
        # Iterate over mini-batches along the sample dimension (dim 1)
        for i in range(0, train_src.size(1), batch_size):
            # Slice the current batch of source and target sequences
            src = train_src[:, i:i+batch_size, :]
            tgt = train_tgt[:, i:i+batch_size, :]
            # Reset the gradients
            optimizer.zero_grad()
            # Forward pass: predict the sequence from src and tgt
            output = model(src, tgt)
            # MSE between the prediction and the first feature of the target
            loss = criterion(output, tgt[:, :, 0].unsqueeze(2))
            # Backward pass and parameter update
            loss.backward()
            optimizer.step()
            # Accumulate the batch loss
            total_loss += loss.item()
            n_batches += 1
        # Average loss over all batches in this epoch
        avg_loss = total_loss / n_batches
        losses.append(avg_loss)
        print(f'Epoch {epoch+1}, Train Loss: {avg_loss:.4f}')
    return losses

# Train the model and record the training losses
train_losses = train(model, optimizer, criterion, epochs)
Model structure
Transformer(
(encoder_layer): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(encoder): TransformerEncoder(
(layers): ModuleList(
(0): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(1): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
(2): TransformerEncoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
)
)
)
(decoder_layer): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(decoder): TransformerDecoder(
(layers): ModuleList(
(0): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(1): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
(2): TransformerDecoderLayer(
(self_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(multihead_attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=26, out_features=26, bias=True)
)
(linear1): Linear(in_features=26, out_features=256, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(linear2): Linear(in_features=256, out_features=26, bias=True)
(norm1): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm2): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(norm3): LayerNorm((26,), eps=1e-05, elementwise_affine=True)
(dropout1): Dropout(p=0.1, inplace=False)
(dropout2): Dropout(p=0.1, inplace=False)
(dropout3): Dropout(p=0.1, inplace=False)
)
)
)
(linear): Linear(in_features=26, out_features=1, bias=True)
)
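
For reference, the size of the structure printed above can be checked directly on the instantiated model object (a quick sketch using only names already defined in the code):

# Total number of trainable parameters in the model printed above
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Trainable parameters: {n_params:,}')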
Running the model
Epoch 1, Train Loss: 0.0195
Epoch 2, Train Loss: 0.0071
Epoch 3, Train Loss: 0.0010
Epoch 4, Train Loss: 0.0027
Epoch 5, Train Loss: 0.0015
Epoch 6, Train Loss: 0.0005
Epoch 7, Train Loss: 0.0011
Epoch 8, Train Loss: 0.0011
Epoch 9, Train Loss: 0.0005
Epoch 10, Train Loss: 0.0004
Epoch 11, Train Loss: 0.0006
Epoch 12, Train Loss: 0.0006
Epoch 13, Train Loss: 0.0003
Epoch 14, Train Loss: 0.0003
Epoch 15, Train Loss: 0.0003
Epoch 16, Train Loss: 0.0004
Epoch 17, Train Loss: 0.0003
Epoch 18, Train Loss: 0.0002
Epoch 19, Train Loss: 0.0003
Epoch 20, Train Loss: 0.0003
Epoch 21, Train Loss: 0.0003
Epoch 22, Train Loss: 0.0002
Epoch 23, Train Loss: 0.0002
Epoch 24, Train Loss: 0.0002
Epoch 25, Train Loss: 0.0002
Epoch 26, Train Loss: 0.0002
Epoch 27, Train Loss: 0.0002
Epoch 28, Train Loss: 0.0002
Epoch 29, Train Loss: 0.0002
Epoch 30, Train Loss: 0.0002
Epoch 31, Train Loss: 0.0002
Epoch 32, Train Loss: 0.0002
Epoch 33, Train Loss: 0.0002
Epoch 34, Train Loss: 0.0002
Epoch 35, Train Loss: 0.0002
Epoch 36, Train Loss: 0.0001
Epoch 37, Train Loss: 0.0002
Epoch 38, Train Loss: 0.0001
Epoch 39, Train Loss: 0.0001
Epoch 40, Train Loss: 0.0001
Epoch 41, Train Loss: 0.0001
Epoch 42, Train Loss: 0.0001
Epoch 43, Train Loss: 0.0001
Epoch 44, Train Loss: 0.0001
Epoch 45, Train Loss: 0.0001
Epoch 46, Train Loss: 0.0001
Epoch 47, Train Loss: 0.0001
Epoch 48, Train Loss: 0.0001
Epoch 49, Train Loss: 0.0001
Epoch 50, Train Loss: 0.0001
Epoch 51, Train Loss: 0.0001
Epoch 52, Train Loss: 0.0001
Epoch 53, Train Loss: 0.0001
Epoch 54, Train Loss: 0.0001
Epoch 55, Train Loss: 0.0001
Epoch 56, Train Loss: 0.0001
Epoch 57, Train Loss: 0.0001
Epoch 58, Train Loss: 0.0001
Epoch 59, Train Loss: 0.0001
Epoch 60, Train Loss: 0.0001
Epoch 61, Train Loss: 0.0001
Epoch 62, Train Loss: 0.0001
Epoch 63, Train Loss: 0.0001
Epoch 64, Train Loss: 0.0001
Epoch 65, Train Loss: 0.0001
Epoch 66, Train Loss: 0.0001
Epoch 67, Train Loss: 0.0001
Epoch 68, Train Loss: 0.0001
Epoch 69, Train Loss: 0.0001
Epoch 70, Train Loss: 0.0001
Epoch 71, Train Loss: 0.0001
Epoch 72, Train Loss: 0.0001
Epoch 73, Train Loss: 0.0001
Epoch 74, Train Loss: 0.0001
Epoch 75, Train Loss: 0.0001
Epoch 76, Train Loss: 0.0001
Epoch 77, Train Loss: 0.0001
Epoch 78, Train Loss: 0.0001
Epoch 79, Train Loss: 0.0001
Epoch 80, Train Loss: 0.0001
Epoch 81, Train Loss: 0.0001
Epoch 82, Train Loss: 0.0001
Epoch 83, Train Loss: 0.0000
Epoch 84, Train Loss: 0.0001
Epoch 85, Train Loss: 0.0000
Epoch 86, Train Loss: 0.0000
Epoch 87, Train Loss: 0.0000
Epoch 88, Train Loss: 0.0000
Epoch 89, Train Loss: 0.0000
Epoch 90, Train Loss: 0.0000
Epoch 91, Train Loss: 0.0000
Epoch 92, Train Loss: 0.0000
Epoch 93, Train Loss: 0.0000
Epoch 94, Train Loss: 0.0000
Epoch 95, Train Loss: 0.0000
Epoch 96, Train Loss: 0.0000
Epoch 97, Train Loss: 0.0000
Epoch 98, Train Loss: 0.0000
Epoch 99, Train Loss: 0.0000
Epoch 100, Train Loss: 0.0000
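
The listing above prepares test_src and test_tgt but never evaluates on them. A minimal evaluation sketch that mirrors the teacher-forced training setup (this is not the script that produced the table below; the values it prints are on the MinMax-normalized scale, so mapping back to original units with the fitted scaler would be needed before comparing with that table):

# Evaluation sketch (not part of the original listing): score the trained model
# on the held-out test windows, batch by batch.
model.eval()
preds, trues = [], []
with torch.no_grad():
    for i in range(0, test_src.size(1), batch_size):
        src = test_src[:, i:i+batch_size, :]
        tgt = test_tgt[:, i:i+batch_size, :]
        out = model(src, tgt)                            # (seq_len, batch, 1)
        preds.append(out.squeeze(2).reshape(-1).numpy())
        trues.append(tgt[:, :, 0].reshape(-1).numpy())

preds = np.concatenate(preds)
trues = np.concatenate(trues)
test_mse = mean_squared_error(trues, preds)
print(f'Test MSE (normalized scale): {test_mse:.6f}, RMSE: {np.sqrt(test_mse):.6f}')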
The example above runs end to end, but the final prediction quality is rather mediocre. Two likely contributors: the model never adds positional encoding to its inputs, and the decoder is fed the very target sequence it is asked to predict, with no mask, so the near-zero training loss largely reflects the model copying its input rather than genuinely forecasting. See the notes and sketches after the table below.
| Metric | Train | Test |
| --- | --- | --- |
| MSE | 2353.149 | 610.5841 |
| RMSE | 48.5093 | 24.71 |
| Scoring_2008 | 1303432320.0 | 2939.488 |
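
Scoring_2008 in the table is the asymmetric scoring function from the 2008 PHM data challenge that is commonly used with CMAPSS: late RUL predictions are penalized more heavily than early ones. The exact evaluation script is not shown in this post; a minimal sketch of that metric, assuming d is the predicted RUL minus the true RUL:

# PHM 2008 challenge score: exponential penalty, harsher for late predictions
def phm2008_score(y_true, y_pred):
    d = np.asarray(y_pred) - np.asarray(y_true)            # positive = late prediction
    return float(np.sum(np.where(d < 0,
                                 np.exp(-d / 13.0) - 1.0,   # early prediction
                                 np.exp(d / 10.0) - 1.0)))  # late prediction, penalized more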
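
As for the missing positional encoding mentioned above: the forward pass scales the inputs by sqrt(d_model) but adds no positional information, so the attention layers cannot distinguish time steps. A standard sinusoidal positional-encoding module, following the usual Transformer recipe, is sketched below as a possible addition (it is not part of the original model):

# Standard sinusoidal positional encoding for inputs of shape (seq_len, batch, d_model)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)                  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)                # even dimensions
        pe[:, 0, 1::2] = torch.cos(position * div_term)                # odd dimensions
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model)
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

To use it, one would create self.pos_enc = PositionalEncoding(d_model, dropout) in __init__ and apply src = self.pos_enc(src) and tgt = self.pos_enc(tgt) right after the sqrt(d_model) scaling in forward.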