With the data and the core ProbAttention in hand, we can now reproduce the complete Informer (Figure 1).
Figure 1: Informer model architecture [1]
If Figure 1 still looks unfamiliar, refer to Figure 2. As Figure 2 shows, Informer does not differ much from the Transformer.
The main differences are ProbAttention and the optional conv at the end of each encoder layer (used in the source code for distilling).
Figure 2: Informer architecture, drawn with reference to the Transformer
The three basic building blocks of Informer: AttentionBlock, MLPBlock, ConvBlock
The previous post already reproduced ProbAttention and AttentionBlock (attention + add & norm), and Conv is a standard module, so the full Informer is not hard to reproduce.
MLPBlock
MLPBlock (mlp + add & norm): the source code implements it with 1x1 convolutions, which is somewhat cumbersome. Because Conv1d expects (batch_size, channels, seq_len), the input has to be transposed first and the result transposed back.
# original source code (from [2])
y = x = self.norm1(x)
y = self.dropout(self.activation(self.conv1(y.transpose(-1,1))))
y = self.dropout(self.conv2(y).transpose(-1,1))
return self.norm2(x+y), attn
Using a plain MLP avoids this problem entirely.
import math

import torch
import torch.nn as nn


class MLPBlock(nn.Module):
    def __init__(self, d_model, d_ff=None, dropout=0.1, act="relu"):
        super().__init__()
        d_ff = d_ff or d_model * 4
        act = nn.ReLU() if act == "relu" else nn.GELU()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),
            act,
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        )
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x):
        out = self.net(x)
        return self.norm(out + x)  # residual connection + LayerNorm
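As a quick sanity check (a minimal sketch with made-up sizes), the block is shape-preserving:

# hypothetical sizes, just to confirm the output shape matches the input
mlp = MLPBlock(d_model=64)
x = torch.randn(2, 96, 64)  # (batch, seq_len, d_model)
print(mlp(x).shape)         # torch.Size([2, 96, 64])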
ConvBlock
With AttentionBlock and MLPBlock in place, ConvBlock is the only piece missing for a complete Informer. As noted above, the input dimensions need to be transposed for Conv1d.
class ConvBlock(nn.Module):
    def __init__(self, cin):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(cin, cin, 3, 1, 1, bias=False),
            nn.BatchNorm1d(cin),
            nn.ELU(),
            nn.MaxPool1d(3, 2, 1)  # stride 2: halves the sequence length
        )

    def forward(self, x):
        """
        x -- (N, L, D)
        """
        # Conv1d expects (N, D, L), so transpose in and back out
        x = x.transpose(1, 2)
        return self.net(x).transpose(1, 2)
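One detail worth verifying: the MaxPool1d(3, 2, 1) halves the sequence length (to ceil(L/2)), which is exactly the distilling step. A minimal check with made-up sizes:

conv = ConvBlock(cin=64)
x = torch.randn(2, 96, 64)  # (N, L, D)
print(conv(x).shape)        # torch.Size([2, 48, 64]): L halved by the stride-2 pooling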
With the basic blocks ready, the rest is stacking them like Lego to build the Informer.
Encoder
First implement an EncoderLayer, then stack it. Note that the conv in an EncoderLayer may or may not be used; when it is not, nn.Identity() stands in for it so the model structure stays uniform.
class EncoderLayer(nn.Module):
    def __init__(
        self,
        attn_type,
        d_model,
        num_heads=8,
        d_ff=None,
        use_conv=False,
        attn_dropout=0.1,
        dropout=0.1,
        factor=5,
        act="relu"
    ):
        super().__init__()
        self.attention = AttentionBlock(
            attn_type=attn_type, d_model=d_model, num_heads=num_heads,
            use_mask=False, dropout=attn_dropout, factor=factor
        )
        self.mlp = MLPBlock(d_model=d_model, d_ff=d_ff, dropout=dropout, act=act)
        # nn.Identity() stands in when the distilling conv is disabled
        self.conv = ConvBlock(d_model) if use_conv else nn.Identity()

    def forward(self, x, mask=None):
        out = self.attention(x, x, x, mask)
        out = self.mlp(out)
        out = self.conv(out)
        return out
Stack it N times to get the Encoder. Even when ConvBlock is used, the last encoder layer does not need a conv, so it is handled separately.
class Encoder(nn.Module):
    def __init__(
        self,
        num_layers,
        attn_type,
        d_model,
        num_heads=8,
        d_ff=None,
        use_conv=False,
        attn_dropout=0.1,
        dropout=0.1,
        factor=5,
        act="relu"
    ):
        super().__init__()
        # first num_layers - 1 layers: attention + mlp (+ optional conv)
        self.layers = nn.ModuleList([
            EncoderLayer(
                attn_type, d_model, num_heads, d_ff, use_conv,
                attn_dropout, dropout, factor, act
            )
            for _ in range(num_layers - 1)
        ])
        # last layer never uses the conv
        self.layers.append(
            EncoderLayer(
                attn_type, d_model, num_heads, d_ff, False,
                attn_dropout, dropout, factor, act
            )
        )

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return x
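A quick shape check (assuming the AttentionBlock from the previous post is in scope and accepts attn_type="prob"): with use_conv=True, each of the first num_layers - 1 layers halves the sequence, so three layers distil 96 steps down to 24:

enc = Encoder(num_layers=3, attn_type="prob", d_model=64, num_heads=4, use_conv=True)
x = torch.randn(2, 96, 64)
print(enc(x).shape)  # torch.Size([2, 24, 64]): 96 -> 48 -> 24, no conv in the last layer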
Decoder
Same routine: implement a DecoderLayer first, then stack. It is plain block stacking, with little worth discussing. Just note that the cross attention uses VanillaAttention, while the first multi-head self-attention can be either vanilla or prob, selected by attn_type.
class DecoderLayer(nn.Module):
    def __init__(self, attn_type, d_model, num_heads, d_ff=None, attn_dropout=0.1, dropout=0.1, factor=5, act="relu"):
        super().__init__()
        # masked self-attention: vanilla or prob, selected by attn_type
        self.attention = AttentionBlock(attn_type, d_model, num_heads, True, attn_dropout, factor)
        # cross attention is always vanilla
        self.cross_attention = AttentionBlock("vanilla", d_model, num_heads, False, attn_dropout, factor)
        self.mlp = MLPBlock(d_model, d_ff, dropout, act)

    def forward(self, x, cross, mask=None, cross_mask=None):
        out = self.attention(x, x, x, mask)
        out = self.cross_attention(out, cross, cross, cross_mask)
        out = self.mlp(out)
        return out


class Decoder(nn.Module):
    def __init__(self, num_layers, attn_type, d_model, num_heads, d_ff=None, attn_dropout=0.1, dropout=0.1, factor=5, act="relu"):
        super().__init__()
        self.layers = nn.ModuleList([
            DecoderLayer(
                attn_type, d_model, num_heads, d_ff,
                attn_dropout, dropout, factor, act
            )
            for _ in range(num_layers)
        ])

    def forward(self, x, cross, mask=None, cross_mask=None):
        for layer in self.layers:
            x = layer(x, cross, mask, cross_mask)
        return x
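Note that the decoder input and the encoder memory can have different lengths; cross attention handles the mismatch, and the output follows the query length. A minimal sketch with made-up sizes (again assuming the previous post's AttentionBlock):

dec = Decoder(num_layers=2, attn_type="prob", d_model=64, num_heads=4)
y = torch.randn(2, 72, 64)      # decoder input embedding
cross = torch.randn(2, 24, 64)  # distilled encoder output
print(dec(y, cross).shape)      # torch.Size([2, 72, 64])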
Embedder
As in the Transformer, positional information must be encoded. Informer embeds the input values (token), positions (pos), and time features (temporal), then sums them. The TokenEmbedding and TemporalEmbedding below are simplified versions of the source code.
# positional encoding copied directly from [2]
class TokenEmbedding(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        # LazyLinear infers the input feature width on the first forward pass
        self.embedder = nn.LazyLinear(d_model)

    def forward(self, x):
        return self.embedder(x)


class PositionalEmbedding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model).float()
        pe.requires_grad = False
        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # returns (1, L, d_model), broadcast over the batch when added
        return self.pe[:, :x.size(1)]


class TemporalEmbedding(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.embedder = nn.LazyLinear(d_model)

    def forward(self, x):
        return self.embedder(x)


class DataEmbedding(nn.Module):
    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.value_embedding = TokenEmbedding(d_model=d_model)
        self.position_embedding = PositionalEmbedding(d_model=d_model)
        self.temporal_embedding = TemporalEmbedding(d_model=d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, x_mark):
        # sum of value, positional and temporal embeddings
        x = self.value_embedding(x) + self.position_embedding(x) + self.temporal_embedding(x_mark)
        return self.dropout(x)
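Usage is straightforward: x carries the raw feature columns and x_mark the time-feature columns (e.g. month, day, weekday, hour), and nn.LazyLinear infers both input widths on the first call. Sizes below are made up:

emb = DataEmbedding(d_model=64)
x = torch.randn(2, 96, 7)       # (batch, seq_len, num_features)
x_mark = torch.randn(2, 96, 4)  # (batch, seq_len, num_time_features)
print(emb(x, x_mark).shape)     # torch.Size([2, 96, 64])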
Informer
OK, and finally, the complete model.
class Informer(nn.Module):
    def __init__(
        self,
        cout,
        pred_len,
        attn_type,
        d_model=512,
        num_heads=8,
        e_layers=3,
        d_layers=2,
        d_ff=None,
        attn_dropout=0.1,
        dropout=0.05,
        act="relu",
        use_conv=True,
        factor=5
    ):
        super().__init__()
        self.pred_len = pred_len
        # separate embeddings for encoder and decoder inputs
        self.encoder_embedding = DataEmbedding(d_model, dropout)
        self.decoder_embedding = DataEmbedding(d_model, dropout)
        self.encoder = Encoder(e_layers, attn_type, d_model, num_heads, d_ff, use_conv, attn_dropout, dropout, factor, act)
        self.decoder = Decoder(d_layers, attn_type, d_model, num_heads, d_ff, attn_dropout, dropout, factor, act)
        self.proj_out = nn.Linear(d_model, cout)

    def forward(self, x, xt, y, yt, x_mask=None, y_mask=None, xy_mask=None):
        encoder_embedding = self.encoder_embedding(x, xt)
        decoder_embedding = self.decoder_embedding(y, yt)
        encoder_out = self.encoder(encoder_embedding, x_mask)
        decoder_out = self.decoder(decoder_embedding, encoder_out, y_mask, xy_mask)
        out = self.proj_out(decoder_out)
        # keep only the last pred_len steps as the forecast
        return out[:, -self.pred_len:, :]
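An end-to-end smoke test (a sketch with made-up sizes, assuming the previous post's AttentionBlock is in scope): 96 encoder steps; the decoder input is a 48-step start token followed by 24 placeholder steps, of which the last pred_len = 24 outputs are the forecast:

model = Informer(cout=7, pred_len=24, attn_type="prob", d_model=64, num_heads=4)
x, xt = torch.randn(2, 96, 7), torch.randn(2, 96, 4)  # encoder values + time features
y, yt = torch.randn(2, 72, 7), torch.randn(2, 72, 4)  # decoder values + time features
print(model(x, xt, y, yt).shape)  # torch.Size([2, 24, 7])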
Last but not least, this reproduction went so smoothly thanks to the ancestor of all Formers: the Transformer (its original form holds court in Figure 3).
Figure 3: The Transformer, ancestor of all Formers [3]
References
[1] 2021, Zhou et al., Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting; https://arxiv.org/abs/2012.07436
[2] 2021, Zhou et al., Informer2020 (official implementation); https://github.com/zhouhaoyi/Informer2020
[3] 2017, Vaswani et al., Attention Is All You Need; https://arxiv.org/abs/1706.03762