在Vision Transformer中,1D位置嵌入和2D位置编码是通过在输入的图像或序列中添加额外的位置信息来实现的。下面是实现1D位置嵌入和2D位置编码的代码示例:
1. 1D位置嵌入
```python
import torch
import torch.nn as nn
class ViT(nn.Module):
def __init__(self, img_size=224, patch_size=16, num_classes=1000, dim=768, depth=12, heads=12, mlp_dim=3072, dropout=0.1):
super().__init__()
self.num_patches = (img_size // patch_size) ** 2
self.patch_size = patch_size
self.pos_embedding = nn.Parameter(torch.randn(1, self.num_patches + 1, dim))
self.patch_embed = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
self.dropout = nn.Dropout(dropout)
self.transformer = nn.ModuleList([
nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout)
for _ in range(depth)
])
self.fc = nn.Linear(dim, num_classes)
def forward(self, x):
b, c, h, w = x.shape
x = self.patch_embed(x).flatten(2).transpose(1, 2)
x = torch.cat([self.cls_token.repeat(b, 1, 1), x], dim=1)
x = x + self.pos_embedding[:, :(self.num_patches + 1)]
x = self.dropout(x)
for transformer_layer in self.transformer:
x = transformer_layer(x)
x = x.mean(dim=1)
x = self.fc(x)
return x
```
在这个代码中,`self.pos_embedding` 是一个可学习的参数,其 shape 为 `(1, num_patches + 1, dim)`,其中 `num_patches` 是输入图像被分成的 patch 的数量,`dim` 是 Transformer 的隐藏维度。`self.cls_token` 是一个用于表示整个序列或图像的特殊 token,它也是一个可学习的参数,其 shape 为 `(1, 1, dim)`。在 forward 函数中,我们首先对输入图像进行 patch embedding,然后将 cls token 和位置嵌入加到 patch embedding 的结果中。最后,我们将得到的序列输入到 Transformer 中。
2. 2D位置编码
```python
import torch
import torch.nn as nn
class ViT(nn.Module):
def __init__(self, img_size=224, patch_size=16, num_classes=1000, dim=768, depth=12, heads=12, mlp_dim=3072, dropout=0.1):
super().__init__()
self.num_patches = (img_size // patch_size) ** 2
self.patch_size = patch_size
self.pos_embedding = nn.Parameter(torch.randn(1, dim, img_size // patch_size, img_size // patch_size))
self.patch_embed = nn.Conv2d(3, dim, kernel_size=patch_size, stride=patch_size, bias=False)
self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
self.dropout = nn.Dropout(dropout)
self.transformer = nn.ModuleList([
nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dim_feedforward=mlp_dim, dropout=dropout)
for _ in range(depth)
])
self.fc = nn.Linear(dim, num_classes)
def forward(self, x):
b, c, h, w = x.shape
x = self.patch_embed(x).flatten(2).transpose(1, 2)
x = torch.cat([self.cls_token.repeat(b, 1, 1), x], dim=1)
x = x + self.pos_embedding
x = self.dropout(x)
for transformer_layer in self.transformer:
x = transformer_layer(x)
x = x.mean(dim=1)
x = self.fc(x)
return x
```
在这个代码中,`self.pos_embedding` 是一个可学习的参数,其 shape 为 `(1, dim, img_size // patch_size, img_size // patch_size)`,其中 `img_size // patch_size` 是输入图像被分成的 patch 的数量。在 forward 函数中,我们首先对输入图像进行 patch embedding,然后将 cls token 和位置编码加到 patch embedding 的结果中。最后,我们将得到的序列输入到 Transformer 中。