VIT模型简洁理解版代码

VIT模型简洁理解版代码

## from https://github.com/lucidrains/vit-pytorch
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from torch import nn
from torch import Tensor
from PIL import Image
from torchvision.transforms import Compose, Resize, ToTensor
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange, Reduce
from torchsummary import summary

 # einops张量操作神器
# helpers


def pair(t):
    return t if isinstance(t, tuple) else (t, t)

# classes

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn
    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.1):
        super().__init__()
        inner_dim = dim_head *  heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x): ## 最重要的都是forword函数了
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        ## 对tensor张量分块 x :1 197 1024   qkv 最后 是一个元组,tuple,长度是3,每个元素形状:1 197 1024
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)
        # 分成多少个Head,与TRM生成qkv 的方式不同, 要更简单,不需要区分来自Encoder还是Decoder

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x
# 1. VIT整体架构从这里开始
class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        # 初始化函数内,是将输入的图片,得到 img_size ,patch_size 的宽和高
        image_height, image_width = pair(image_size) ## 224*224 *3
        patch_height, patch_width = pair(patch_size)## 16 * 16  *3
        #图像尺寸必须能被patch大小整除
        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width) ## 步骤1.一个图像 分成 N 个patch
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),# 步骤2.1将patch 铺开
            nn.Linear(patch_dim, dim), # 步骤2.2 然后映射到指定的embedding的维度
        )

        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        x = self.to_patch_embedding(img)  ## img 1 3 224 224  输出形状x : 1 196 1024
        b, n, _ = x.shape ## 
        #将cls 复制 batch_size 份
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        # 将cls token在维度1 扩展到输入上
        x = torch.cat((cls_tokens, x), dim=1)
        # 添加位置编码
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)
        # 输入TRM
        x = self.transformer(x)

        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)



v = ViT(
    image_size = 224,
    patch_size = 16,
    num_classes = 1000,
    dim = 1024,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

img = torch.randn(1, 3, 224, 224)

preds = v(img)   # (1, 1000)


  • 4
    点赞
  • 34
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
以下是基于TensorFlow的ViT模型的图像分类代码示例: ```python import tensorflow as tf import tensorflow_datasets as tfds from tensorflow.keras.layers.experimental.preprocessing import Resizing from vit_keras import vit, utils # Load the CIFAR-10 dataset (ds_train, ds_test), ds_info = tfds.load('cifar10', split=['train', 'test'], with_info=True) # Define some constants NUM_CLASSES = ds_info.features['label'].num_classes IMAGE_SIZE = 72 # ViT requires images to be divisible by 8 # Preprocess the data def preprocess_data(data): x = data['image'] y = tf.one_hot(data['label'], NUM_CLASSES) x = Resizing(IMAGE_SIZE, IMAGE_SIZE)(x) / 255.0 # Resize and normalize return x, y ds_train = ds_train.map(preprocess_data).batch(32).prefetch(tf.data.experimental.AUTOTUNE) ds_test = ds_test.map(preprocess_data).batch(32).prefetch(tf.data.experimental.AUTOTUNE) # Define the ViT model model = vit.vit_b16( image_size=IMAGE_SIZE, activation='softmax', classes=NUM_CLASSES, include_top=True, pretrained=True ) # Compile the model model.compile( optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()] ) # Train the model model.fit( ds_train, validation_data=ds_test, epochs=10 ) # Evaluate the model model.evaluate(ds_test) ``` 这个代码示例使用了 TensorFlow Datasets 中的 CIFAR-10 数据集,并使用了 ViT-B16 模型进行图像分类。首先,我们定义了一些常量,包括类别数量和图像大小。然后,我们定义了一个函数来对数据进行预处理,包括缩放和归一化。接下来,我们使用 `vit.vit_b16` 函数定义了 ViT 模型,并将其编译。最后,我们使用 `model.fit` 函数训练模型,并使用 `model.evaluate` 函数评估模型
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值