ViT learning notes and code examples (runnable, to aid understanding)

Principle:
[Figures from the original post illustrating the ViT architecture and pipeline]
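In brief, and exactly as the toy code below does it: an H×W image with C channels is cut into non-overlapping P×P patches, giving N = (H/P)·(W/P) patches; each patch is flattened to P·P·C values and linearly projected to model_dim, a learnable CLS token is prepended, learnable position embeddings are added, the sequence goes through a standard Transformer encoder, and the CLS output feeds a linear classification head. Worked through for the 8×8×3 toy image below with P = 4 and model_dim = 8: N = (8/4)·(8/4) = 4 patches, each flattened to 4·4·3 = 48 values, so the encoder sees a sequence of 1 + 4 = 5 tokens of dimension 8.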

Key code implementation examples:
(1) Key modules

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy
from PIL import Image

# test code for image2emb
batch_size, imageChannel, width, height = 1, 3, 8, 8
patch_size = 4
model_dim = 8
max_num_token = 16
patch_depth = patch_size * patch_size * imageChannel  # number of values in one flattened patch (P * P * C)
image = torch.randn(batch_size, imageChannel, width, height)
weight = torch.randn(patch_depth, model_dim)  # viewed as a Conv2d: model_dim is the output channel count, patch_depth = kernel_size^2 * input channels


# 1: convert image to embedding vector sequence
def image2emb_naive(image, patch_size, weight):
    # image shape: batch_size*imageChannel*width*height
    patch = F.unfold(image, kernel_size=patch_size, stride=patch_size).transpose(-1, -2)  # (batch, num_patches, patch_depth)
    patch_embedding = patch @ weight  # project each flattened patch (patch_depth) to an NLP-style token embedding of size model_dim
    return patch_embedding


# patch_embeddding_naive = image2emb_naive(image, patch_size, weight)
# print(patch_embeddding_naive.shape)


kernel = weight.transpose(0, 1).reshape((-1, imageChannel, patch_size, patch_size))  # Conv2d kernel shape: (outputChannel=model_dim, inputChannel=imageChannel, patch_size, patch_size)


def image2emb_conv(image, kernel, stride):
    conv_output = F.conv2d(image, kernel, stride=stride)  # (batch_size, outputChannel, height', width')
    batch_size, outChannel, height, width = conv_output.shape
    patch_embedding = conv_output.reshape(batch_size, outChannel, height * width).transpose(-1, -2)  # (batch, num_patches, model_dim)
    return patch_embedding


patch_embeddding_conv = image2emb_conv(image, kernel, patch_size)
print(patch_embeddding_conv.shape)
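# Sanity check (optional, my addition): the unfold-based and conv-based paths should produce
# the same embedding, since the conv kernel was built from the same weight matrix.
patch_embeddding_naive = image2emb_naive(image, patch_size, weight)
print(torch.allclose(patch_embeddding_naive, patch_embeddding_conv, atol=1e-5))  # expected: True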


##########################################
# 2: add prepared CLS token embedding
cls_token_embedding = torch.randn(batch_size, 1, model_dim, requires_grad=True)
token_embedding = torch.cat([cls_token_embedding, patch_embeddding_conv], dim=1)
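# token_embedding: (batch_size, 1 + num_patches, model_dim) = (1, 5, 8) with the toy settings above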


#########################################
# 3: add position embedding
position_embedding_table = torch.randn(max_num_token, model_dim, requires_grad=True)
seq_len = token_embedding.shape[1]
position_embedding = torch.tile(position_embedding_table[:seq_len], [token_embedding.shape[0], 1, 1])
token_embedding += position_embedding
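# the first seq_len rows of the table are tiled across the batch dimension and added element-wise to the tokens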


#########################################
# 4:pass embedding to Transformer Encoder

encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=8, batch_first=True)  # batch_first=True since token_embedding is (batch, seq, dim)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)


encoder_output = transformer_encoder(token_embedding)  # an attention mask could also be passed here if needed


# 5: do classification
cls_token_output = encoder_output[:, 0, :]  # take the CLS position: (batch_size, model_dim)
num_classes = 10
label = torch.randint(10, (batch_size,))
linear_layer = nn.Linear(model_dim, num_classes)
logits = linear_layer(cls_token_output)  # raw logits, no softmax applied (CrossEntropyLoss expects logits)
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(logits, label)
print(loss)
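# with the toy settings above: logits has shape (1, 10); loss is a scalar tensor (its value varies with the random init)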

Result: the printed patch-embedding shape is torch.Size([1, 4, 8]), followed by a scalar cross-entropy loss (its exact value depends on the random initialization).

(2) Organizing it into a standard training setup

###################################################################
# trainable form (standard train/test setup; no mask is added in the encoder)
import os
import numpy as np
import torch
from PIL import Image
import matplotlib.pyplot as plt
from torch import nn, optim
from torch.utils.data import DataLoader
from torchvision import models
import torch.nn.functional as F
# from pytorch_grad_cam import GradCAM                       # not used below; requires the grad-cam package
# from pytorch_grad_cam.utils.image import show_cam_on_image
from torchvision import transforms


# The naive form above builds the embedding by flattening patches directly (the NLP-style construction).
# This version builds it with a convolution, so it can be viewed as a CNN + Transformer form (further variations follow the same idea).
def image2emb_conv(image, kernel, stride):
    conv_output = F.conv2d(image, kernel, stride=stride)  # (batch_size, outputChannel, height', width')
    batch_size, outChannel, height, width = conv_output.shape
    patch_embedding = conv_output.reshape(batch_size, outChannel, height * width).transpose(-1, -2)  # (batch, num_patches, model_dim)
    # print(patch_embedding.shape)
    return patch_embedding


def make_token_embedding(patch_embeddding_conv):
    # print(patch_embeddding_conv.shape)
    # Build the ViT input: [CLS embedding; N patch embeddings], i.e. [CLS embedding; height'*width' embeddings]
    # N: patch_num, D: model_dim (the conv output channel count)
    # height * width == patch_num * patch_size ^ 2

    # 2: add prepared CLS token embedding (class embedding)
    # Note: instead of a CLS embedding, one can also average-pool the ViT outputs to get the final image representation
    # The ViT paper adds the special token (BERT-style) to keep the model as close as possible to the original Transformer
    # (for real training, cls_token_embedding and position_embedding_table should be nn.Parameters so they get learned)
    cls_token_embedding = torch.randn(batch_size, 1, model_dim, requires_grad=True)
    token_embedding = torch.cat([cls_token_embedding, patch_embeddding_conv], dim=1)
    # print(token_embedding.shape)

    # 3: add position embedding of shape (patch_num + 1, model_dim), i.e. (N + 1, D); this code uses a 1-D positional
    #    encoding, e.g. a 3x3 grid of 9 patches is simply numbered 1 to 9
    # A 2-D positional encoding is also possible, e.g. patches numbered 11,12,13,21,22,23,31,32,33
    # Relative positional encoding is another option (e.g. patch 9 is at distance 8 from patch 1), but each position
    #    still has to be expanded into an embedding or injected through a mask/bias
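    # Illustrative sketch only (my assumption, not from the original post): a 2-D variant could keep one learnable
    # table per row and one per column and sum them:
    #   row_table = torch.randn(num_rows, model_dim); col_table = torch.randn(num_cols, model_dim)
    #   pos_2d = (row_table[:, None, :] + col_table[None, :, :]).reshape(num_rows * num_cols, model_dim)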
    position_embedding_table = torch.randn(max_num_token, model_dim, requires_grad=True)
    seq_len = token_embedding.shape[1]

    position_embedding = torch.tile(position_embedding_table[:seq_len], [token_embedding.shape[0], 1, 1])
    # print(position_embedding.shape)
    token_embedding += position_embedding

    return token_embedding  # (batch_size, num_patches + 1, model_dim)


#########################################
# 4: pass embedding to the ViT model
class VitModel(nn.Module):
    def __init__(self):
        super(VitModel, self).__init__()
        # build the encoder once in __init__ (building it inside forward would re-initialize its weights on every call)
        encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=nhead, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.linear = nn.Linear(model_dim, num_classes)

    def forward(self, token_embedding):
        x = self.transformer_encoder(token_embedding)
        cls_token_output = x[:, 0, :]  # (batch_size, model_dim): take the CLS position
        y = self.linear(cls_token_output)  # return raw logits; CrossEntropyLoss applies log-softmax itself
        return y


# def train(model, dataset, lr, batch_size, num_epochs):
#     data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, drop_last=False)
#     criterion = nn.CrossEntropyLoss()
#     optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.99)
#     for epoch in range(num_epochs):
#         losses = 0
#         for images, targets in data_loader:
#             token_embedding = make_token_embedding(image2emb_conv(images, kernel, patch_size))
#             outputs = model(token_embedding)
#             # print(outputs.shape)
#             loss = criterion(outputs, targets)  # how the train/test data and labels are set up strongly affects model quality
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#             losses = losses + loss.item()
#         if (epoch + 1) % 5 == 0:
#             print('Epoch:', '%04d' % (epoch + 1), 'loss =', '{:.6f}'.format(losses / (data_loader.__len__())))


if __name__ == '__main__':
    # ViT is usually pre-trained on a very large dataset and then fine-tuned on a relatively small downstream dataset;
    # studies have shown that fine-tuning at a higher resolution than the pre-training resolution often works better
    image = Image.open("./cat.png").convert('RGB')
    image = transforms.Resize((450, 450))(image)  # resize to a 450x450 square (an (h, w) tuple does not preserve the aspect ratio)
    # image = transforms.Resize(448)(image)  # passing a single int instead keeps the aspect ratio (shorter side resized to 448)
    image = transforms.ToTensor()(image)
    image = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])(image)

    ############################################################
    # 1: make token_embedding attributes
    batch_size = 1
    imageChannel, height, width = image.shape  # tensor layout is (C, H, W)
    # print(imageChannel, height, width)
    image = image.unsqueeze(0)  # add the batch dimension
    # print(image.shape)
    patch_size = 4
    model_dim = 8
    patch_depth = patch_size * patch_size * imageChannel  # number of values in one flattened patch
    weight = torch.randn(model_dim, patch_depth)  # as a Conv2d: model_dim is the output channel count, patch_depth = kernel_size^2 * input channels
    kernel = weight.reshape((-1, imageChannel, patch_size, patch_size))

    patch_embeddding_conv = image2emb_conv(image, kernel, patch_size)

    max_num_token = height * width + 1  # height*width == patch_num * patch_size^2, so this is a loose upper bound; it only needs to satisfy max_num_token >= patch_num + 1 (patches + CLS)

    token_embedding = make_token_embedding(patch_embeddding_conv)

    ##############################
    # define model attributes
    nhead = 8
    num_layers = 6
    num_classes = 10

    model = VitModel()
    label = torch.randint(10, (batch_size,))
    criterion = nn.CrossEntropyLoss()
    output = model(token_embedding)
    loss = criterion(output, label)
    print(loss)
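    # output: (1, num_classes) logits; loss: scalar tensor whose value varies with the random initialization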

Input image: cat.png
[Figure from the original post: the cat.png input image]
Result: a scalar cross-entropy loss is printed (its value varies with the random initialization).

Plus:
For more detail on the underlying principles, see these two links:
https://zhuanlan.zhihu.com/p/445122996

https://blog.csdn.net/verse_armour/article/details/128336786?utm_medium=distribute.pc_relevant.none-task-blog-2defaultbaidujs_baidulandingword~default-0-128336786-blog-122799541.pc_relevant_3mothn_strategy_and_data_recovery&spm=1001.2101.3001.4242.1&utm_relevant_index=3
Hugging Face tooling: https://blog.csdn.net/m0_56722835/article/details/127437259

The above is only a ViT learning example; ViT itself is generally too expensive for an individual to train from scratch, but using a pre-trained ViT for transfer learning on downstream tasks, or borrowing the ViT structure for your own tasks, works fine.
For example:
[Figure from the original post: an example downstream application]
You can also take the transformer outputs of the image patches, apply a GAP (global average pooling), and then do the final classification; note as well that hyperparameter tuning matters.
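A minimal sketch of that GAP-style head (my own illustration, not from the original post; shapes follow the toy settings from part (1)):

import torch
import torch.nn as nn

encoder_output = torch.randn(1, 5, 8)    # (batch, 1 CLS + 4 patch tokens, model_dim); dummy values for illustration
patch_tokens = encoder_output[:, 1:, :]  # keep only the patch tokens (the CLS token could be dropped entirely)
pooled = patch_tokens.mean(dim=1)        # global average pooling over the patch tokens -> (batch, model_dim)
head = nn.Linear(8, 10)                  # classification head: model_dim -> num_classes
logits = head(pooled)                    # (batch, num_classes)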
