文章目录
安装环境
# 1. 环境安装pytorch
# 2. 安装tqdm
pip install ftfy regex tqdm
# 3. 安装clip
pip install git+https://github.com/openai/CLIP.git
# 内网使用pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git
API
# 1. 返回可以用的模型
clip.available_models()
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
# 2. 返回对应的模型和图像转换器
model, preprocess = clip.load("ViT-B/32")
# 3. preprocess将Image转换成tensor[3, 224, 224],然后unsqueeze(0)转成[batch_size, 3, 224, 224]后才能输入模型
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)
# 4. 将多个句子[batch_size]的每个句子转换成向量[batch_size, context_length]
# 每个句子开头加一个BOS(49406) EOS(49407),然后填充到长度context_length(默认值为77)
# (若长度大于context_length-2,需设置参数truncate=True,然后返回值为BOS 内容 EOS,即EOS没有被切割掉)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) # [3, 77]
# 5. 获取多个图片的特征
image_features = model.encode_image(image)
# 6. 获取多个文本的特征
text_features = model.encode_text(text)
# 7. 获取 多个图片和多个文本 之间余弦相似度(0~1)
logits_per_image, logits_per_text = model(image, text)
各模型shape&dtype
ViT-B/32
# 512 (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 512]) torch.float16
text_features torch.Size([B, 512]) torch.float16
ViT-B/16
# 512 (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 512]) torch.float16
text_features torch.Size([B, 512]) torch.float16
ViT-L/14
# 768 (224, 224)
image torch.Size([B, 3, 224, 224]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 768]) torch.float16
text_features torch.Size([B, 768]) torch.float16
ViT-L/14@336px
# 768 (336, 336)
image torch.Size([B, 3, 336, 336]) torch.float32
text torch.Size([B, 77]) torch.int32
image_features torch.Size([B, 768]) torch.float16
text_features torch.Size([B, 768]) torch.float16
各模型preprocess&tokenize
# 无论load哪个模型,clip.tokenize返回值均相同
# preprocess的不同在于返回的图片尺寸不同,
# 因此 ViT-B/32 ViT-B/16 ViT-L/14的preprocess返回值相同,ViT-L/14@336px的preprocess返回值与它们不同
简单使用
示例1
import torch
import clip
from PIL import Image

# Use the GPU when one is available; CLIP also runs (slower) on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# preprocess: PIL Image -> float tensor [3, 224, 224]; unsqueeze(0) adds the
# batch dimension so the model receives [1, 3, 224, 224].
image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
# tokenize: 3 sentences -> int tensor [3, 77] (BOS/EOS added, padded to 77).
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # The forward pass returns cosine-similarity logits in both directions.
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]
示例2
import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset (test split; raw PIL images, preprocess applied below)
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs: one image plus a text prompt per CIFAR-100 class
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image.
# L2-normalize so the dot product below is a cosine similarity.
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# The 100x factor is a softmax temperature (CLIP's learned logit scale is
# ~100): it sharpens the distribution, so it DOES change the probabilities,
# not merely their presentation.
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")
示例3
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset (preprocess is applied inside the DataLoader)
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    """Encode every image in *dataset* with CLIP.

    Returns a tuple ``(features, labels)`` of numpy arrays.
    """
    all_features = []
    all_labels = []
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))
            all_features.append(features)
            all_labels.append(labels)
    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()


# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression on the frozen CLIP features
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier.
# Note: np.float was deprecated in NumPy 1.20 and removed in 1.24 —
# use the builtin float instead.
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")
(重要)固定或更新CLIP参数
关于detach
# Our model only uses CLIP's visual encoder, so checking whether the visual
# encoder's parameters changed is enough.
# With neither spot 1 nor spot 2 enabled, everything prints False,
# i.e. all parameters were updated.
# With only spot 1 enabled, CLIP prints True and Linear prints False,
# i.e. only the Linear layer's parameters are updated.
# With only spot 2 enabled, CLIP prints False and Linear prints True,
# i.e. only CLIP's parameters are updated.
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):
    """CLIP visual encoder followed by a 10-way linear classification head."""

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)
        # Spot 2: freeze the linear head so only CLIP receives updates.
        # for param in self.linear.parameters():
        #     param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)
        # Spot 1: detach cuts the graph here, so no gradient reaches CLIP.
        # features = features.detach()
        return self.linear(features)


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))  # a single fixed batch

# Snapshot the initial parameters so we can tell afterwards what changed.
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())

# True => parameter unchanged (frozen); False => parameter was updated.
for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
CLIP层结构
VIT-B/32
CLIP(
# 图片相关
(visual): VisionTransformer(
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
# 文本相关
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 512)
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
VIT-B/16
CLIP(
# 图片相关
(visual): VisionTransformer(
(conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
# 文本相关
(token_embedding): Embedding(49408, 512)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
)
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=512, out_features=2048, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=2048, out_features=512, bias=True)
)
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
VIT-L/14
CLIP(
# 图片相关
(visual): VisionTransformer(
(conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
(ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(12): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(13): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(14): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(15): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(16): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(17): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(18): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(19): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(20): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(21): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(22): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(23): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
# 文本相关
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 768)
(ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
ViT-L/14@336px
CLIP(
# 图片相关
(visual): VisionTransformer(
(conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
(ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(12): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(13): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(14): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(15): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(16): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(17): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(18): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(19): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(20): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(21): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(22): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
(23): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
)
(ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=1024, out_features=4096, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=4096, out_features=1024, bias=True)
)
(ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
)
)
(ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)
# 文本相关
(transformer): Transformer(
(resblocks): Sequential(
(0): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(1): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(2): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(3): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(4): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(5): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(6): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(7): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(8): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(9): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(10): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(11): ResidualAttentionBlock(
(attn): MultiheadAttention(
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
)
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): Sequential(
(c_fc): Linear(in_features=768, out_features=3072, bias=True)
(gelu): QuickGELU()
(c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
(token_embedding): Embedding(49408, 768)
(ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
CLIP参数结构
ViT-B
包括ViT-B/32和ViT-B/16
# 计算余弦相似度时的权重,详细代码为:
# normalized features
# image_features = image_features / image_features.norm(dim=1, keepdim=True)
# text_features = text_features / text_features.norm(dim=1, keepdim=True)
# # cosine similarity as logits
# logit_scale = self.logit_scale.exp()
# logits_per_image = logit_scale * image_features @ text_features.t()
# logits_per_text = logits_per_image.t()
logit_scale
# 图片相关
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj
# 文本相关
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
ViT-L
包括ViT-L/14和ViT-L/14@336px
logit_scale
# 图片相关
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.transformer.resblocks.12.attn.in_proj_weight
visual.transformer.resblocks.12.attn.in_proj_bias
visual.transformer.resblocks.12.attn.out_proj.weight
visual.transformer.resblocks.12.attn.out_proj.bias
visual.transformer.resblocks.12.ln_1.weight
visual.transformer.resblocks.12.ln_1.bias
visual.transformer.resblocks.12.mlp.c_fc.weight
visual.transformer.resblocks.12.mlp.c_fc.bias
visual.transformer.resblocks.12.mlp.c_proj.weight
visual.transformer.resblocks.12.mlp.c_proj.bias
visual.transformer.resblocks.12.ln_2.weight
visual.transformer.resblocks.12.ln_2.bias
visual.transformer.resblocks.13.attn.in_proj_weight
visual.transformer.resblocks.13.attn.in_proj_bias
visual.transformer.resblocks.13.attn.out_proj.weight
visual.transformer.resblocks.13.attn.out_proj.bias
visual.transformer.resblocks.13.ln_1.weight
visual.transformer.resblocks.13.ln_1.bias
visual.transformer.resblocks.13.mlp.c_fc.weight
visual.transformer.resblocks.13.mlp.c_fc.bias
visual.transformer.resblocks.13.mlp.c_proj.weight
visual.transformer.resblocks.13.mlp.c_proj.bias
visual.transformer.resblocks.13.ln_2.weight
visual.transformer.resblocks.13.ln_2.bias
visual.transformer.resblocks.14.attn.in_proj_weight
visual.transformer.resblocks.14.attn.in_proj_bias
visual.transformer.resblocks.14.attn.out_proj.weight
visual.transformer.resblocks.14.attn.out_proj.bias
visual.transformer.resblocks.14.ln_1.weight
visual.transformer.resblocks.14.ln_1.bias
visual.transformer.resblocks.14.mlp.c_fc.weight
visual.transformer.resblocks.14.mlp.c_fc.bias
visual.transformer.resblocks.14.mlp.c_proj.weight
visual.transformer.resblocks.14.mlp.c_proj.bias
visual.transformer.resblocks.14.ln_2.weight
visual.transformer.resblocks.14.ln_2.bias
visual.transformer.resblocks.15.attn.in_proj_weight
visual.transformer.resblocks.15.attn.in_proj_bias
visual.transformer.resblocks.15.attn.out_proj.weight
visual.transformer.resblocks.15.attn.out_proj.bias
visual.transformer.resblocks.15.ln_1.weight
visual.transformer.resblocks.15.ln_1.bias
visual.transformer.resblocks.15.mlp.c_fc.weight
visual.transformer.resblocks.15.mlp.c_fc.bias
visual.transformer.resblocks.15.mlp.c_proj.weight
visual.transformer.resblocks.15.mlp.c_proj.bias
visual.transformer.resblocks.15.ln_2.weight
visual.transformer.resblocks.15.ln_2.bias
visual.transformer.resblocks.16.attn.in_proj_weight
visual.transformer.resblocks.16.attn.in_proj_bias
visual.transformer.resblocks.16.attn.out_proj.weight
visual.transformer.resblocks.16.attn.out_proj.bias
visual.transformer.resblocks.16.ln_1.weight
visual.transformer.resblocks.16.ln_1.bias
visual.transformer.resblocks.16.mlp.c_fc.weight
visual.transformer.resblocks.16.mlp.c_fc.bias
visual.transformer.resblocks.16.mlp.c_proj.weight
visual.transformer.resblocks.16.mlp.c_proj.bias
visual.transformer.resblocks.16.ln_2.weight
visual.transformer.resblocks.16.ln_2.bias
visual.transformer.resblocks.17.attn.in_proj_weight
visual.transformer.resblocks.17.attn.in_proj_bias
visual.transformer.resblocks.17.attn.out_proj.weight
visual.transformer.resblocks.17.attn.out_proj.bias
visual.transformer.resblocks.17.ln_1.weight
visual.transformer.resblocks.17.ln_1.bias
visual.transformer.resblocks.17.mlp.c_fc.weight
visual.transformer.resblocks.17.mlp.c_fc.bias
visual.transformer.resblocks.17.mlp.c_proj.weight
visual.transformer.resblocks.17.mlp.c_proj.bias
visual.transformer.resblocks.17.ln_2.weight
visual.transformer.resblocks.17.ln_2.bias
visual.transformer.resblocks.18.attn.in_proj_weight
visual.transformer.resblocks.18.attn.in_proj_bias
visual.transformer.resblocks.18.attn.out_proj.weight
visual.transformer.resblocks.18.attn.out_proj.bias
visual.transformer.resblocks.18.ln_1.weight
visual.transformer.resblocks.18.ln_1.bias
visual.transformer.resblocks.18.mlp.c_fc.weight
visual.transformer.resblocks.18.mlp.c_fc.bias
visual.transformer.resblocks.18.mlp.c_proj.weight
visual.transformer.resblocks.18.mlp.c_proj.bias
visual.transformer.resblocks.18.ln_2.weight
visual.transformer.resblocks.18.ln_2.bias
visual.transformer.resblocks.19.attn.in_proj_weight
visual.transformer.resblocks.19.attn.in_proj_bias
visual.transformer.resblocks.19.attn.out_proj.weight
visual.transformer.resblocks.19.attn.out_proj.bias
visual.transformer.resblocks.19.ln_1.weight
visual.transformer.resblocks.19.ln_1.bias
visual.transformer.resblocks.19.mlp.c_fc.weight
visual.transformer.resblocks.19.mlp.c_fc.bias
visual.transformer.resblocks.19.mlp.c_proj.weight
visual.transformer.resblocks.19.mlp.c_proj.bias
visual.transformer.resblocks.19.ln_2.weight
visual.transformer.resblocks.19.ln_2.bias
visual.transformer.resblocks.20.attn.in_proj_weight
visual.transformer.resblocks.20.attn.in_proj_bias
visual.transformer.resblocks.20.attn.out_proj.weight
visual.transformer.resblocks.20.attn.out_proj.bias
visual.transformer.resblocks.20.ln_1.weight
visual.transformer.resblocks.20.ln_1.bias
visual.transformer.resblocks.20.mlp.c_fc.weight
visual.transformer.resblocks.20.mlp.c_fc.bias
visual.transformer.resblocks.20.mlp.c_proj.weight
visual.transformer.resblocks.20.mlp.c_proj.bias
visual.transformer.resblocks.20.ln_2.weight
visual.transformer.resblocks.20.ln_2.bias
visual.transformer.resblocks.21.attn.in_proj_weight
visual.transformer.resblocks.21.attn.in_proj_bias
visual.transformer.resblocks.21.attn.out_proj.weight
visual.transformer.resblocks.21.attn.out_proj.bias
visual.transformer.resblocks.21.ln_1.weight
visual.transformer.resblocks.21.ln_1.bias
visual.transformer.resblocks.21.mlp.c_fc.weight
visual.transformer.resblocks.21.mlp.c_fc.bias
visual.transformer.resblocks.21.mlp.c_proj.weight
visual.transformer.resblocks.21.mlp.c_proj.bias
visual.transformer.resblocks.21.ln_2.weight
visual.transformer.resblocks.21.ln_2.bias
visual.transformer.resblocks.22.attn.in_proj_weight
visual.transformer.resblocks.22.attn.in_proj_bias
visual.transformer.resblocks.22.attn.out_proj.weight
visual.transformer.resblocks.22.attn.out_proj.bias
visual.transformer.resblocks.22.ln_1.weight
visual.transformer.resblocks.22.ln_1.bias
visual.transformer.resblocks.22.mlp.c_fc.weight
visual.transformer.resblocks.22.mlp.c_fc.bias
visual.transformer.resblocks.22.mlp.c_proj.weight
visual.transformer.resblocks.22.mlp.c_proj.bias
visual.transformer.resblocks.22.ln_2.weight
visual.transformer.resblocks.22.ln_2.bias
visual.transformer.resblocks.23.attn.in_proj_weight
visual.transformer.resblocks.23.attn.in_proj_bias
visual.transformer.resblocks.23.attn.out_proj.weight
visual.transformer.resblocks.23.attn.out_proj.bias
visual.transformer.resblocks.23.ln_1.weight
visual.transformer.resblocks.23.ln_1.bias
visual.transformer.resblocks.23.mlp.c_fc.weight
visual.transformer.resblocks.23.mlp.c_fc.bias
visual.transformer.resblocks.23.mlp.c_proj.weight
visual.transformer.resblocks.23.mlp.c_proj.bias
visual.transformer.resblocks.23.ln_2.weight
visual.transformer.resblocks.23.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj
# 文本相关
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection
仅优化CLIP的后几层参数
def get_optim_params(model_name: str) -> list:
    """Return the parameter names of the last transformer block of each tower.

    Used to fine-tune only the final layers of CLIP: the last visual
    resblock + ``visual.ln_post``/``visual.proj``, and the last text
    resblock + ``ln_final``/``text_projection``.

    Args:
        model_name: one of 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'.

    Returns:
        List of parameter-name strings matching ``model.named_parameters()``.

    Raises:
        ValueError: if ``model_name`` is not a supported model.
        (Previously this printed a message and returned None, which made the
        caller fail later with an opaque TypeError.)
    """
    # Index of the last visual resblock per model; the text tower always
    # has 12 blocks (last index 11) for all of these models.
    visual_last_block = {'ViT-B/32': 11, 'ViT-B/16': 11,
                         'ViT-L/14': 23, 'ViT-L/14@336px': 23}
    if model_name not in visual_last_block:
        raise ValueError(f"no {model_name}")

    # Parameter suffixes of one ResidualAttentionBlock, in named_parameters order.
    block_suffixes = ['attn.in_proj_weight', 'attn.in_proj_bias',
                      'attn.out_proj.weight', 'attn.out_proj.bias',
                      'ln_1.weight', 'ln_1.bias',
                      'mlp.c_fc.weight', 'mlp.c_fc.bias',
                      'mlp.c_proj.weight', 'mlp.c_proj.bias',
                      'ln_2.weight', 'ln_2.bias']

    v = visual_last_block[model_name]
    params = [f'visual.transformer.resblocks.{v}.{s}' for s in block_suffixes]
    params += ['visual.ln_post.weight', 'visual.ln_post.bias', 'visual.proj']
    params += [f'transformer.resblocks.11.{s}' for s in block_suffixes]
    params += ['ln_final.weight', 'ln_final.bias', 'text_projection']
    return params
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch
class Net(nn.Module):
    """Wrap CLIP ViT-B/32 with every parameter frozen except the last layers.

    Attributes:
        model: the loaded CLIP model (on CPU).
        preprocess: the matching torchvision image transform for this model.
    """
    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        # Fixed: original line ended with a stray ':' (SyntaxError).
        optim_params = get_optim_params('ViT-B/32')
        # Freeze everything that is not in the last-block parameter list.
        for name, param in self.model.named_parameters():
            if name not in optim_params:
                param.requires_grad = False

    def forward(self, image, text):
        """Return (image_features, text_features) from the two CLIP towers."""
        image_features = self.model.encode_image(image)
        text_features = self.model.encode_text(text)
        return image_features, text_features
net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

# Grab one CIFAR-10 batch of 8: images go through CLIP's preprocess,
# labels become tokenized text prompts.
root = os.path.expanduser("~/.cache")
cifar10 = CIFAR10(root, download=True, train=True, transform=net.preprocess)
images, labels = next(iter(DataLoader(cifar10, batch_size=8)))
texts = torch.cat([clip.tokenize(f"a photo of a {cifar10.classes[c]}") for c in labels])

# Snapshot every parameter so we can see afterwards which ones training moved.
initial = {name: p.detach().clone() for name, p in net.named_parameters()}

for _ in range(10):
    image_features, text_features = net(images, texts)
    loss = F.mse_loss(image_features, text_features)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss.item())

# Only the unfrozen (last-block) parameters should show up here.
for name, p in net.named_parameters():
    if not torch.equal(p, initial[name]):
        print(f"{name}")