Introduction
CLIP (Contrastive Language-Image Pre-training) is a multimodal pre-training model released by OpenAI in 2021. Trained with a contrastive objective on 400 million (400M) image-text pairs, it exhibits strong zero-shot transfer ability.
Paper: https://arxiv.org/pdf/2103.00020.pdf
Code: https://github.com/openai/CLIP
Model architecture diagram:

CLIP Code Structure
The annotated pseudocode from the paper:
# image_encoder - the image encoder: ResNet or ViT
# text_encoder  - the text encoder: CBOW or Text Transformer
# I[n, h, w, c] - a minibatch of images, e.g. [16, 224, 224, 3]
# T[n, l]       - a minibatch of texts; n is the batch size, l the sequence length
# W_i[d_i, d_e] - the image projection, mapping the image feature into the joint multimodal space
# W_t[d_t, d_e] - the text projection, mapping the text feature into the joint multimodal space
# t             - the learnable temperature parameter
# extract the unimodal feature of each modality
I_f = image_encoder(I) # output shape [n, d_i]
T_f = text_encoder(T) # output shape [n, d_t]
# project into the joint multimodal embedding space and L2-normalize
I_e = l2_normalize(np.dot(I_f, W_i), axis=1) # output shape [n, d_e]
T_e = l2_normalize(np.dot(T_f, W_t), axis=1) # output shape [n, d_e]
# pairwise cosine similarities: the dot product of I_e with the transpose of T_e,
# scaled by np.exp(t), the exponential of the learnable temperature parameter.
# This scaling stretches the cosine similarities before the softmax so that
# matching pairs can be clearly separated from non-matching ones.
logits = np.dot(I_e, T_e.T) * np.exp(t) # output shape [n, n]
# symmetric cross-entropy loss; the matching pairs lie on the diagonal
labels = np.arange(n)
loss_i = cross_entropy_loss(logits, labels, axis=0) # image-side cross-entropy
loss_t = cross_entropy_loss(logits, labels, axis=1) # text-side cross-entropy
loss = (loss_i + loss_t) / 2 # average of the image and text losses
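To make the pseudocode concrete, here is a minimal runnable PyTorch sketch of the same symmetric loss, using random tensors in place of real encoder outputs (all dimensions are illustrative; t is initialized to log(1/0.07) as in CLIP):

```python
import torch
import torch.nn.functional as F

n, d_i, d_t, d_e = 16, 2048, 512, 512       # illustrative sizes
I_f = torch.randn(n, d_i)                   # stand-in for image_encoder(I)
T_f = torch.randn(n, d_t)                   # stand-in for text_encoder(T)
W_i = torch.randn(d_i, d_e)                 # image projection
W_t = torch.randn(d_t, d_e)                 # text projection
t = torch.log(torch.tensor(1 / 0.07))       # CLIP's temperature initialization

I_e = F.normalize(I_f @ W_i, dim=1)         # [n, d_e], L2-normalized
T_e = F.normalize(T_f @ W_t, dim=1)         # [n, d_e], L2-normalized
logits = I_e @ T_e.T * t.exp()              # [n, n] scaled cosine similarities

labels = torch.arange(n)                    # matching pairs on the diagonal
loss_i = F.cross_entropy(logits, labels)    # each image against all texts
loss_t = F.cross_entropy(logits.T, labels)  # each text against all images
loss = (loss_i + loss_t) / 2
print(loss.item())
```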
Understanding the temperature parameter t:
When CLIP compares image and text embeddings, the raw cosine similarities sit in a narrow range (roughly 0 to 1), where they are hard to tell apart; the softmax can barely widen the gap between two pairs whose similarities differ only slightly. The temperature amplifies the similarities (dividing by a small τ, or equivalently multiplying by exp(t) in the pseudocode above) so that they become clearly distinguishable, as the quick numerical check below shows.
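A quick numerical check of this effect (the similarity values are made up for illustration):

```python
import torch
import torch.nn.functional as F

sims = torch.tensor([0.30, 0.25, 0.20])  # hypothetical cosine similarities
print(F.softmax(sims, dim=0))            # ~[0.350, 0.333, 0.317] -- nearly uniform
print(F.softmax(sims / 0.07, dim=0))     # ~[0.578, 0.283, 0.139] -- clearly separated
```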
The temperature parameter originates in the InfoNCE loss:

$$\text{InfoNCE Loss} = -\frac{1}{N} \sum_{i=1}^{N} \log \frac{\exp(q_i \cdot k_{i^+} / \tau)}{\sum_{j=1}^{N} \exp(q_i \cdot k_j / \tau)}$$

where $q_i$ is a query, $k_{i^+}$ is its positive key, the denominator sums over all $N$ keys (the positive plus the negatives), and $\tau$ is the temperature. A representative PyTorch implementation in the style of MoCo, where q and k are the query and key batches and self.queue is the memory bank of negative keys:
# temperature
self.T = 0.07
# compute similarities
# positive logits: Nx1 -- each query against its own key
s_pos = torch.sum(q * k, dim=1).unsqueeze(dim=1)
# negative logits: NxK -- each query against the queued negatives
s_neg = torch.matmul(q, self.queue.clone().detach().T)
# concatenated logits: Nx(1+K)
logits = torch.cat([s_pos, s_neg], dim=1)
logits /= self.T
# labels: the positive key sits at index 0 for every query
labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()
# InfoNCE loss
loss = F.cross_entropy(logits, labels)
CLIP Forward Architecture
def forward(self, image, text):
    image_features = self.encode_image(image)
    text_features = self.encode_text(text)

    # normalized features
    image_features = image_features / image_features.norm(dim=1, keepdim=True)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)

    # cosine similarity as logits
    logit_scale = self.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()

    # shape = [global_batch_size, global_batch_size]
    return logits_per_image, logits_per_text
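These logits feed directly into zero-shot classification: a softmax over logits_per_image scores each candidate caption for an image. A minimal sketch with the official clip package (the image path and captions are placeholders):

```python
import clip
import torch
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("example.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(["a photo of a cat", "a photo of a dog"]).to(device)

with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1)  # probability of each caption
print(probs)
```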
CLIP Image Encoding Module
Image preprocessing
def _transform(input_resolution):
    return Compose([
        Resize(input_resolution, interpolation=BICUBIC),
        CenterCrop(input_resolution),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
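Applied to a PIL image, this pipeline produces a normalized tensor ready for the encoder (a sketch; the file name is a placeholder, and Compose, Resize, etc. are assumed imported from torchvision.transforms as in CLIP's clip.py):

```python
from PIL import Image

preprocess = _transform(224)               # 224 is the input resolution of ViT-B/32 and RN50
x = preprocess(Image.open("example.jpg"))  # -> [3, 224, 224], normalized
x = x.unsqueeze(0)                         # add a batch dimension: [1, 3, 224, 224]
```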
encode_image
This module offers two encoder backbones: a modified ResNet and a Vision Transformer.
The ModifiedResNet backbone:
class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            x = self.relu1(self.bn1(self.conv1(x)))
            x = self.relu2(self.bn2(self.conv2(x)))
            x = self.relu3(self.bn3(self.conv3(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)

        return x
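For reference, the RN50 variant corresponds to the configuration below (a sketch that assumes the Bottleneck and AttentionPool2d classes from CLIP's model.py are in scope):

```python
import torch

# RN50: layers=(3, 4, 6, 3), 1024-d output embedding, 32 attention-pool heads
model = ModifiedResNet(layers=(3, 4, 6, 3), output_dim=1024, heads=32,
                       input_resolution=224, width=64)
features = model(torch.randn(1, 3, 224, 224))  # -> [1, 1024]
```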
The VisionTransformer backbone:
class VisionTransformer(nn.Module):
    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x
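ViT-B/32 corresponds to the configuration below (a sketch that assumes the LayerNorm and Transformer classes from CLIP's model.py are in scope):

```python
import torch

# ViT-B/32: 32x32 patches, width 768, 12 layers, 12 heads, 512-d output embedding
model = VisionTransformer(input_resolution=224, patch_size=32, width=768,
                          layers=12, heads=12, output_dim=512)
features = model(torch.randn(1, 3, 224, 224))  # -> [1, 512]
```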
CLIP Text Encoding Module
def encode_text(self, text):
    x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

    x = x + self.positional_embedding.type(self.dtype)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x = self.ln_final(x).type(self.dtype)

    # x.shape = [batch_size, n_ctx, transformer.width]
    # take features from the eot embedding (eot_token is the highest number in each sequence)
    x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

    return x
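The argmax in the last line works because the end-of-text token has the largest id in CLIP's vocabulary (49407), so taking the argmax over each padded token sequence returns the position of the EOT embedding. A small illustration (the ids between start and end are made-up word tokens):

```python
import torch

# 49406 = start-of-text, 49407 = end-of-text, 0 = padding
text = torch.tensor([[49406, 320, 1125, 49407, 0, 0],
                     [49406, 320, 49407, 0, 0, 0]])
print(text.argmax(dim=-1))  # tensor([3, 2]) -- the EOT position in each row
```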
CLIP Model Parameters
Parameter count of the ViT-B/32 image_encoder module:
Total params: 59,068,416
Trainable params: 59,068,416
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 349.28
Params size (MB): 225.33
Estimated Total Size (MB): 575.18
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 768, 7, 7] 2,359,296
LayerNorm-2 [-1, 50, 768] 1,536
LayerNorm-3 [-1, 2, 768] 1,536
MultiheadAttention-4 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-5 [-1, 2, 768] 1,536
Linear-6 [-1, 2, 3072] 2,362,368
QuickGELU-7 [-1, 2, 3072] 0
Linear-8 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-9 [-1, 2, 768] 0
LayerNorm-10 [-1, 2, 768] 1,536
MultiheadAttention-11 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-12 [-1, 2, 768] 1,536
Linear-13 [-1, 2, 3072] 2,362,368
QuickGELU-14 [-1, 2, 3072] 0
Linear-15 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-16 [-1, 2, 768] 0
LayerNorm-17 [-1, 2, 768] 1,536
MultiheadAttention-18 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-19 [-1, 2, 768] 1,536
Linear-20 [-1, 2, 3072] 2,362,368
QuickGELU-21 [-1, 2, 3072] 0
Linear-22 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-23 [-1, 2, 768] 0
LayerNorm-24 [-1, 2, 768] 1,536
MultiheadAttention-25 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-26 [-1, 2, 768] 1,536
Linear-27 [-1, 2, 3072] 2,362,368
QuickGELU-28 [-1, 2, 3072] 0
Linear-29 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-30 [-1, 2, 768] 0
LayerNorm-31 [-1, 2, 768] 1,536
MultiheadAttention-32 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-33 [-1, 2, 768] 1,536
Linear-34 [-1, 2, 3072] 2,362,368
QuickGELU-35 [-1, 2, 3072] 0
Linear-36 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-37 [-1, 2, 768] 0
LayerNorm-38 [-1, 2, 768] 1,536
MultiheadAttention-39 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-40 [-1, 2, 768] 1,536
Linear-41 [-1, 2, 3072] 2,362,368
QuickGELU-42 [-1, 2, 3072] 0
Linear-43 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-44 [-1, 2, 768] 0
LayerNorm-45 [-1, 2, 768] 1,536
MultiheadAttention-46 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-47 [-1, 2, 768] 1,536
Linear-48 [-1, 2, 3072] 2,362,368
QuickGELU-49 [-1, 2, 3072] 0
Linear-50 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-51 [-1, 2, 768] 0
LayerNorm-52 [-1, 2, 768] 1,536
MultiheadAttention-53 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-54 [-1, 2, 768] 1,536
Linear-55 [-1, 2, 3072] 2,362,368
QuickGELU-56 [-1, 2, 3072] 0
Linear-57 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-58 [-1, 2, 768] 0
LayerNorm-59 [-1, 2, 768] 1,536
MultiheadAttention-60 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-61 [-1, 2, 768] 1,536
Linear-62 [-1, 2, 3072] 2,362,368
QuickGELU-63 [-1, 2, 3072] 0
Linear-64 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-65 [-1, 2, 768] 0
LayerNorm-66 [-1, 2, 768] 1,536
MultiheadAttention-67 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-68 [-1, 2, 768] 1,536
Linear-69 [-1, 2, 3072] 2,362,368
QuickGELU-70 [-1, 2, 3072] 0
Linear-71 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-72 [-1, 2, 768] 0
LayerNorm-73 [-1, 2, 768] 1,536
MultiheadAttention-74 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-75 [-1, 2, 768] 1,536
Linear-76 [-1, 2, 3072] 2,362,368
QuickGELU-77 [-1, 2, 3072] 0
Linear-78 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-79 [-1, 2, 768] 0
LayerNorm-80 [-1, 2, 768] 1,536
MultiheadAttention-81 [[-1, 2, 768], [-1, 50, 50]] 0
LayerNorm-82 [-1, 2, 768] 1,536
Linear-83 [-1, 2, 3072] 2,362,368
QuickGELU-84 [-1, 2, 3072] 0
Linear-85 [-1, 2, 768] 2,360,064
ResidualAttentionBlock-86 [-1, 2, 768] 0
Transformer-87 [-1, 2, 768] 0
LayerNorm-88 [-1, 768] 1,536
================================================================
Parameter count of the RN50 image_encoder module:
Total params: 23,527,264
Trainable params: 23,527,264
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 331.71
Params size (MB): 89.75
Estimated Total Size (MB): 422.04
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 32, 112, 112] 864
BatchNorm2d-2 [-1, 32, 112, 112] 64
ReLU-3 [-1, 32, 112, 112] 0
Conv2d-4 [-1, 32, 112, 112] 9,216
BatchNorm2d-5 [-1, 32, 112, 112] 64
ReLU-6 [-1, 32, 112, 112] 0
Conv2d-7 [-1, 64, 112, 112] 18,432
BatchNorm2d-8 [-1, 64, 112, 112] 128
ReLU-9 [-1, 64, 112, 112] 0
AvgPool2d-10 [-1, 64, 56, 56] 0
Conv2d-11 [-1, 64, 56, 56] 4,096
BatchNorm2d-12 [-1, 64, 56, 56] 128
ReLU-13 [-1, 64, 56, 56] 0
Conv2d-14 [-1, 64, 56, 56] 36,864
BatchNorm2d-15 [-1, 64, 56, 56] 128
ReLU-16 [-1, 64, 56, 56] 0
Identity-17 [-1, 64, 56, 56] 0
Conv2d-18 [-1, 256, 56, 56] 16,384
BatchNorm2d-19 [-1, 256, 56, 56] 512
AvgPool2d-20 [-1, 64, 56, 56] 0
Conv2d-21 [-1, 256, 56, 56] 16,384
BatchNorm2d-22 [-1, 256, 56, 56] 512
ReLU-23 [-1, 256, 56, 56] 0
Bottleneck-24 [-1, 256, 56, 56] 0
Conv2d-25 [-1, 64, 56, 56] 16,384
BatchNorm2d-26 [-1, 64, 56, 56] 128
ReLU-27 [-1, 64, 56, 56] 0
Conv2d-28 [-1, 64, 56, 56] 36,864
BatchNorm2d-29 [-1, 64, 56, 56] 128
ReLU-30 [-1, 64, 56, 56] 0
Identity-31 [-1, 64, 56, 56] 0
Conv2d-32 [-1, 256, 56, 56] 16,384
BatchNorm2d-33 [-1, 256, 56, 56] 512
ReLU-34 [-1, 256, 56, 56] 0
Bottleneck-35 [-1, 256, 56, 56] 0
Conv2d-36 [-1, 64, 56, 56] 16,384
BatchNorm2d-37 [-1, 64, 56, 56] 128
ReLU-38 [-1, 64, 56, 56] 0
Conv2d-39 [-1, 64, 56, 56] 36,864
BatchNorm2d-40 [-1, 64, 56, 56] 128
ReLU-41 [-1, 64, 56, 56] 0
Identity-42 [-1, 64, 56, 56] 0
Conv2d-43 [-1, 256, 56, 56] 16,384
BatchNorm2d-44 [-1, 256, 56, 56] 512
ReLU-45 [-1, 256, 56, 56] 0
Bottleneck-46 [-1, 256, 56, 56] 0
Conv2d-47 [-1, 128, 56, 56] 32,768
BatchNorm2d-48 [-1, 128, 56, 56] 256
ReLU-49 [-1, 128, 56, 56] 0
Conv2d-50 [-1, 128, 56, 56] 147,456
BatchNorm2d-51 [-1, 128, 56, 56] 256
ReLU-52 [-1, 128, 56, 56] 0
AvgPool2d-53 [-1, 128, 28, 28] 0
Conv2d-54 [-1, 512, 28, 28] 65,536
BatchNorm2d-55 [-1, 512, 28, 28] 1,024
AvgPool2d-56 [-1, 256, 28, 28] 0
Conv2d-57 [-1, 512, 28, 28] 131,072
BatchNorm2d-58 [-1, 512, 28, 28] 1,024
ReLU-59 [-1, 512, 28, 28] 0
Bottleneck-60 [-1, 512, 28, 28] 0
Conv2d-61 [-1, 128, 28, 28] 65,536
BatchNorm2d-62 [-1, 128, 28, 28] 256
ReLU-63 [-1, 128, 28, 28] 0
Conv2d-64 [-1, 128, 28, 28] 147,456
BatchNorm2d-65 [-1, 128, 28, 28] 256
ReLU-66 [-1, 128, 28, 28] 0
Identity-67 [-1, 128, 28, 28] 0
Conv2d-68 [-1, 512, 28, 28] 65,536
BatchNorm2d-69 [-1, 512, 28, 28] 1,024
ReLU-70 [-1, 512, 28, 28] 0
Bottleneck-71 [-1, 512, 28, 28] 0
Conv2d-72 [-1, 128, 28, 28] 65,536
BatchNorm2d-73 [-1, 128, 28, 28] 256
ReLU-74 [-1, 128, 28, 28] 0
Conv2d-75 [-1, 128, 28, 28] 147,456
BatchNorm2d-76 [-1, 128, 28, 28] 256
ReLU-77 [-1, 128, 28, 28] 0
Identity-78 [-1, 128, 28, 28] 0
Conv2d-79 [-1, 512, 28, 28] 65,536
BatchNorm2d-80 [-1, 512, 28, 28] 1,024
ReLU-81 [-1, 512, 28, 28] 0
Bottleneck-82 [-1, 512, 28, 28] 0
Conv2d-83 [-1, 128, 28, 28] 65,536
BatchNorm2d-84 [-1, 128, 28, 28] 256
ReLU-85 [-1, 128, 28, 28] 0
Conv2d-86 [-1, 128, 28, 28] 147,456
BatchNorm2d-87 [-1, 128, 28, 28] 256
ReLU-88 [-1, 128, 28, 28] 0
Identity-89 [-1, 128, 28, 28] 0
Conv2d-90 [-1, 512, 28, 28] 65,536
BatchNorm2d-91 [-1, 512, 28, 28] 1,024
ReLU-92 [-1, 512, 28, 28] 0
Bottleneck-93 [-1, 512, 28, 28] 0
Conv2d-94 [-1, 256, 28, 28] 131,072
BatchNorm2d-95 [-1, 256, 28, 28] 512
ReLU-96 [-1, 256, 28, 28] 0
Conv2d-97 [-1, 256, 28, 28] 589,824
BatchNorm2d-98 [-1, 256, 28, 28] 512
ReLU-99 [-1, 256, 28, 28] 0
AvgPool2d-100 [-1, 256, 14, 14] 0
Conv2d-101 [-1, 1024, 14, 14] 262,144
BatchNorm2d-102 [-1, 1024, 14, 14] 2,048
AvgPool2d-103 [-1, 512, 14, 14] 0
Conv2d-104 [-1, 1024, 14, 14] 524,288
BatchNorm2d-105 [-1, 1024, 14, 14] 2,048
ReLU-106 [-1, 1024, 14, 14] 0
Bottleneck-107 [-1, 1024, 14, 14] 0
Conv2d-108 [-1, 256, 14, 14] 262,144
BatchNorm2d-109 [-1, 256, 14, 14] 512
ReLU-110 [-1, 256, 14, 14] 0
Conv2d-111 [-1, 256, 14, 14] 589,824
BatchNorm2d-112 [-1, 256, 14, 14] 512
ReLU-113 [-1, 256, 14, 14] 0
Identity-114 [-1, 256, 14, 14] 0
Conv2d-115 [-1, 1024, 14, 14] 262,144
BatchNorm2d-116 [-1, 1024, 14, 14] 2,048
ReLU-117 [-1, 1024, 14, 14] 0
Bottleneck-118 [-1, 1024, 14, 14] 0
Conv2d-119 [-1, 256, 14, 14] 262,144
BatchNorm2d-120 [-1, 256, 14, 14] 512
ReLU-121 [-1, 256, 14, 14] 0
Conv2d-122 [-1, 256, 14, 14] 589,824
BatchNorm2d-123 [-1, 256, 14, 14] 512
ReLU-124 [-1, 256, 14, 14] 0
Identity-125 [-1, 256, 14, 14] 0
Conv2d-126 [-1, 1024, 14, 14] 262,144
BatchNorm2d-127 [-1, 1024, 14, 14] 2,048
ReLU-128 [-1, 1024, 14, 14] 0
Bottleneck-129 [-1, 1024, 14, 14] 0
Conv2d-130 [-1, 256, 14, 14] 262,144
BatchNorm2d-131 [-1, 256, 14, 14] 512
ReLU-132 [-1, 256, 14, 14] 0
Conv2d-133 [-1, 256, 14, 14] 589,824
BatchNorm2d-134 [-1, 256, 14, 14] 512
ReLU-135 [-1, 256, 14, 14] 0
Identity-136 [-1, 256, 14, 14] 0
Conv2d-137 [-1, 1024, 14, 14] 262,144
BatchNorm2d-138 [-1, 1024, 14, 14] 2,048
ReLU-139 [-1, 1024, 14, 14] 0
Bottleneck-140 [-1, 1024, 14, 14] 0
Conv2d-141 [-1, 256, 14, 14] 262,144
BatchNorm2d-142 [-1, 256, 14, 14] 512
ReLU-143 [-1, 256, 14, 14] 0
Conv2d-144 [-1, 256, 14, 14] 589,824
BatchNorm2d-145 [-1, 256, 14, 14] 512
ReLU-146 [-1, 256, 14, 14] 0
Identity-147 [-1, 256, 14, 14] 0
Conv2d-148 [-1, 1024, 14, 14] 262,144
BatchNorm2d-149 [-1, 1024, 14, 14] 2,048
ReLU-150 [-1, 1024, 14, 14] 0
Bottleneck-151 [-1, 1024, 14, 14] 0
Conv2d-152 [-1, 256, 14, 14] 262,144
BatchNorm2d-153 [-1, 256, 14, 14] 512
ReLU-154 [-1, 256, 14, 14] 0
Conv2d-155 [-1, 256, 14, 14] 589,824
BatchNorm2d-156 [-1, 256, 14, 14] 512
ReLU-157 [-1, 256, 14, 14] 0
Identity-158 [-1, 256, 14, 14] 0
Conv2d-159 [-1, 1024, 14, 14] 262,144
BatchNorm2d-160 [-1, 1024, 14, 14] 2,048
ReLU-161 [-1, 1024, 14, 14] 0
Bottleneck-162 [-1, 1024, 14, 14] 0
Conv2d-163 [-1, 512, 14, 14] 524,288
BatchNorm2d-164 [-1, 512, 14, 14] 1,024
ReLU-165 [-1, 512, 14, 14] 0
Conv2d-166 [-1, 512, 14, 14] 2,359,296
BatchNorm2d-167 [-1, 512, 14, 14] 1,024
ReLU-168 [-1, 512, 14, 14] 0
AvgPool2d-169 [-1, 512, 7, 7] 0
Conv2d-170 [-1, 2048, 7, 7] 1,048,576
BatchNorm2d-171 [-1, 2048, 7, 7] 4,096
AvgPool2d-172 [-1, 1024, 7, 7] 0
Conv2d-173 [-1, 2048, 7, 7] 2,097,152
BatchNorm2d-174 [-1, 2048, 7, 7] 4,096
ReLU-175 [-1, 2048, 7, 7] 0
Bottleneck-176 [-1, 2048, 7, 7] 0
Conv2d-177 [-1, 512, 7, 7] 1,048,576
BatchNorm2d-178 [-1, 512, 7, 7] 1,024
ReLU-179 [-1, 512, 7, 7] 0
Conv2d-180 [-1, 512, 7, 7] 2,359,296
BatchNorm2d-181 [-1, 512, 7, 7] 1,024
ReLU-182 [-1, 512, 7, 7] 0
Identity-183 [-1, 512, 7, 7] 0
Conv2d-184 [-1, 2048, 7, 7] 1,048,576
BatchNorm2d-185 [-1, 2048, 7, 7] 4,096
ReLU-186 [-1, 2048, 7, 7] 0
Bottleneck-187 [-1, 2048, 7, 7] 0
Conv2d-188 [-1, 512, 7, 7] 1,048,576
BatchNorm2d-189 [-1, 512, 7, 7] 1,024
ReLU-190 [-1, 512, 7, 7] 0
Conv2d-191 [-1, 512, 7, 7] 2,359,296
BatchNorm2d-192 [-1, 512, 7, 7] 1,024
ReLU-193 [-1, 512, 7, 7] 0
Identity-194 [-1, 512, 7, 7] 0
Conv2d-195 [-1, 2048, 7, 7] 1,048,576
BatchNorm2d-196 [-1, 2048, 7, 7] 4,096
ReLU-197 [-1, 2048, 7, 7] 0
Bottleneck-198 [-1, 2048, 7, 7] 0
AttentionPool2d-199 [-1, 1024] 0
================================================================
Printout of the text_encoder parameters:
positional_embedding torch.Size([77, 512])
text_projection torch.Size([512, 1024])
logit_scale torch.Size([])
token_embedding.weight torch.Size([49408, 512])
ln_final.weight torch.Size([512])
ln_final.bias torch.Size([512])
Models Supported by CLIP
'RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'
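The same list can be queried at runtime:

```python
import clip

print(clip.available_models())
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64',
#  'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']
```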
Performance comparison across the models:

CLIP Model Download Links
Model | Download URL |
---|---|
RN50 | https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt |
RN101 | https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt |
RN50X4 | https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt |
RN50X16 | https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt |
RN50X64 | https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt |
ViT-B/32 | https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt |
ViT-B/16 | https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt |
ViT-L/14 | https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt |
ViT-L/14@336px | https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt |
Parameter Comparison Across Model Variants
(mparams and gflops are total parameters in millions and forward-pass GFLOPs; the image_ and text_ columns split these totals between the two encoders; the model names follow open_clip's naming.)
model | image_size | image_width | text_width | embed_dim | mparams | image_mparams | text_mparams | gflops | image_gflops | text_gflops |
---|---|---|---|---|---|---|---|---|---|---|
ViT-S-32-alt | 224 | 384 | 256 | 256 | 43.22 | 22.59 | 20.63 | 3.56 | 2.29 | 1.27 |
ViT-S-32 | 224 | 384 | 384 | 384 | 63.09 | 22.64 | 40.44 | 5.66 | 2.29 | 3.38 |
ViT-M-32-alt | 224 | 512 | 384 | 384 | 80.07 | 39.63 | 40.44 | 7.37 | 3.99 | 3.38 |
ViT-M-32 | 224 | 512 | 512 | 512 | 103.12 | 39.69 | 63.43 | 9.95 | 3.99 | 5.96 |
ViT-S-16-alt | 224 | 384 | 256 | 256 | 42.4 | 21.76 | 20.63 | 10.47 | 9.2 | 1.27 |
ViT-S-16 | 224 | 384 | 384 | 384 | 62.26 | 21.81 | 40.44 | 12.58 | 9.2 | 3.38 |
ViT-B-32 | 224 | 768 | 512 | 512 | 151.28 | 87.85 | 63.43 | 14.78 | 8.82 | 5.96 |
ViT-B-32-quickgelu | 224 | 768 | 512 | 512 | 151.28 | 87.85 | 63.43 | 14.78 | 8.82 | 5.96 |
convnext_tiny | 224 | 768 | 512 | 1024 | 92.3 | 28.61 | 63.69 | 14.87 | 8.91 | 5.96 |
ViT-B-32-256 | 256 | 768 | 512 | 512 | 151.29 | 87.86 | 63.43 | 17.46 | 11.5 | 5.96 |
RN50 | 224 | 64 | 512 | 1024 | 102.01 | 38.32 | 63.69 | 18.18 | 12.22 | 5.96 |
RN50-quickgelu | 224 | 64 | 512 | 1024 | 102.01 | 38.32 | 63.69 | 18.18 | 12.22 | 5.96 |
ViT-M-16-alt | 224 | 512 | 384 | 384 | 78.98 | 38.53 | 40.44 | 19.36 | 15.98 | 3.38 |
ViT-M-16 | 224 | 512 | 512 | 512 | 102.02 | 38.59 | 63.43 | 21.94 | 15.98 | 5.96 |
vit_relpos_medium_patch16_cls_224 | 224 | 768 | 512 | 512 | 101.94 | 38.51 | 63.43 | 21.99 | 16.03 | 5.96 |
mt5-base-ViT-B-32 | 224 | 768 | 512 | 512 | 365.71 | 87.85 | 277.86 | 22.12 | 8.82 | 13.3 |
convnext_small | 224 | 768 | 512 | 512 | 113.28 | 49.85 | 63.43 | 23.33 | 17.37 | 5.96 |
ViT-B-32-plus-256 | 256 | 896 | 640 | 640 | 210.3 | 119.13 | 91.16 | 24.83 | 15.56 | 9.27 |
RN101 | 224 | 64 | 512 | 512 | 119.69 | 56.26 | 63.43 | 25.5 | 19.54 | 5.96 |
RN101-quickgelu | 224 | 64 | 512 | 512 | 119.69 | 56.26 | 63.43 | 25.5 | 19.54 | 5.96 |
vit_medium_patch16_gap_256 | 256 | 768 | 512 | 512 | 102.04 | 38.61 | 63.43 | 27.1 | 21.14 | 5.96 |
coca_ViT-B-32 | 224 | 768 | 512 | 512 | 253.56 | 89.16 | 63.43 | 33.34 | 9.19 | 5.96 |
convnext_base | 224 | 768 | 512 | 512 | 151.52 | 88.09 | 63.43 | 36.67 | 30.71 | 5.96 |
swin_base_patch4_window7_224 | 224 | 768 | 640 | 640 | 178.56 | 87.4 | 91.16 | 40.13 | 30.86 | 9.27 |
ViT-B-16 | 224 | 768 | 512 | 512 | 149.62 | 86.19 | 63.43 | 41.09 | 35.13 | 5.96 |
ViT-B-16-quickgelu | 224 | 768 | 512 | 512 | 149.62 | 86.19 | 63.43 | 41.09 | 35.13 | 5.96 |
EVA02-B-16 | 224 | 768 | 512 | 512 | 149.69 | 86.26 | 63.43 | 41.09 | 35.13 | 5.96 |
ViT-B-16-SigLIP | 224 | 768 | 768 | 768 | 203.16 | 92.88 | 110.27 | 46.44 | 35.42 | 11.02 |
convnext_base_w | 256 | 768 | 640 | 640 | 179.39 | 88.22 | 91.16 | 49.38 | 40.11 | 9.27 |
RN50x4 | 288 | 80 | 640 | 640 | 178.3 | 87.14 | 91.16 | 51.82 | 42.56 | 9.27 |
coca_roberta-ViT-B-32 | 224 | 768 | 768 | 512 | 420.37 | 87.85 | 124.45 | 53.12 | 8.82 | 13.12 |
ViT-B-16-plus | 224 | 896 | 640 | 640 | 208.35 | 117.19 | 91.16 | 56.75 | 47.49 | 9.27 |
ViT-B-16-SigLIP-256 | 256 | 768 | 768 | 768 | 203.2 | 92.93 | 110.27 | 57.84 | 46.82 | 11.02 |
ViT-B-16-SigLIP-i18n-256 | 256 | 768 | 768 | 768 | 370.63 | 92.93 | 277.7 | 57.84 | 46.82 | 11.02 |
ViT-B-16-plus-240 | 240 | 896 | 640 | 640 | 208.38 | 117.21 | 91.16 | 64.03 | 54.76 | 9.27 |
convnext_base_w_320 | 320 | 768 | 640 | 640 | 179.39 | 88.22 | 91.16 | 71.94 | 62.67 | 9.27 |
convnext_large | 224 | 768 | 768 | 768 | 321.06 | 197.41 | 123.65 | 82.02 | 68.72 | 13.3 |
coca_base | 288 | 768 | 768 | 512 | 440.34 | 86.4 | 134.66 | 99.09 | 46.47 | 13.3 |
roberta-ViT-B-32 | 224 | 768 | 512 | 512 | 212.72 | 87.85 | 124.87 | 105.87 | 8.82 | 97.05 |
xlm-roberta-base-ViT-B-32 | 224 | 768 | 512 | 512 | 366.12 | 87.85 | 278.27 | 105.87 | 8.82 | 97.05 |
convnext_large_d | 256 | 768 | 768 | 768 | 351.77 | 199.77 | 152.0 | 107.5 | 89.76 | 17.73 |
ViT-B-16-SigLIP-384 | 384 | 768 | 768 | 768 | 203.45 | 93.18 | 110.27 | 123.15 | 112.13 | 11.02 |
ViT-L-16 | 224 | 1024 | 768 | 768 | 427.74 | 304.09 | 123.65 | 136.41 | 123.11 | 13.3 |
convnext_large_d_320 | 320 | 768 | 768 | 768 | 351.77 | 199.77 | 152.0 | 157.98 | 140.25 | 17.73 |
RN50x16 | 384 | 96 | 768 | 768 | 290.98 | 167.33 | 123.65 | 162.69 | 149.39 | 13.3 |
ViT-L-14-CLIPA | 224 | 1024 | 768 | 768 | 414.21 | 303.96 | 110.25 | 167.5 | 162.03 | 5.47 |
EVA02-L-14 | 224 | 768 | 768 | 768 | 427.76 | 304.11 | 123.65 | 175.3 | 162.0 | 13.3 |
ViT-L-14 | 224 | 1024 | 768 | 768 | 427.62 | 303.97 | 123.65 | 175.33 | 162.03 | 13.3 |
ViT-L-14-quickgelu | 224 | 1024 | 768 | 768 | 427.62 | 303.97 | 123.65 | 175.33 | 162.03 | 13.3 |
convnext_xlarge | 256 | 768 | 1024 | 1024 | 653.89 | 350.25 | 303.65 | 198.38 | 159.14 | 39.24 |
ViT-L-16-SigLIP-256 | 256 | 768 | 1024 | 1024 | 652.15 | 315.96 | 336.19 | 201.62 | 162.56 | 39.06 |
coca_ViT-L-14 | 224 | 1024 | 768 | 768 | 638.45 | 306.72 | 123.65 | 214.52 | 163.64 | 13.3 |
ViT-B-16-SigLIP-512 | 512 | 768 | 768 | 768 | 203.79 | 93.52 | 110.27 | 227.26 | 216.24 | 11.02 |
ViT-SO400M-14-SigLIP | 224 | 768 | 1152 | 1152 | 877.36 | 427.68 | 449.68 | 233.54 | 220.35 | 13.19 |
ViT-L-14-280 | 280 | 1024 | 768 | 768 | 427.76 | 304.11 | 123.65 | 271.79 | 258.49 | 13.3 |
ViT-L-16-320 | 320 | 1024 | 768 | 768 | 427.95 | 304.3 | 123.65 | 271.93 | 258.63 | 13.3 |
ViT-H-16 | 224 | 1280 | 1024 | 1024 | 986.26 | 632.23 | 354.03 | 301.72 | 254.63 | 47.09 |
ViT-H-14-CLIPA | 224 | 1280 | 1024 | 1024 | 968.24 | 632.07 | 336.16 | 354.02 | 334.59 | 19.43 |
nllb-clip-base | 224 | 768 | 512 | 512 | 501.89 | 87.85 | 414.04 | 369.6 | 8.82 | 360.78 |
ViT-H-14 | 224 | 1280 | 1024 | 1024 | 986.11 | 632.08 | 354.03 | 381.68 | 334.59 | 47.09 |
ViT-H-14-quickgelu | 224 | 1280 | 1024 | 1024 | 986.11 | 632.08 | 354.03 | 381.68 | 334.59 | 47.09 |
ViT-L-14-CLIPA-336 | 336 | 1024 | 768 | 768 | 414.54 | 304.29 | 110.25 | 387.39 | 381.92 | 5.47 |
EVA02-L-14-336 | 336 | 768 | 768 | 768 | 428.08 | 304.43 | 123.65 | 395.16 | 381.86 | 13.3 |
ViT-L-14-336 | 336 | 1024 | 768 | 768 | 427.94 | 304.29 | 123.65 | 395.22 | 381.92 | 13.3 |
ViT-L-16-SigLIP-384 | 384 | 768 | 1024 | 1024 | 652.48 | 316.28 | 336.19 | 422.91 | 383.85 | 39.06 |
convnext_xxlarge | 256 | 768 | 1024 | 1024 | 1200.58 | 846.54 | 354.03 | 443.03 | 395.94 | 47.09 |
nllb-clip-base-siglip | 384 | 768 | 512 | 768 | 507.47 | 93.18 | 414.3 | 472.91 | 112.13 | 360.78 |
mt5-xl-ViT-H-14 | 224 | 1280 | 512 | 1024 | 2306.75 | 632.08 | 1674.68 | 514.04 | 334.59 | 179.45 |
EVA01-g-14 | 224 | 768 | 768 | 1024 | 1136.44 | 1012.59 | 123.85 | 547.36 | 534.06 | 13.3 |
RN50x64 | 448 | 128 | 1024 | 1024 | 623.26 | 420.38 | 202.88 | 552.65 | 529.11 | 23.55 |
EVA01-g-14-plus | 224 | 768 | 1024 | 1024 | 1366.62 | 1012.59 | 354.03 | 581.15 | 534.06 | 47.09 |
ViT-g-14 | 224 | 1408 | 1024 | 1024 | 1366.68 | 1012.65 | 354.03 | 581.15 | 534.06 | 47.09 |
convnext_xxlarge_320 | 320 | 768 | 1024 | 1024 | 1200.58 | 846.54 | 354.03 | 665.74 | 618.65 | 47.09 |
xlm-roberta-large-ViT-H-14 | 224 | 1280 | 512 | 1024 | 1193.01 | 632.08 | 560.94 | 671.01 | 334.59 | 336.42 |
ViT-SO400M-14-SigLIP-384 | 384 | 768 | 1152 | 1152 | 877.96 | 428.23 | 449.73 | 723.48 | 670.35 | 53.13 |
ViT-H-14-CLIPA-336 | 336 | 1280 | 1024 | 1024 | 968.64 | 632.48 | 336.16 | 800.88 | 781.45 | 19.43 |
ViT-bigG-14-CLIPA | 224 | 1664 | 1280 | 1280 | 2517.22 | 1844.9 | 672.32 | 1007.93 | 967.5 | 40.44 |
ViT-H-14-378-quickgelu | 378 | 1280 | 1024 | 1024 | 986.71 | 632.68 | 354.03 | 1054.05 | 1006.96 | 47.09 |
ViT-bigG-14 | 224 | 1664 | 1280 | 1280 | 2539.57 | 1844.91 | 694.66 | 1065.36 | 967.5 | 97.86 |
nllb-clip-large | 224 | 1280 | 512 | 1024 | 1399.22 | 632.08 | 767.14 | 1468.46 | 334.59 | 1133.87 |
nllb-clip-large-siglip | 384 | 768 | 512 | 1152 | 1195.5 | 428.23 | 767.27 | 1804.22 | 670.35 | 1133.87 |
ViT-e-14 | 224 | 1792 | 1280 | 1280 | 4581.09 | 3807.72 | 773.37 | 2091.45 | 1981.35 | 110.1 |
ViT-bigG-14-CLIPA-336 | 336 | 1664 | 1280 | 1280 | 2517.76 | 1845.44 | 672.32 | 2271.58 | 2231.15 | 40.44 |
EVA02-E-14 | 224 | 768 | 1024 | 1024 | 4704.59 | 4350.56 | 354.03 | 2311.42 | 2264.33 | 47.09 |
EVA02-E-14-plus | 224 | 768 | 1280 | 1024 | 5044.89 | 4350.56 | 694.33 | 2362.19 | 2264.33 | 97.86 |