CLIP模型

简介

CLIP,全称 Contrastive Language-Image Pre-training,是 OpenAI 于 2021 年提出的多模态预训练模型。它在 4 亿(400M)个图像-文本对上通过对比学习进行训练,具备强大的 zero-shot 迁移能力。

论文地址:https://arxiv.org/pdf/2103.00020.pdf
代码地址:https://github.com/openai/CLIP

模型结构图可参见论文中的 Figure 1(展示对比预训练与 zero-shot 推理两个阶段,原图略)。

CLIP代码结构

论文中给出的训练伪代码及逐行注释如下:

# image_encoder    图像编码器:ResNet或者ViT
# text_encoder     文本编码器:CBOW或者Text Transformer
# I[n,h,w,c]       图像输入大小: 比如 [16, 224, 224, 3]
# T[n,l]           文本输入大小:n表示batch size,l表示序列长度
# W_i[d_i, d_e]    图像投射矩阵:将图像特征从单模态空间映射到多模态嵌入空间
# W_t[d_t, d_e]    文本投射矩阵:将文本特征从单模态空间映射到多模态嵌入空间
# t                可学习的温度系数

# 分别提取每个模态的特征
I_f = image_encoder(I)  # 输出大小 [n, d_i]
T_f = text_encoder(T)   # 输出大小 [n, d_t]

# 合并多模态特征
I_e = l2_normalize(np.dot(I_f, W_i), axis=1)  # 输出大小 [n, d_e]
T_e = l2_normalize(np.dot(T_f, W_t), axis=1)  # 输出大小 [n, d_e]

# 计算缩放后的余弦相似度矩阵:I_e 与 T_e 转置的点积。
# t 是可学习的温度参数(这里学习的是其对数),np.exp(t) 将其还原为缩放因子,
# 用于放大相似度差异、控制 softmax 分布的尖锐程度。
logits = np.dot(I_e, T_e.T) * np.exp(t)  # 输出大小 [n, n]

# 计算损失函数
labels = np.arange(n)
loss_i = cross_entropy_loss(logits, labels, axis=0)  # 针对图像的交叉熵损失
loss_t = cross_entropy_loss(logits, labels, axis=1)  # 针对文本的交叉熵损失
loss = (loss_i + loss_t) / 2  # 综合图像和文本的损失
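
上述伪代码可以直接落地为一个简化的 PyTorch 实现。下面是一个最小示意(编码器输出用随机特征代替,n、d_e 等数值均为假设):

import torch
import torch.nn.functional as F

def clip_contrastive_loss(image_features, text_features, logit_scale):
    """对称对比损失:image_features、text_features 形状均为 [n, d_e]"""
    # L2 归一化,对应伪代码中的 l2_normalize
    image_features = F.normalize(image_features, dim=-1)
    text_features = F.normalize(text_features, dim=-1)

    # 缩放后的相似度矩阵 [n, n]
    logits_per_image = logit_scale.exp() * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()

    # 第 i 张图与第 i 条文本互为正样本,标签即对角线位置
    labels = torch.arange(image_features.shape[0], device=image_features.device)
    loss_i = F.cross_entropy(logits_per_image, labels)  # 图像 -> 文本方向
    loss_t = F.cross_entropy(logits_per_text, labels)   # 文本 -> 图像方向
    return (loss_i + loss_t) / 2

# 用随机特征模拟一个 batch:n=16, d_e=512(假设值)
I_f = torch.randn(16, 512)
T_f = torch.randn(16, 512)
logit_scale = torch.nn.Parameter(torch.log(torch.tensor(1 / 0.07)))  # CLIP 的初始化方式
print(clip_contrastive_loss(I_f, T_f, logit_scale))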

对温度系数 t 的理解:
图像与文本特征经过 L2 归一化后,余弦相似度被限制在 [-1, 1] 区间内,正、负样本对之间的分数差距往往很小,直接送入 softmax 难以拉开距离;除以一个较小的温度系数(CLIP 的初始值为 0.07)相当于把相似度差异放大,使 softmax 分布更尖锐,从而更好地区分正负样本对。

温度系数来源于 InfoNCE 损失函数(以 1 个正样本 $k_i^{+}$、$K$ 个负样本 $k_j^{-}$ 为例):

$$
\mathcal{L}_{\text{InfoNCE}} = -\frac{1}{N}\sum_{i=1}^{N}\log\frac{\exp\left(q_i\cdot k_i^{+}/\tau\right)}{\exp\left(q_i\cdot k_i^{+}/\tau\right)+\sum_{j=1}^{K}\exp\left(q_i\cdot k_j^{-}/\tau\right)}
$$

下面以 MoCo 风格的实现为例(q、k 为一批 query 与正样本特征,self.queue 为负样本特征队列),展示温度系数在代码中的用法:

# 温度系数
self.T = 0.07

# 计算相似度
# positive logits: Nx1
s_pos = torch.sum(q*k, dim=1).unsqueeze(dim=1)
# negative logits: NxK
s_neg = torch.matmul(q, self.queue.clone().detach().T)

# 拼接相似度 logits: Nx(1+K)
logits = torch.cat([s_pos, s_neg], dim=1)
logits /= self.T

# 创建标签
labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()

# 计算InfoNCE损失
loss = F.cross_entropy(logits, labels)
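
温度系数对 softmax 分布的影响可以用一个小例子直观感受(相似度数值为随意假设):

import torch
import torch.nn.functional as F

# 正样本相似度 0.6,两个负样本 0.5、0.4,差距很小
sims = torch.tensor([0.6, 0.5, 0.4])

print(F.softmax(sims, dim=0))         # τ=1:约 [0.367, 0.332, 0.301],几乎分不开
print(F.softmax(sims / 0.07, dim=0))  # τ=0.07:约 [0.771, 0.185, 0.044],正样本被显著放大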

CLIP前向架构

    def forward(self, image, text):
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # normalized features
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text
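
forward 返回的两个 logits 矩阵可以直接用于 zero-shot 分类。下面是官方仓库 README 中的用法示例(图片路径仅作示意):

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # 三段文本分别作为候选类别的概率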

CLIP图像编码模块

图像预处理

from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode

BICUBIC = InterpolationMode.BICUBIC

def _convert_image_to_rgb(image):
    return image.convert("RGB")

def _transform(input_resolution):
    return Compose([
        Resize(input_resolution, interpolation=BICUBIC),
        CenterCrop(input_resolution),
        _convert_image_to_rgb,
        ToTensor(),
        # CLIP 官方使用的归一化均值与标准差
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
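
得到的预处理函数可以直接作用在 PIL 图片上(一个示意,图片路径为假设):

from PIL import Image

preprocess = _transform(224)  # ViT-B/32、RN50 等模型的输入分辨率为 224
image_tensor = preprocess(Image.open("cat.jpg")).unsqueeze(0)  # [1, 3, 224, 224]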

encode_image

该模块提供了两种编码的backbone,分别是经过修改的ResNet和ViT。
ModifiedResNet主干:

class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool2d(2)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            x = self.relu1(self.bn1(self.conv1(x)))
            x = self.relu2(self.bn2(self.conv2(x)))
            x = self.relu3(self.bn3(self.conv3(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)

        return x
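
以 RN50 的配置为例(layers=(3, 4, 6, 3)、width=64、embed_dim=1024、heads=32),可以这样实例化并检查输出维度(示意代码,依赖官方仓库中的 Bottleneck 与 AttentionPool2d 定义):

import torch

visual = ModifiedResNet(layers=(3, 4, 6, 3), output_dim=1024, heads=32,
                        input_resolution=224, width=64)
x = torch.randn(2, 3, 224, 224)
print(visual(x).shape)  # torch.Size([2, 1024]),与后文 RN50 参数表中 AttentionPool2d 的输出一致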

Transformer主干:

class VisionTransformer(nn.Module):
    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x
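
以 ViT-B/32 的配置为例(patch_size=32、width=768、layers=12、heads=12、output_dim=512),示意用法如下(同样依赖官方仓库中的 LayerNorm 与 Transformer 定义):

import torch

visual = VisionTransformer(input_resolution=224, patch_size=32, width=768,
                           layers=12, heads=12, output_dim=512)
x = torch.randn(2, 3, 224, 224)
print(visual(x).shape)  # torch.Size([2, 512]),即投射到多模态嵌入空间后的图像特征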

CLIP文本编码模块

def encode_text(self, text):
    x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]
    x = x + self.positional_embedding.type(self.dtype)
    x = x.permute(1, 0, 2)  # NLD -> LND
    x = self.transformer(x)
    x = x.permute(1, 0, 2)  # LND -> NLD
    x = self.ln_final(x).type(self.dtype)

    # x.shape = [batch_size, n_ctx, transformer.width]
    # take features from the eot embedding (eot_token is the highest number in each sequence)
    x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

    return x
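
注释中"取 EOT 位置的特征"之所以能用 text.argmax(dim=-1) 实现,是因为 clip.tokenize 生成的序列中 EOT(end-of-text)的 token id(49407)是整个词表中最大的。一个小示例:

import clip

text = clip.tokenize(["a photo of a cat", "a photo of a dog"])  # 形状 [2, 77]
print(text.argmax(dim=-1))        # 每条文本中 EOT token 所在的位置
print(text[0, text[0].argmax()])  # EOT 的 token id,即 49407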

CLIP模型参数
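
下文的参数统计形如 torchsummary 的输出,可以用类似下面的方式自行打印(示意代码,假设已安装 torchsummary,并把模型以 float32 放在 CPU 上):

import clip
from torchsummary import summary

model, _ = clip.load("ViT-B/32", device="cpu")
summary(model.visual.float(), input_size=(3, 224, 224), device="cpu")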

ViT-B-32模型image_encoder模块参数量:

Total params: 59,068,416
Trainable params: 59,068,416
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 349.28
Params size (MB): 225.33
Estimated Total Size (MB): 575.18
----------------------------------------------------------------
        Layer (type)               Output Shape         Param # 
================================================================
            Conv2d-1            [-1, 768, 7, 7]       2,359,296
         LayerNorm-2              [-1, 50, 768]           1,536
         LayerNorm-3               [-1, 2, 768]           1,536
MultiheadAttention-4  [[-1, 2, 768], [-1, 50, 50]]               0
         LayerNorm-5               [-1, 2, 768]           1,536
            Linear-6              [-1, 2, 3072]       2,362,368
         QuickGELU-7              [-1, 2, 3072]               0
            Linear-8               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-9               [-1, 2, 768]               0
        LayerNorm-10               [-1, 2, 768]           1,536
MultiheadAttention-11  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-12               [-1, 2, 768]           1,536
           Linear-13              [-1, 2, 3072]       2,362,368
        QuickGELU-14              [-1, 2, 3072]               0
           Linear-15               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-16               [-1, 2, 768]               0
        LayerNorm-17               [-1, 2, 768]           1,536
MultiheadAttention-18  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-19               [-1, 2, 768]           1,536
           Linear-20              [-1, 2, 3072]       2,362,368
        QuickGELU-21              [-1, 2, 3072]               0
           Linear-22               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-23               [-1, 2, 768]               0
        LayerNorm-24               [-1, 2, 768]           1,536
MultiheadAttention-25  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-26               [-1, 2, 768]           1,536
           Linear-27              [-1, 2, 3072]       2,362,368
        QuickGELU-28              [-1, 2, 3072]               0
           Linear-29               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-30               [-1, 2, 768]               0
        LayerNorm-31               [-1, 2, 768]           1,536
MultiheadAttention-32  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-33               [-1, 2, 768]           1,536
           Linear-34              [-1, 2, 3072]       2,362,368
        QuickGELU-35              [-1, 2, 3072]               0
           Linear-36               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-37               [-1, 2, 768]               0
        LayerNorm-38               [-1, 2, 768]           1,536
MultiheadAttention-39  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-40               [-1, 2, 768]           1,536
           Linear-41              [-1, 2, 3072]       2,362,368
        QuickGELU-42              [-1, 2, 3072]               0
           Linear-43               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-44               [-1, 2, 768]               0
        LayerNorm-45               [-1, 2, 768]           1,536
MultiheadAttention-46  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-47               [-1, 2, 768]           1,536
           Linear-48              [-1, 2, 3072]       2,362,368
        QuickGELU-49              [-1, 2, 3072]               0
           Linear-50               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-51               [-1, 2, 768]               0
        LayerNorm-52               [-1, 2, 768]           1,536
MultiheadAttention-53  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-54               [-1, 2, 768]           1,536
           Linear-55              [-1, 2, 3072]       2,362,368
        QuickGELU-56              [-1, 2, 3072]               0
           Linear-57               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-58               [-1, 2, 768]               0
        LayerNorm-59               [-1, 2, 768]           1,536
MultiheadAttention-60  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-61               [-1, 2, 768]           1,536
           Linear-62              [-1, 2, 3072]       2,362,368
        QuickGELU-63              [-1, 2, 3072]               0
           Linear-64               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-65               [-1, 2, 768]               0
        LayerNorm-66               [-1, 2, 768]           1,536
MultiheadAttention-67  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-68               [-1, 2, 768]           1,536
           Linear-69              [-1, 2, 3072]       2,362,368
        QuickGELU-70              [-1, 2, 3072]               0
           Linear-71               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-72               [-1, 2, 768]               0
        LayerNorm-73               [-1, 2, 768]           1,536
MultiheadAttention-74  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-75               [-1, 2, 768]           1,536
           Linear-76              [-1, 2, 3072]       2,362,368
        QuickGELU-77              [-1, 2, 3072]               0
           Linear-78               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-79               [-1, 2, 768]               0
        LayerNorm-80               [-1, 2, 768]           1,536
MultiheadAttention-81  [[-1, 2, 768], [-1, 50, 50]]               0
        LayerNorm-82               [-1, 2, 768]           1,536
           Linear-83              [-1, 2, 3072]       2,362,368
        QuickGELU-84              [-1, 2, 3072]               0
           Linear-85               [-1, 2, 768]       2,360,064
ResidualAttentionBlock-86               [-1, 2, 768]               0
      Transformer-87               [-1, 2, 768]               0
        LayerNorm-88                  [-1, 768]           1,536
================================================================

RN50模型image_encoder模块参数量:

Total params: 23,527,264
Trainable params: 23,527,264
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 331.71
Params size (MB): 89.75
Estimated Total Size (MB): 422.04
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
              ReLU-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]           9,216
       BatchNorm2d-5         [-1, 32, 112, 112]              64
              ReLU-6         [-1, 32, 112, 112]               0
            Conv2d-7         [-1, 64, 112, 112]          18,432
       BatchNorm2d-8         [-1, 64, 112, 112]             128
              ReLU-9         [-1, 64, 112, 112]               0
        AvgPool2d-10           [-1, 64, 56, 56]               0
           Conv2d-11           [-1, 64, 56, 56]           4,096
      BatchNorm2d-12           [-1, 64, 56, 56]             128
             ReLU-13           [-1, 64, 56, 56]               0
           Conv2d-14           [-1, 64, 56, 56]          36,864
      BatchNorm2d-15           [-1, 64, 56, 56]             128
             ReLU-16           [-1, 64, 56, 56]               0
         Identity-17           [-1, 64, 56, 56]               0
           Conv2d-18          [-1, 256, 56, 56]          16,384
      BatchNorm2d-19          [-1, 256, 56, 56]             512
        AvgPool2d-20           [-1, 64, 56, 56]               0
           Conv2d-21          [-1, 256, 56, 56]          16,384
      BatchNorm2d-22          [-1, 256, 56, 56]             512
             ReLU-23          [-1, 256, 56, 56]               0
       Bottleneck-24          [-1, 256, 56, 56]               0
           Conv2d-25           [-1, 64, 56, 56]          16,384
      BatchNorm2d-26           [-1, 64, 56, 56]             128
             ReLU-27           [-1, 64, 56, 56]               0
           Conv2d-28           [-1, 64, 56, 56]          36,864
      BatchNorm2d-29           [-1, 64, 56, 56]             128
             ReLU-30           [-1, 64, 56, 56]               0
         Identity-31           [-1, 64, 56, 56]               0
           Conv2d-32          [-1, 256, 56, 56]          16,384
      BatchNorm2d-33          [-1, 256, 56, 56]             512
             ReLU-34          [-1, 256, 56, 56]               0
       Bottleneck-35          [-1, 256, 56, 56]               0
           Conv2d-36           [-1, 64, 56, 56]          16,384
      BatchNorm2d-37           [-1, 64, 56, 56]             128
             ReLU-38           [-1, 64, 56, 56]               0
           Conv2d-39           [-1, 64, 56, 56]          36,864
      BatchNorm2d-40           [-1, 64, 56, 56]             128
             ReLU-41           [-1, 64, 56, 56]               0
         Identity-42           [-1, 64, 56, 56]               0
           Conv2d-43          [-1, 256, 56, 56]          16,384
      BatchNorm2d-44          [-1, 256, 56, 56]             512
             ReLU-45          [-1, 256, 56, 56]               0
       Bottleneck-46          [-1, 256, 56, 56]               0
           Conv2d-47          [-1, 128, 56, 56]          32,768
      BatchNorm2d-48          [-1, 128, 56, 56]             256
             ReLU-49          [-1, 128, 56, 56]               0
           Conv2d-50          [-1, 128, 56, 56]         147,456
      BatchNorm2d-51          [-1, 128, 56, 56]             256
             ReLU-52          [-1, 128, 56, 56]               0
        AvgPool2d-53          [-1, 128, 28, 28]               0
           Conv2d-54          [-1, 512, 28, 28]          65,536
      BatchNorm2d-55          [-1, 512, 28, 28]           1,024
        AvgPool2d-56          [-1, 256, 28, 28]               0
           Conv2d-57          [-1, 512, 28, 28]         131,072
      BatchNorm2d-58          [-1, 512, 28, 28]           1,024
             ReLU-59          [-1, 512, 28, 28]               0
       Bottleneck-60          [-1, 512, 28, 28]               0
           Conv2d-61          [-1, 128, 28, 28]          65,536
      BatchNorm2d-62          [-1, 128, 28, 28]             256
             ReLU-63          [-1, 128, 28, 28]               0
           Conv2d-64          [-1, 128, 28, 28]         147,456
      BatchNorm2d-65          [-1, 128, 28, 28]             256
             ReLU-66          [-1, 128, 28, 28]               0
         Identity-67          [-1, 128, 28, 28]               0
           Conv2d-68          [-1, 512, 28, 28]          65,536
      BatchNorm2d-69          [-1, 512, 28, 28]           1,024
             ReLU-70          [-1, 512, 28, 28]               0
       Bottleneck-71          [-1, 512, 28, 28]               0
           Conv2d-72          [-1, 128, 28, 28]          65,536
      BatchNorm2d-73          [-1, 128, 28, 28]             256
             ReLU-74          [-1, 128, 28, 28]               0
           Conv2d-75          [-1, 128, 28, 28]         147,456
      BatchNorm2d-76          [-1, 128, 28, 28]             256
             ReLU-77          [-1, 128, 28, 28]               0
         Identity-78          [-1, 128, 28, 28]               0
           Conv2d-79          [-1, 512, 28, 28]          65,536
      BatchNorm2d-80          [-1, 512, 28, 28]           1,024
             ReLU-81          [-1, 512, 28, 28]               0
       Bottleneck-82          [-1, 512, 28, 28]               0
           Conv2d-83          [-1, 128, 28, 28]          65,536
      BatchNorm2d-84          [-1, 128, 28, 28]             256
             ReLU-85          [-1, 128, 28, 28]               0
           Conv2d-86          [-1, 128, 28, 28]         147,456
      BatchNorm2d-87          [-1, 128, 28, 28]             256
             ReLU-88          [-1, 128, 28, 28]               0
         Identity-89          [-1, 128, 28, 28]               0
           Conv2d-90          [-1, 512, 28, 28]          65,536
      BatchNorm2d-91          [-1, 512, 28, 28]           1,024
             ReLU-92          [-1, 512, 28, 28]               0
       Bottleneck-93          [-1, 512, 28, 28]               0
           Conv2d-94          [-1, 256, 28, 28]         131,072
      BatchNorm2d-95          [-1, 256, 28, 28]             512
             ReLU-96          [-1, 256, 28, 28]               0
           Conv2d-97          [-1, 256, 28, 28]         589,824
      BatchNorm2d-98          [-1, 256, 28, 28]             512
             ReLU-99          [-1, 256, 28, 28]               0
       AvgPool2d-100          [-1, 256, 14, 14]               0
          Conv2d-101         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-102         [-1, 1024, 14, 14]           2,048
       AvgPool2d-103          [-1, 512, 14, 14]               0
          Conv2d-104         [-1, 1024, 14, 14]         524,288
     BatchNorm2d-105         [-1, 1024, 14, 14]           2,048
            ReLU-106         [-1, 1024, 14, 14]               0
      Bottleneck-107         [-1, 1024, 14, 14]               0
          Conv2d-108          [-1, 256, 14, 14]         262,144
     BatchNorm2d-109          [-1, 256, 14, 14]             512
            ReLU-110          [-1, 256, 14, 14]               0
          Conv2d-111          [-1, 256, 14, 14]         589,824
     BatchNorm2d-112          [-1, 256, 14, 14]             512
            ReLU-113          [-1, 256, 14, 14]               0
        Identity-114          [-1, 256, 14, 14]               0
          Conv2d-115         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-116         [-1, 1024, 14, 14]           2,048
            ReLU-117         [-1, 1024, 14, 14]               0
      Bottleneck-118         [-1, 1024, 14, 14]               0
          Conv2d-119          [-1, 256, 14, 14]         262,144
     BatchNorm2d-120          [-1, 256, 14, 14]             512
            ReLU-121          [-1, 256, 14, 14]               0
          Conv2d-122          [-1, 256, 14, 14]         589,824
     BatchNorm2d-123          [-1, 256, 14, 14]             512
            ReLU-124          [-1, 256, 14, 14]               0
        Identity-125          [-1, 256, 14, 14]               0
          Conv2d-126         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-127         [-1, 1024, 14, 14]           2,048
            ReLU-128         [-1, 1024, 14, 14]               0
      Bottleneck-129         [-1, 1024, 14, 14]               0
          Conv2d-130          [-1, 256, 14, 14]         262,144
     BatchNorm2d-131          [-1, 256, 14, 14]             512
            ReLU-132          [-1, 256, 14, 14]               0
          Conv2d-133          [-1, 256, 14, 14]         589,824
     BatchNorm2d-134          [-1, 256, 14, 14]             512
            ReLU-135          [-1, 256, 14, 14]               0
        Identity-136          [-1, 256, 14, 14]               0
          Conv2d-137         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-138         [-1, 1024, 14, 14]           2,048
            ReLU-139         [-1, 1024, 14, 14]               0
      Bottleneck-140         [-1, 1024, 14, 14]               0
          Conv2d-141          [-1, 256, 14, 14]         262,144
     BatchNorm2d-142          [-1, 256, 14, 14]             512
            ReLU-143          [-1, 256, 14, 14]               0
          Conv2d-144          [-1, 256, 14, 14]         589,824
     BatchNorm2d-145          [-1, 256, 14, 14]             512
            ReLU-146          [-1, 256, 14, 14]               0
        Identity-147          [-1, 256, 14, 14]               0
          Conv2d-148         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-149         [-1, 1024, 14, 14]           2,048
            ReLU-150         [-1, 1024, 14, 14]               0
      Bottleneck-151         [-1, 1024, 14, 14]               0
          Conv2d-152          [-1, 256, 14, 14]         262,144
     BatchNorm2d-153          [-1, 256, 14, 14]             512
            ReLU-154          [-1, 256, 14, 14]               0
          Conv2d-155          [-1, 256, 14, 14]         589,824
     BatchNorm2d-156          [-1, 256, 14, 14]             512
            ReLU-157          [-1, 256, 14, 14]               0
        Identity-158          [-1, 256, 14, 14]               0
          Conv2d-159         [-1, 1024, 14, 14]         262,144
     BatchNorm2d-160         [-1, 1024, 14, 14]           2,048
            ReLU-161         [-1, 1024, 14, 14]               0
      Bottleneck-162         [-1, 1024, 14, 14]               0
          Conv2d-163          [-1, 512, 14, 14]         524,288
     BatchNorm2d-164          [-1, 512, 14, 14]           1,024
            ReLU-165          [-1, 512, 14, 14]               0
          Conv2d-166          [-1, 512, 14, 14]       2,359,296
     BatchNorm2d-167          [-1, 512, 14, 14]           1,024
            ReLU-168          [-1, 512, 14, 14]               0
       AvgPool2d-169            [-1, 512, 7, 7]               0
          Conv2d-170           [-1, 2048, 7, 7]       1,048,576
     BatchNorm2d-171           [-1, 2048, 7, 7]           4,096
       AvgPool2d-172           [-1, 1024, 7, 7]               0
          Conv2d-173           [-1, 2048, 7, 7]       2,097,152
     BatchNorm2d-174           [-1, 2048, 7, 7]           4,096
            ReLU-175           [-1, 2048, 7, 7]               0
      Bottleneck-176           [-1, 2048, 7, 7]               0
          Conv2d-177            [-1, 512, 7, 7]       1,048,576
     BatchNorm2d-178            [-1, 512, 7, 7]           1,024
            ReLU-179            [-1, 512, 7, 7]               0
          Conv2d-180            [-1, 512, 7, 7]       2,359,296
     BatchNorm2d-181            [-1, 512, 7, 7]           1,024
            ReLU-182            [-1, 512, 7, 7]               0
        Identity-183            [-1, 512, 7, 7]               0
          Conv2d-184           [-1, 2048, 7, 7]       1,048,576
     BatchNorm2d-185           [-1, 2048, 7, 7]           4,096
            ReLU-186           [-1, 2048, 7, 7]               0
      Bottleneck-187           [-1, 2048, 7, 7]               0
          Conv2d-188            [-1, 512, 7, 7]       1,048,576
     BatchNorm2d-189            [-1, 512, 7, 7]           1,024
            ReLU-190            [-1, 512, 7, 7]               0
          Conv2d-191            [-1, 512, 7, 7]       2,359,296
     BatchNorm2d-192            [-1, 512, 7, 7]           1,024
            ReLU-193            [-1, 512, 7, 7]               0
        Identity-194            [-1, 512, 7, 7]               0
          Conv2d-195           [-1, 2048, 7, 7]       1,048,576
     BatchNorm2d-196           [-1, 2048, 7, 7]           4,096
            ReLU-197           [-1, 2048, 7, 7]               0
      Bottleneck-198           [-1, 2048, 7, 7]               0
 AttentionPool2d-199                 [-1, 1024]               0
================================================================

text_encoder网络结构打印:

positional_embedding torch.Size([77, 512])
text_projection torch.Size([512, 1024])
logit_scale torch.Size([])
token_embedding.weight torch.Size([49408, 512])
ln_final.weight torch.Size([512])
ln_final.bias torch.Size([512])
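
这些形状可以通过遍历模型中不带 visual 前缀的参数得到(示意代码;上面的打印省略了 transformer.resblocks 各层):

import clip

model, _ = clip.load("RN50", device="cpu")  # text_projection 为 [512, 1024],对应 RN50
for name, param in model.named_parameters():
    if not name.startswith("visual."):
        print(name, param.shape)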

CLIP支持模型

RN50、RN101、RN50x4、RN50x16、RN50x64、ViT-B/32、ViT-B/16、ViT-L/14、ViT-L/14@336px
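
这一列表也可以在代码中直接查询:

import clip

print(clip.available_models())
# ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64',
#  'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']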

各模型性能对比可参见论文中的实验图表(原图略)。

CLIP模型下载地址

| 模型 | 下载地址 |
| --- | --- |
| RN50 | https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt |
| RN101 | https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt |
| RN50x4 | https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt |
| RN50x16 | https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt |
| RN50x64 | https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt |
| ViT-B/32 | https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt |
| ViT-B/16 | https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt |
| ViT-L/14 | https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt |
| ViT-L/14@336px | https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt |

各类模型参数对比

下表中 mparams 为参数量(单位:百万),gflops 为单次前向的计算量,image_*、text_* 分别对应图像塔与文本塔:

| model | image_size | image_width | text_width | embed_dim | mparams | image_mparams | text_mparams | gflops | image_gflops | text_gflops |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| ViT-S-32-alt | 224 | 384 | 256 | 256 | 43.22 | 22.59 | 20.63 | 3.56 | 2.29 | 1.27 |
| ViT-S-32 | 224 | 384 | 384 | 384 | 63.09 | 22.64 | 40.44 | 5.66 | 2.29 | 3.38 |
| ViT-M-32-alt | 224 | 512 | 384 | 384 | 80.07 | 39.63 | 40.44 | 7.37 | 3.99 | 3.38 |
| ViT-M-32 | 224 | 512 | 512 | 512 | 103.12 | 39.69 | 63.43 | 9.95 | 3.99 | 5.96 |
| ViT-S-16-alt | 224 | 384 | 256 | 256 | 42.4 | 21.76 | 20.63 | 10.47 | 9.2 | 1.27 |
| ViT-S-16 | 224 | 384 | 384 | 384 | 62.26 | 21.81 | 40.44 | 12.58 | 9.2 | 3.38 |
| ViT-B-32 | 224 | 768 | 512 | 512 | 151.28 | 87.85 | 63.43 | 14.78 | 8.82 | 5.96 |
| ViT-B-32-quickgelu | 224 | 768 | 512 | 512 | 151.28 | 87.85 | 63.43 | 14.78 | 8.82 | 5.96 |
| convnext_tiny | 224 | 768 | 512 | 1024 | 92.3 | 28.61 | 63.69 | 14.87 | 8.91 | 5.96 |
| ViT-B-32-256 | 256 | 768 | 512 | 512 | 151.29 | 87.86 | 63.43 | 17.46 | 11.5 | 5.96 |
| RN50 | 224 | 64 | 512 | 1024 | 102.01 | 38.32 | 63.69 | 18.18 | 12.22 | 5.96 |
| RN50-quickgelu | 224 | 64 | 512 | 1024 | 102.01 | 38.32 | 63.69 | 18.18 | 12.22 | 5.96 |
| ViT-M-16-alt | 224 | 512 | 384 | 384 | 78.98 | 38.53 | 40.44 | 19.36 | 15.98 | 3.38 |
| ViT-M-16 | 224 | 512 | 512 | 512 | 102.02 | 38.59 | 63.43 | 21.94 | 15.98 | 5.96 |
| vit_relpos_medium_patch16_cls_224 | 224 | 768 | 512 | 512 | 101.94 | 38.51 | 63.43 | 21.99 | 16.03 | 5.96 |
| mt5-base-ViT-B-32 | 224 | 768 | 512 | 512 | 365.71 | 87.85 | 277.86 | 22.12 | 8.82 | 13.3 |
| convnext_small | 224 | 768 | 512 | 512 | 113.28 | 49.85 | 63.43 | 23.33 | 17.37 | 5.96 |
| ViT-B-32-plus-256 | 256 | 896 | 640 | 640 | 210.3 | 119.13 | 91.16 | 24.83 | 15.56 | 9.27 |
| RN101 | 224 | 64 | 512 | 512 | 119.69 | 56.26 | 63.43 | 25.5 | 19.54 | 5.96 |
| RN101-quickgelu | 224 | 64 | 512 | 512 | 119.69 | 56.26 | 63.43 | 25.5 | 19.54 | 5.96 |
| vit_medium_patch16_gap_256 | 256 | 768 | 512 | 512 | 102.04 | 38.61 | 63.43 | 27.1 | 21.14 | 5.96 |
| coca_ViT-B-32 | 224 | 768 | 512 | 512 | 253.56 | 89.16 | 63.43 | 33.34 | 9.19 | 5.96 |
| convnext_base | 224 | 768 | 512 | 512 | 151.52 | 88.09 | 63.43 | 36.67 | 30.71 | 5.96 |
| swin_base_patch4_window7_224 | 224 | 768 | 640 | 640 | 178.56 | 87.4 | 91.16 | 40.13 | 30.86 | 9.27 |
| ViT-B-16 | 224 | 768 | 512 | 512 | 149.62 | 86.19 | 63.43 | 41.09 | 35.13 | 5.96 |
| ViT-B-16-quickgelu | 224 | 768 | 512 | 512 | 149.62 | 86.19 | 63.43 | 41.09 | 35.13 | 5.96 |
| EVA02-B-16 | 224 | 768 | 512 | 512 | 149.69 | 86.26 | 63.43 | 41.09 | 35.13 | 5.96 |
| ViT-B-16-SigLIP | 224 | 768 | 768 | 768 | 203.16 | 92.88 | 110.27 | 46.44 | 35.42 | 11.02 |
| convnext_base_w | 256 | 768 | 640 | 640 | 179.39 | 88.22 | 91.16 | 49.38 | 40.11 | 9.27 |
| RN50x4 | 288 | 80 | 640 | 640 | 178.3 | 87.14 | 91.16 | 51.82 | 42.56 | 9.27 |
| coca_roberta-ViT-B-32 | 224 | 768 | 768 | 512 | 420.37 | 87.85 | 124.45 | 53.12 | 8.82 | 13.12 |
| ViT-B-16-plus | 224 | 896 | 640 | 640 | 208.35 | 117.19 | 91.16 | 56.75 | 47.49 | 9.27 |
| ViT-B-16-SigLIP-256 | 256 | 768 | 768 | 768 | 203.2 | 92.93 | 110.27 | 57.84 | 46.82 | 11.02 |
| ViT-B-16-SigLIP-i18n-256 | 256 | 768 | 768 | 768 | 370.63 | 92.93 | 277.7 | 57.84 | 46.82 | 11.02 |
| ViT-B-16-plus-240 | 240 | 896 | 640 | 640 | 208.38 | 117.21 | 91.16 | 64.03 | 54.76 | 9.27 |
| convnext_base_w_320 | 320 | 768 | 640 | 640 | 179.39 | 88.22 | 91.16 | 71.94 | 62.67 | 9.27 |
| convnext_large | 224 | 768 | 768 | 768 | 321.06 | 197.41 | 123.65 | 82.02 | 68.72 | 13.3 |
| coca_base | 288 | 768 | 768 | 512 | 440.34 | 86.4 | 134.66 | 99.09 | 46.47 | 13.3 |
| roberta-ViT-B-32 | 224 | 768 | 512 | 512 | 212.72 | 87.85 | 124.87 | 105.87 | 8.82 | 97.05 |
| xlm-roberta-base-ViT-B-32 | 224 | 768 | 512 | 512 | 366.12 | 87.85 | 278.27 | 105.87 | 8.82 | 97.05 |
| convnext_large_d | 256 | 768 | 768 | 768 | 351.77 | 199.77 | 152.0 | 107.5 | 89.76 | 17.73 |
| ViT-B-16-SigLIP-384 | 384 | 768 | 768 | 768 | 203.45 | 93.18 | 110.27 | 123.15 | 112.13 | 11.02 |
| ViT-L-16 | 224 | 1024 | 768 | 768 | 427.74 | 304.09 | 123.65 | 136.41 | 123.11 | 13.3 |
| convnext_large_d_320 | 320 | 768 | 768 | 768 | 351.77 | 199.77 | 152.0 | 157.98 | 140.25 | 17.73 |
| RN50x16 | 384 | 96 | 768 | 768 | 290.98 | 167.33 | 123.65 | 162.69 | 149.39 | 13.3 |
| ViT-L-14-CLIPA | 224 | 1024 | 768 | 768 | 414.21 | 303.96 | 110.25 | 167.5 | 162.03 | 5.47 |
| EVA02-L-14 | 224 | 768 | 768 | 768 | 427.76 | 304.11 | 123.65 | 175.3 | 162.0 | 13.3 |
| ViT-L-14 | 224 | 1024 | 768 | 768 | 427.62 | 303.97 | 123.65 | 175.33 | 162.03 | 13.3 |
| ViT-L-14-quickgelu | 224 | 1024 | 768 | 768 | 427.62 | 303.97 | 123.65 | 175.33 | 162.03 | 13.3 |
| convnext_xlarge | 256 | 768 | 1024 | 1024 | 653.89 | 350.25 | 303.65 | 198.38 | 159.14 | 39.24 |
| ViT-L-16-SigLIP-256 | 256 | 768 | 1024 | 1024 | 652.15 | 315.96 | 336.19 | 201.62 | 162.56 | 39.06 |
| coca_ViT-L-14 | 224 | 1024 | 768 | 768 | 638.45 | 306.72 | 123.65 | 214.52 | 163.64 | 13.3 |
| ViT-B-16-SigLIP-512 | 512 | 768 | 768 | 768 | 203.79 | 93.52 | 110.27 | 227.26 | 216.24 | 11.02 |
| ViT-SO400M-14-SigLIP | 224 | 768 | 1152 | 1152 | 877.36 | 427.68 | 449.68 | 233.54 | 220.35 | 13.19 |
| ViT-L-14-280 | 280 | 1024 | 768 | 768 | 427.76 | 304.11 | 123.65 | 271.79 | 258.49 | 13.3 |
| ViT-L-16-320 | 320 | 1024 | 768 | 768 | 427.95 | 304.3 | 123.65 | 271.93 | 258.63 | 13.3 |
| ViT-H-16 | 224 | 1280 | 1024 | 1024 | 986.26 | 632.23 | 354.03 | 301.72 | 254.63 | 47.09 |
| ViT-H-14-CLIPA | 224 | 1280 | 1024 | 1024 | 968.24 | 632.07 | 336.16 | 354.02 | 334.59 | 19.43 |
| nllb-clip-base | 224 | 768 | 512 | 512 | 501.89 | 87.85 | 414.04 | 369.6 | 8.82 | 360.78 |
| ViT-H-14 | 224 | 1280 | 1024 | 1024 | 986.11 | 632.08 | 354.03 | 381.68 | 334.59 | 47.09 |
| ViT-H-14-quickgelu | 224 | 1280 | 1024 | 1024 | 986.11 | 632.08 | 354.03 | 381.68 | 334.59 | 47.09 |
| ViT-L-14-CLIPA-336 | 336 | 1024 | 768 | 768 | 414.54 | 304.29 | 110.25 | 387.39 | 381.92 | 5.47 |
| EVA02-L-14-336 | 336 | 768 | 768 | 768 | 428.08 | 304.43 | 123.65 | 395.16 | 381.86 | 13.3 |
| ViT-L-14-336 | 336 | 1024 | 768 | 768 | 427.94 | 304.29 | 123.65 | 395.22 | 381.92 | 13.3 |
| ViT-L-16-SigLIP-384 | 384 | 768 | 1024 | 1024 | 652.48 | 316.28 | 336.19 | 422.91 | 383.85 | 39.06 |
| convnext_xxlarge | 256 | 768 | 1024 | 1024 | 1200.58 | 846.54 | 354.03 | 443.03 | 395.94 | 47.09 |
| nllb-clip-base-siglip | 384 | 768 | 512 | 768 | 507.47 | 93.18 | 414.3 | 472.91 | 112.13 | 360.78 |
| mt5-xl-ViT-H-14 | 224 | 1280 | 512 | 1024 | 2306.75 | 632.08 | 1674.68 | 514.04 | 334.59 | 179.45 |
| EVA01-g-14 | 224 | 768 | 768 | 1024 | 1136.44 | 1012.59 | 123.85 | 547.36 | 534.06 | 13.3 |
| RN50x64 | 448 | 128 | 1024 | 1024 | 623.26 | 420.38 | 202.88 | 552.65 | 529.11 | 23.55 |
| EVA01-g-14-plus | 224 | 768 | 1024 | 1024 | 1366.62 | 1012.59 | 354.03 | 581.15 | 534.06 | 47.09 |
| ViT-g-14 | 224 | 1408 | 1024 | 1024 | 1366.68 | 1012.65 | 354.03 | 581.15 | 534.06 | 47.09 |
| convnext_xxlarge_320 | 320 | 768 | 1024 | 1024 | 1200.58 | 846.54 | 354.03 | 665.74 | 618.65 | 47.09 |
| xlm-roberta-large-ViT-H-14 | 224 | 1280 | 512 | 1024 | 1193.01 | 632.08 | 560.94 | 671.01 | 334.59 | 336.42 |
| ViT-SO400M-14-SigLIP-384 | 384 | 768 | 1152 | 1152 | 877.96 | 428.23 | 449.73 | 723.48 | 670.35 | 53.13 |
| ViT-H-14-CLIPA-336 | 336 | 1280 | 1024 | 1024 | 968.64 | 632.48 | 336.16 | 800.88 | 781.45 | 19.43 |
| ViT-bigG-14-CLIPA | 224 | 1664 | 1280 | 1280 | 2517.22 | 1844.9 | 672.32 | 1007.93 | 967.5 | 40.44 |
| ViT-H-14-378-quickgelu | 378 | 1280 | 1024 | 1024 | 986.71 | 632.68 | 354.03 | 1054.05 | 1006.96 | 47.09 |
| ViT-bigG-14 | 224 | 1664 | 1280 | 1280 | 2539.57 | 1844.91 | 694.66 | 1065.36 | 967.5 | 97.86 |
| nllb-clip-large | 224 | 1280 | 512 | 1024 | 1399.22 | 632.08 | 767.14 | 1468.46 | 334.59 | 1133.87 |
| nllb-clip-large-siglip | 384 | 768 | 512 | 1152 | 1195.5 | 428.23 | 767.27 | 1804.22 | 670.35 | 1133.87 |
| ViT-e-14 | 224 | 1792 | 1280 | 1280 | 4581.09 | 3807.72 | 773.37 | 2091.45 | 1981.35 | 110.1 |
| ViT-bigG-14-CLIPA-336 | 336 | 1664 | 1280 | 1280 | 2517.76 | 1845.44 | 672.32 | 2271.58 | 2231.15 | 40.44 |
| EVA02-E-14 | 224 | 768 | 1024 | 1024 | 4704.59 | 4350.56 | 354.03 | 2311.42 | 2264.33 | 47.09 |
| EVA02-E-14-plus | 224 | 768 | 1280 | 1024 | 5044.89 | 4350.56 | 694.33 | 2362.19 | 2264.33 | 97.86 |