(4-7-03) Text-to-Image Large Models in Practice: A Deep-Learning-Based Image-Text Matching System (3) Implementing the AI Models

3. Implementing the AI Models

The "models" directory contains the implementations of several deep-learning models, including a bidirectional LSTM, MobileNetV1, and the ResNet family. These models extract image and text features and project them into a shared feature space through 1x1 convolution layers, enabling multimodal learning. The directory is designed to be flexible: the image backbone is chosen according to the input arguments and its weights are initialized accordingly, providing the foundation for the downstream tasks.

(1) The file bi_lstm.py defines a bidirectional LSTM network, BiLSTM, for processing sequence data. The model consists of a word-embedding layer plus forward and backward LSTM layers. The forward method computes the LSTM outputs from the input token sequences and their lengths and then applies max pooling over the time dimension. The helper method bilstm_out sorts the embedded sequences by length, packs them, runs the LSTM, and unpacks the result. The weight_init function applies Xavier initialization to any convolution layers. A small usage sketch follows the code listing.

import random

import torch
import torch.nn as nn

seed_num = 223
torch.manual_seed(seed_num)
random.seed(seed_num)

"""
Neural network model: bidirectional LSTM
"""
class BiLSTM(nn.Module):
    def __init__(self, args):
        super(BiLSTM, self).__init__()

        self.hidden_dim = args.num_lstm_units

        V = args.vocab_size
        D = args.embedding_size

        # Word-embedding layer
        self.embed = nn.Embedding(V, D, padding_idx=0)
        
        self.bilstm = nn.ModuleList()
        self.bilstm.append(nn.LSTM(D, args.num_lstm_units, num_layers=1, dropout=0, bidirectional=False, bias=False))
        
        self.bidirectional = args.bidirectional
        if self.bidirectional:
            self.bilstm.append(nn.LSTM(D, args.num_lstm_units, num_layers=1, dropout=0, bidirectional=False, bias=False))

    def forward(self, text, text_length):
        embed = self.embed(text)

        # Forward-direction LSTM
        bilstm_out = self.bilstm_out(embed, text_length, 0)
        
        if self.bidirectional:
            index_reverse = list(range(embed.shape[0]-1, -1, -1))
            index_reverse = torch.LongTensor(index_reverse).cuda()
            embed_reverse = embed.index_select(0, index_reverse)
            text_length_reverse = text_length.index_select(0, index_reverse)
            bilstm_out_bidirection = self.bilstm_out(embed_reverse, text_length_reverse, 1)
            bilstm_out_bidirection_reverse = bilstm_out_bidirection.index_select(0, index_reverse)
            bilstm_out = torch.cat([bilstm_out, bilstm_out_bidirection_reverse], dim=2)
        bilstm_out, _ = torch.max(bilstm_out, dim=1)
        bilstm_out = bilstm_out.unsqueeze(2).unsqueeze(2)
        return bilstm_out

    def bilstm_out(self, embed, text_length, index):

        _, idx_sort = torch.sort(text_length, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)

        embed_sort = embed.index_select(0, idx_sort)
        length_list = text_length[idx_sort]
        pack = nn.utils.rnn.pack_padded_sequence(embed_sort, length_list.cpu(), batch_first=True)  # lengths must be on the CPU in recent PyTorch versions

        bilstm_sort_out, _ = self.bilstm[index](pack)
        bilstm_sort_out = nn.utils.rnn.pad_packed_sequence(bilstm_sort_out, batch_first=True)
        bilstm_sort_out = bilstm_sort_out[0]

        bilstm_out = bilstm_sort_out.index_select(0, idx_unsort)

        return bilstm_out

    def weight_init(self, m):
        # Xavier initialization for any convolution layers the module owns
        if isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform_(m.weight.data, 1)
            nn.init.constant_(m.bias.data, 0)
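
As a sanity check, the following minimal sketch runs BiLSTM on dummy data; the argument values are hypothetical and the real ones come from the project's argument parser. It sets bidirectional=False so the forward pass stays on the CPU (the bidirectional branch calls .cuda()).

from types import SimpleNamespace
import torch

# Hypothetical hyperparameters, for illustration only
args = SimpleNamespace(num_lstm_units=512, vocab_size=12000,
                       embedding_size=512, bidirectional=False)
text_model = BiLSTM(args)

text = torch.randint(1, args.vocab_size, (4, 30))  # a padded batch of 4 captions, 30 tokens each
text_length = torch.tensor([30, 25, 18, 9])        # true caption lengths before padding
out = text_model(text, text_length)
print(out.shape)  # torch.Size([4, 512, 1, 1]) -- ready for the 1x1 text convolution in model.py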

(2) The file mobilenet.py implements the MobileNetV1 model, used here as an efficient image feature extractor. The network relies on depthwise-separable convolutions to cut the parameter count and computational cost and to speed up inference. The constructor defines the combinations of convolution, batch-normalization, and activation layers, and the forward method pushes the input through the network. The class also provides a weight-initialization method that draws the convolution weights from a normal distribution to improve training. A shape-check sketch follows the listing.

import torch.nn as nn
import math

class MobileNetV1(nn.Module):
    def __init__(self, dropout_keep_prob=0.999):
        super(MobileNetV1, self).__init__()
        self.dropout_keep_prob = dropout_keep_prob
        self.dropout = nn.Dropout(1 - dropout_keep_prob)

        # Standard convolution + batch norm + ReLU6 block
        def conv_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU6(inplace=True)
            )

        # Depthwise-separable convolution block (depthwise 3x3 followed by pointwise 1x1)
        def conv_dw(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False),
                nn.BatchNorm2d(inp),
                nn.ReLU6(inplace=True),

                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU6(inplace=True),
            )

        # Overall MobileNetV1 backbone
        self.model = nn.Sequential(
            conv_bn(3, 32, 2),
            conv_dw(32, 64, 1),
            conv_dw(64, 128, 2),
            conv_dw(128, 128, 1),
            conv_dw(128, 256, 2),
            conv_dw(256, 256, 1),
            conv_dw(256, 512, 2),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 512, 1),
            conv_dw(512, 1024, 2),
            conv_dw(1024, 1024, 1),
            nn.AvgPool2d(7),
        )

    def weight_init(self, m):
        if isinstance(m, nn.Conv2d):
            # Initialize convolution weights from a normal distribution
            nn.init.normal_(m.weight.data, std=0.09)

    def forward(self, x):
        x = self.model(x)
        x = self.dropout(x)
        return x
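
As a quick shape check (a sketch, not part of the original file): a 224x224 RGB batch passes through the stack of depthwise-separable blocks and the final 7x7 average pooling and comes out as an (N, 1024, 1, 1) feature map, which is the channel count that model.py later feeds into its 1x1 image convolution when MobileNetV1 is selected.

import torch

net = MobileNetV1()
net.apply(net.weight_init)        # normal-distribution init for the convolution layers
x = torch.randn(2, 3, 224, 224)   # dummy batch of two RGB images
features = net(x)
print(features.shape)             # torch.Size([2, 1024, 1, 1])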

(3) The file resnet.py implements the ResNet model family, including ResNet-18, ResNet-34, ResNet-50, ResNet-101, and ResNet-152. ResNet uses residual connections to mitigate the vanishing-gradient problem when training deep networks.

import torch.nn as nn
import torch.utils.model_zoo as model_zoo

__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']

# Download URLs for the pretrained weights
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}

def conv3x3(in_planes, out_planes, stride=1):
    """带填充的3x3卷积"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=False)

def conv1x1(in_planes, out_planes, stride=1):
    """1x1卷积"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual  # residual connection
        out = self.relu(out)
        return out

class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = conv1x1(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = conv3x3(planes, planes, stride)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv1x1(planes, planes * self.expansion)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual  # residual connection
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        self.inplanes = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        # Build the four residual stages
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # Initialize weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        # Build the downsampling branch when the spatial size or channel count changes
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        # The flatten and fc classification head are intentionally disabled here:
        # the backbone returns a pooled feature map for the joint embedding instead of logits.
        # x = x.view(x.size(0), -1)
        # x = self.fc(x)
        return x

# Constructors for the different ResNet variants
def resnet18(pretrained=False, **kwargs):
    """构造ResNet-18模型。
    Args:
        pretrained (bool): 如果为True,返回在ImageNet上预训练的模型
    """
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model

def resnet34(pretrained=False, **kwargs):
    """构造ResNet-34模型。
    Args:
        pretrained (bool): 如果为True,返回在ImageNet上预训练的模型
    """
    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
    return model

def resnet50(pretrained=False, **kwargs):
    """构造ResNet-50模型。
    Args:
        pretrained (bool): 如果为True,返回在ImageNet上预训练的模型
    """
    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
    return model

def resnet101(pretrained=False, **kwargs):
    """构造ResNet-101模型。
    Args:
        pretrained (bool): 如果为True,返回在ImageNet上预训练的模型
    """
    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
    return model

def resnet152(pretrained=False, **kwargs):
    """构造ResNet-152模型。
    Args:
        pretrained (bool): 如果为True,返回在ImageNet上预训练的模型
    """
    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if pretrained:
        model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
    return model

The code above defines the basic residual block BasicBlock and the bottleneck block Bottleneck, as well as the ResNet body itself. The constructor builds the four residual stages through the _make_layer method and adds a 1x1 downsampling branch whenever the spatial size or channel count changes, so that the residual addition stays shape-consistent. Note that in this project the flatten and fully-connected steps are commented out of forward, so each backbone returns a pooled feature map rather than class logits. Every constructor can also load ImageNet-pretrained weights to improve performance on the new task, as in the sketch below.
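
A minimal usage sketch (not part of the original file): the backbone is used purely as a feature extractor, and pretrained=True would download the ImageNet weights from model_urls.

import torch

backbone = resnet50(pretrained=False)  # pretrained=True loads the ImageNet weights listed in model_urls
x = torch.randn(2, 3, 224, 224)        # dummy batch of two RGB images
feat = backbone(x)
print(feat.shape)  # torch.Size([2, 2048, 1, 1]) -- 512 * Bottleneck.expansion channels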

(4) The file model.py defines the class Model, which inherits from nn.Module and implements the multimodal model. Depending on the input arguments args, it selects an image backbone (MobileNetV1, ResNet-50, or ResNet-101) and initializes the corresponding weights. The constructor also creates a bidirectional LSTM (BiLSTM) for the text data and uses 1x1 convolutions to project the extracted image and text features into a shared feature space. The forward method extracts the image and text features and calls build_joint_embeddings to build the joint embeddings for downstream tasks such as classification or retrieval. An end-to-end usage sketch follows the listing.

import torch.nn as nn
from .bi_lstm import BiLSTM
from .mobilenet import MobileNetV1
from .resnet import resnet50, resnet101

class Model(nn.Module):
    def __init__(self, args):
        super(Model, self).__init__()
        # Select the image backbone according to the arguments
        if args.image_model == 'mobilenet_v1':
            self.image_model = MobileNetV1()  # use MobileNetV1
            self.image_model.apply(self.image_model.weight_init)  # initialize its weights
        elif args.image_model == 'resnet50':
            self.image_model = resnet50()  # use ResNet-50
        elif args.image_model == 'resnet101':
            self.image_model = resnet101()  # use ResNet-101

        self.bilstm = BiLSTM(args)  # text branch: bidirectional LSTM
        self.bilstm.apply(self.bilstm.weight_init)  # initialize its weights

        inp_size = 1024  # channel count of the MobileNetV1 feature map
        if args.image_model == 'resnet50' or args.image_model == 'resnet101':
            inp_size = 2048  # ResNet-50/101 produce 2048-channel features

        # 1x1 convolutions project both modalities to the shared feature size
        self.conv_images = nn.Conv2d(inp_size, args.feature_size, 1)  # image-feature projection
        self.conv_text = nn.Conv2d(1024, args.feature_size, 1)  # text-feature projection (expects 1024 text channels)

    def forward(self, images, text, text_length):
        # Forward pass
        image_features = self.image_model(images)  # extract image features
        text_features = self.bilstm(text, text_length)  # extract text features
        image_embeddings, text_embeddings = self.build_joint_embeddings(image_features, text_features)  # build the joint embeddings
        return image_embeddings, text_embeddings

    def build_joint_embeddings(self, images_features, text_features):
        # Project the extracted features into the joint embedding space
        image_embeddings = self.conv_images(images_features).squeeze()  # image embeddings
        text_embeddings = self.conv_text(text_features).squeeze()  # text embeddings
        return image_embeddings, text_embeddings
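
Below is a hedged end-to-end sketch of Model on dummy data; all argument values are hypothetical. Because conv_text is hard-coded for 1024 text channels and the bidirectional branch of BiLSTM calls .cuda(), this CPU-only sketch uses a unidirectional LSTM with 1024 units; with a GPU, the intended configuration would instead be bidirectional=True with num_lstm_units=512.

from types import SimpleNamespace
import torch

# Hypothetical argument values, for illustration only
args = SimpleNamespace(image_model='mobilenet_v1', feature_size=512,
                       num_lstm_units=1024, vocab_size=12000,
                       embedding_size=512, bidirectional=False)
net = Model(args)

images = torch.randn(4, 3, 224, 224)               # dummy image batch
text = torch.randint(1, args.vocab_size, (4, 30))  # dummy padded captions
text_length = torch.tensor([30, 27, 22, 15])       # true caption lengths

image_embeddings, text_embeddings = net(images, text, text_length)
print(image_embeddings.shape, text_embeddings.shape)  # torch.Size([4, 512]) torch.Size([4, 512])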

Note: This project implements several models (a bidirectional LSTM, MobileNetV1, and ResNet) to exploit their complementary strengths in multimodal learning. The bidirectional LSTM is well suited to sequence data and captures contextual information in the text; MobileNetV1 is a lightweight convolutional network that extracts image features efficiently, especially in resource-constrained environments; and ResNet uses residual connections to overcome the vanishing-gradient problem in deep networks and extracts high-level image features effectively. Combining these models lets the project capture the relationship between images and text more comprehensively, improving matching accuracy and robustness.

Finally, a quick advertisement to celebrate the release of the book 《大规模语言模型开发基础与实践》 from Peking University Press; its content corresponds to parts 1-12 of this column, 《大模型从入门到实战(数据集、训练、微调、RAG、多模态)》. Interested readers are welcome to buy it, and thank you all for your support:

Purchase links: JD.com / Dangdang
