MMIM-MODEL

最新推荐文章于 2024-09-16 16:34:53 发布
weixin_47705508
最新推荐文章于 2024-09-16 16:34:53 发布
阅读量121
点赞数 1
文章标签：深度学习人工智能
本文链接：https://blog.csdn.net/weixin_47705508/article/details/139936388
版权
import torch
import torch.nn.functional as F
import time

from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from transformers import BertModel, BertConfig

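# def: the Python keyword that defines a function.
# add_noise: the name of the function.
# x: the first parameter, the tensor to which noise is added.
# intens: the second parameter, the noise intensity, defaulting to 1e-7. It is optional; when the caller omits it, the default value is used.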
def add_noise(x, intens=1e-7):
    """Return ``x`` plus uniform random noise scaled by ``intens``.

    Args:
        x: tensor to perturb; it is not modified in place.
        intens: factor multiplied into noise drawn uniformly from [0, 1).

    Returns:
        A new tensor with the same shape as ``x``.
    """
    # Sample on x's own device: noise created on the default (CPU) device
    # cannot be added to a tensor that lives on an accelerator.
    noise = torch.rand(x.size(), device=x.device)
    return x + noise * intens

# Defines the LanguageEmbeddingLayer class, which inherits from nn.Module,
# the base class of all neural-network modules in PyTorch.
class LanguageEmbeddingLayer(nn.Module):
    """Text embedding layer backed by a pretrained ``bert-base-uncased`` model."""

    def __init__(self, hp):
        """Load the BERT configuration and weights.

        Args:
            hp: hyper-parameter object; accepted for interface compatibility
                but not read by this constructor.
        """
        super().__init__()
        # Ask the model to also expose every hidden layer's states.
        config = BertConfig.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.bertmodel = BertModel.from_pretrained('bert-base-uncased', config=config)

    def forward(self, sentences, bert_sent, bert_sent_type, bert_sent_mask):
        """Run BERT and return the first element of its output.

        Args:
            sentences: not used by this method.
            bert_sent: passed to BERT as ``input_ids``.
            bert_sent_type: passed to BERT as ``token_type_ids``.
            bert_sent_mask: passed to BERT as ``attention_mask``.

        Returns:
            ``output[0]`` of the BERT call (the sequence representation).
        """
        encoder_outputs = self.bertmodel(
            input_ids=bert_sent,
            attention_mask=bert_sent_mask,
            token_type_ids=bert_sent_type,
        )
        sequence_repr = encoder_outputs[0]
        return sequence_repr

# Defines the SubNet class, used to build the sub-network of the pre-fusion stage; it also inherits from nn.Module.
class SubNet(nn.Module):
    '''
    The subnetwork that is used in TFN for video and audio in the pre-fusion stage.

    Structure: dropout -> linear + tanh -> linear + tanh -> linear head.
    '''

    def __init__(self, in_size, hidden_size, n_class, dropout, modal_name='text'):
        '''
        Args:
            in_size: input dimension
            hidden_size: hidden layer dimension
            n_class: output dimension of the final linear layer
            dropout: dropout probability applied to the input
            modal_name: accepted for interface compatibility; not used here
        Output:
            (return value in forward) a tuple ``(y_2, y_3)``: the second hidden
            activation and the output of the final linear layer
        '''
        super(SubNet, self).__init__()
        self.drop = nn.Dropout(p=dropout)
        self.linear_1 = nn.Linear(in_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, hidden_size)
        self.linear_3 = nn.Linear(hidden_size, n_class)

    def forward(self, x):
        '''
        Args:
            x: input tensor whose last dimension is in_size
        Returns:
            (y_2, y_3): tanh output of the second linear layer, and the output
            of the third linear layer applied to y_2.
        '''
        dropped = self.drop(x)
        y_1 = torch.tanh(self.linear_1(dropped))
        # linear_2 is evaluated exactly once; the original also computed an
        # unused `fusion = self.linear_2(y_1)`, doubling the work for nothing.
        y_2 = torch.tanh(self.linear_2(y_1))
        y_3 = self.linear_3(y_2)
        return y_2, y_3

class CLUB(nn.Module):
    """
        Compute the Contrastive Log-ratio Upper Bound (CLUB) given a pair of inputs.
        Refer to https://arxiv.org/pdf/2006.12013.pdf and https://github.com/Linear95/CLUB/blob/f3457fc250a5773a6c476d79cda8cb07e1621313/MI_DA/MNISTModel_DANN.py#L233-254

        Args:
            hidden_size(int): embedding size
            activation(str): name of the torch.nn activation class used in the middle layer of the MLPs

        Raises:
            ValueError: if ``activation`` does not name an attribute of torch.nn
    """

    def __init__(self, hidden_size, activation='Tanh'):
        super(CLUB, self).__init__()
        # Resolve the activation class by name. Only lookup failures are
        # translated; unrelated errors are not swallowed.
        try:
            self.activation = getattr(nn, activation)
        except (AttributeError, TypeError) as err:
            raise ValueError("Error: CLUB activation function not found in torch library") from err

        # MLP predicting the mean of q(y|x).
        self.mlp_mu = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            self.activation(),
            nn.Linear(hidden_size, hidden_size)
        )
        # MLP predicting the log-variance of q(y|x); the trailing Tanh keeps
        # the log-variance inside [-1, 1].
        self.mlp_logvar = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            self.activation(),
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh()
        )

    def forward(self, modal_a, modal_b, sample=False):
        """
            CLUB, the Q function in original paper:
                CLUB = E_p(x,y)[log q(y|x)]-E_p(x)p(y)[log q(y|x)]

            Args:
                modal_a (Tensor): x in above equation, shape (bs, hidden_size)
                modal_b (Tensor): y in above equation, shape (bs, hidden_size)
                sample (bool): not used by this method

            Returns:
                (lld, bound): the batch mean of the summed positive term, and
                the batch mean of (positive - negative), both scalar tensors.
        """
        mu, logvar = self.mlp_mu(modal_a), self.mlp_logvar(modal_a)  # (bs, hidden_size)
        var = torch.exp(logvar)

        # Positive term: each x_i paired with its own y_i.
        positive = -(mu - modal_b) ** 2 / 2. / var

        # Negative term: each mu_i against every y_j in the batch.
        # The original tiled `mu` on both sides, so modal_b never entered this
        # term; the reference implementation uses the y samples here.
        # Broadcasting gives the (bs, bs, emb) differences without `repeat`.
        pairwise_sq = (modal_b.unsqueeze(0) - mu.unsqueeze(1)) ** 2  # [i, j] = (y_j - mu_i)^2
        negative = -torch.mean(pairwise_sq, dim=1) / 2. / var

        positive_sum = torch.sum(positive, -1)
        lld = torch.mean(positive_sum)
        bound = torch.mean(positive_sum - torch.sum(negative, -1))
        return lld, bound
class MMILB(nn.Module):
    """Compute the Modality Mutual Information Lower Bound (MMILB) given bimodal representations.

    Args:
        x_size (int): embedding size of input modality representation x
        y_size (int): embedding size of input modality representation y
        mid_activation (str): name of the torch.nn activation class used in the middle layer of each MLP
        last_activation (str): name of a torch.nn activation class; it is resolved and stored on the
            module but is not inserted into any layer built here

    Raises:
        ValueError: if either activation name is not an attribute of torch.nn
    """

    def __init__(self, x_size, y_size, mid_activation='ReLU', last_activation='Tanh'):
        super(MMILB, self).__init__()
        # Resolve activation classes by name; only lookup failures are
        # translated into ValueError.
        try:
            self.mid_activation = getattr(nn, mid_activation)
            self.last_activation = getattr(nn, last_activation)
        except (AttributeError, TypeError) as err:
            raise ValueError("Error: MMILB activation function not found in torch library") from err

        # Predicts the mean of q(y|x) from x.
        self.mlp_mu = nn.Sequential(
            nn.Linear(x_size, y_size),
            self.mid_activation(),
            nn.Linear(y_size, y_size)
        )
        # Predicts the log-variance of q(y|x) from x (same layout as mlp_mu).
        self.mlp_logvar = nn.Sequential(
            nn.Linear(x_size, y_size),
            self.mid_activation(),
            nn.Linear(y_size, y_size),
        )
        # Projects y down to y_size // 4 dimensions, bounded by Tanh, before
        # the entropy estimate.
        self.entropy_prj = nn.Sequential(
            nn.Linear(y_size, y_size // 4),
            nn.Tanh()
        )

    @staticmethod
    def _covariance(samples):
        """Return the biased (divide-by-n) covariance matrix of row samples."""
        centered = samples - samples.mean(dim=0)
        # Equals the mean of per-sample outer products, without materialising
        # an (n, emb, emb) intermediate.
        return centered.t() @ centered / samples.size(0)

    def forward(self, x, y, labels=None, mem=None):
        """ Forward lld (gaussian prior) and entropy estimation, partially refers the implementation
        of https://github.com/Linear95/CLUB/blob/master/MI_DA/MNISTModel_DANN.py
            Args:
                x (Tensor): x in above equation, shape (bs, x_size)
                y (Tensor): y in above equation, shape (bs, y_size)
                labels (Tensor, optional): one value per sample; samples with label > 0 are
                    collected as positive, label < 0 as negative (label == 0 goes to neither)
                mem (dict, optional): history with keys 'pos' and 'neg', each a list of
                    tensors that are concatenated with the current projected samples

            Returns:
                (lld, sample_dict, H): the mean summed Gaussian log-likelihood term; a dict with
                the projected 'pos'/'neg' samples of this batch (None when labels is None); and
                the entropy estimate (float 0.0 unless both labels and a mem with a non-None
                'pos' entry are given, in which case it is a tensor).
        """
        mu, logvar = self.mlp_mu(x), self.mlp_logvar(x)  # (bs, y_size)

        positive = -(mu - y) ** 2 / 2. / torch.exp(logvar)
        lld = torch.mean(torch.sum(positive, -1))

        # For Gaussian Distribution Estimation
        H = 0.0
        sample_dict = {'pos': None, 'neg': None}

        if labels is None:
            return lld, sample_dict, H

        # store pos and neg samples
        y = self.entropy_prj(y)
        # Flatten rather than squeeze: squeeze() turns a single-sample batch
        # into a 0-d mask, and indexing with that adds an extra dimension.
        flat_labels = labels.reshape(-1)
        pos_y = y[flat_labels > 0]
        neg_y = y[flat_labels < 0]
        sample_dict['pos'] = pos_y
        sample_dict['neg'] = neg_y

        # estimate entropy from the history plus the current batch
        if mem is not None and mem.get('pos', None) is not None:
            pos_all = torch.cat(mem['pos'] + [pos_y], dim=0)  # n_pos, emb
            neg_all = torch.cat(mem['neg'] + [neg_y], dim=0)
            # compute the entire co-variance matrix
            sigma_pos = self._covariance(pos_all)
            sigma_neg = self._covariance(neg_all)
            H = 0.25 * (torch.logdet(sigma_pos) + torch.logdet(sigma_neg))

        return lld, sample_dict, H


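# Defines the CPC (Contrastive Predictive Coding) class, which computes the contrastive predictive coding score.
# It extracts the intrinsic features and structural information of the data by contrasting positive and negative samples; it is an unsupervised learning technique.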
class CPC(nn.Module):
    """
        Contrastive Predictive Coding: score computation. See https://arxiv.org/pdf/1807.03748.pdf.

        Args:
            x_size (int): embedding size of input modality representation x
            y_size (int): embedding size of input modality representation y
            n_layers (int): number of linear layers in the predictor mapping y to x
            activation (str): name of the torch.nn activation class placed after the
                first linear layer when more than one layer is built
    """

    def __init__(self, x_size, y_size, n_layers=1, activation='Tanh'):
        super().__init__()
        self.x_size, self.y_size = x_size, y_size
        self.layers = n_layers
        self.activation = getattr(nn, activation)

        # Single-layer case: a bare linear map from y-space to x-space.
        if n_layers == 1:
            self.net = nn.Linear(in_features=y_size, out_features=x_size)
            return

        # Otherwise: first layer maps y -> x followed by the activation, and
        # every further layer is a plain x -> x linear map.
        stack = []
        for depth in range(n_layers):
            if depth:
                stack.append(nn.Linear(self.x_size, self.x_size))
            else:
                stack += [nn.Linear(self.y_size, self.x_size), self.activation()]
        self.net = nn.Sequential(*stack)

    def forward(self, x, y):
        """Calculate the NCE loss between x and the prediction of x made from y.

        Args:
            x (Tensor): shape (bs, x_size)
            y (Tensor): shape (bs, y_size)

        Returns:
            Scalar tensor: mean over the batch of (logsumexp of all pair scores
            minus the matching-pair score).
        """
        predicted = self.net(y)    # bs, emb_size

        # Put both the prediction and the target on the unit sphere.
        predicted = predicted.div(torch.norm(predicted, dim=1, keepdim=True))
        target = x.div(torch.norm(x, dim=1, keepdim=True))

        # Matching-pair score: row-wise dot product.
        matched = (target * predicted).sum(dim=-1)    # bs
        # Scores of each x against every prediction in the batch, reduced by logsumexp.
        all_pairs = torch.matmul(target, predicted.t())
        contrast = torch.logsumexp(all_pairs, dim=-1)    # bs

        nce = -(matched - contrast).mean()
        return nce
# Defines the RNNEncoder class, which inherits from nn.Module and implements a recurrent neural network (RNN) encoder that uses LSTM units to process sequence data.
class RNNEncoder(nn.Module):
    """LSTM encoder that maps a padded batch of sequences to one vector per sequence."""

    def __init__(self, in_size, hidden_size, out_size, num_layers=1, dropout=0.2, bidirectional=False):
        '''
        Args:
            in_size: input dimension
            hidden_size: hidden layer dimension
            out_size: output dimension of the final linear layer
            num_layers: specify the number of layers of LSTMs.
            dropout: dropout probability
            bidirectional: specify usage of bidirectional LSTM
        Output:
            (return value in forward) a tensor of shape (batch_size, out_size)
        '''
        super().__init__()
        self.bidirectional = bidirectional
        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is given with a single layer, so pass 0 in that case.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(in_size, hidden_size, num_layers=num_layers, dropout=rnn_dropout, bidirectional=bidirectional, batch_first=False)
        self.dropout = nn.Dropout(dropout)
        # A bidirectional LSTM contributes one final hidden state per direction.
        self.linear_1 = nn.Linear((2 if bidirectional else 1)*hidden_size, out_size)

    def forward(self, x, lengths):
        '''
        Args:
            x: padded batch of shape (sequence_len, batch_size, in_size); the LSTM and
               the packing call both use batch_first=False
            lengths: tensor with the valid length of each sequence in the batch
        Returns:
            tensor of shape (batch_size, out_size)
        '''
        # pack_padded_sequence expects integer lengths on the CPU.
        lengths = lengths.to(torch.int64).cpu()
        packed_sequence = pack_padded_sequence(x, lengths, enforce_sorted=False)
        _, (h_n, _) = self.rnn(packed_sequence)

        # h_n is (num_layers * num_directions, batch, hidden), ordered by layer.
        # Index the top layer explicitly instead of squeeze(): squeeze() drops
        # the batch dimension for a single-sample batch and selects nothing
        # when several layers are stacked.
        if self.bidirectional:
            last_hidden = torch.cat((h_n[-2], h_n[-1]), dim=-1)
        else:
            last_hidden = h_n[-1]

        h = self.dropout(last_hidden)
        y_1 = self.linear_1(h)
        return y_1