attention

最新推荐文章于 2022-06-13 09:03:01 发布
yongquanfengjie
最新推荐文章于 2022-06-13 09:03:01 发布
阅读量123
点赞数
本文链接：https://blog.csdn.net/yongquanfengjie/article/details/106848693
版权
# -- encoding:utf-8 --
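"""
Seq2seq (encoder-decoder) model with attention, written against the TensorFlow 1.x graph API.

Raw data:
    [百, 柳, 报, 春, 兆] ----> [千, 花, 传, 欢, 乐]
For training, the raw data is converted as follows:
Encoder input:
    [百, 柳, 报, 春, 兆]
Decoder input (target shifted right, with a leading GO token):
    [GO, 千, 花, 传, 欢]
Decoder output (ground truth):
    [千, 花, 传, 欢, 乐]

"""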

import tensorflow as tf


def build_interface(encoder_inputs, encoder_vocab_size, decoder_inputs, decoder_vocab_size,
                    embedding_size=128, rnn_num_units=256, is_training=True,
                    project_output_weight=None, project_output_bias=None):
    """
    Build the forward graph of an RNN encoder-decoder with dot-product attention.

    The encoder embeds its inputs and runs them through a BasicRNNCell. The decoder
    starts from the final encoder state and, at every step, feeds the cell with the
    step input concatenated with the current attention vector, then concatenates the
    cell output with the attention vector recomputed from the new state.

    :param encoder_inputs: list of Tensors (word ids), one per encoder time step
    :param encoder_vocab_size: vocabulary size of the encoder (rows of its embedding table)
    :param decoder_inputs: list of Tensors (word ids), one per decoder time step
    :param decoder_vocab_size: vocabulary size of the decoder (rows of its embedding table)
    :param embedding_size: width of the embedding vectors
    :param rnn_num_units: number of units of the encoder and decoder RNN cells
    :param is_training: True to feed the given decoder inputs at every step (teacher
        forcing); False to feed, from the second step on, the embedding of the word
        predicted at the previous step
    :param project_output_weight: weight of the output projection; only used when
        is_training is False, to turn the previous output into a predicted word id
    :param project_output_bias: bias of the output projection; only used when
        is_training is False
    :return: list with one Tensor per decoder time step, each the decoder RNN output
        concatenated with the attention vector ([N, 2 * rnn_num_units]); the output
        projection has NOT been applied
    :raises ValueError: if is_training is False, more than one decoder step is
        requested, and the projection weight or bias is missing
    """

    def attention(query, keys, values, pre_attention_distribution=None, interpolate_amount=0.5):
        """
        Compute a dot-product attention vector from the decoder state and the encoder outputs.

        Requires encoder_unit_size == decoder_unit_size.
        N: batch size, T: encoder sequence length, E: unit size.

        :param query: [N,E]
        :param keys: [N,T,E]
        :param values: [N,T,E]
        :param pre_attention_distribution: attention weights of the previous step, [N,T],
            or None when there is no previous step
        :param interpolate_amount: share of the previous weights in the blended weights;
            clamped to [0, 1]
        :return: (attention_output [N,E], attention_distribution [N,T])
        """
        # 1. Attention score: similarity between the query and every encoder step.
        # Batched matmul: matmul([N,T,E], [N,E,1]) --> [N,T,1], i.e. for every sample
        # matmul([T,E], [E,1]) --> [T,1].
        attention_score = tf.matmul(keys, tf.expand_dims(query, axis=-1))  # [N,T,1]
        attention_score = tf.squeeze(attention_score, axis=-1)  # [N,T,1] --> [N,T]

        # 2. Attention weights.
        attention_distribution = tf.nn.softmax(attention_score)  # [N,T]
        # Blend with the weights of the previous step (similar to a residual connection).
        if pre_attention_distribution is not None:
            interpolate_amount = max(0.0, min(1.0, interpolate_amount))
            attention_distribution = ((1 - interpolate_amount) * attention_distribution
                                      + interpolate_amount * pre_attention_distribution)  # [N,T]
            # TODO: apply CNN and LRN (local response normalization) to attention_distribution

        # 3. Weighted sum over the time axis.
        # [N,T,E] = [N,T,E] * [N,T,1]
        attention_output = values * tf.expand_dims(attention_distribution, axis=-1)  # [N,T,E]
        # The weights already sum to 1 per sample, so the sum is the weighted average;
        # reduce_mean would additionally shrink the vector by a factor of T.
        attention_output = tf.reduce_sum(attention_output, 1)  # [N,T,E] --> [N,E]

        return attention_output, attention_distribution

    with tf.variable_scope("seq2seq"):
        # I. Encoder
        with tf.variable_scope("encoder"):
            # 1. Embed the input word ids: each [N] tensor becomes [N,embedding_size].
            with tf.variable_scope("embedding"):
                encoder_embedding_table = tf.get_variable(name="embedding_table",
                                                          shape=[encoder_vocab_size, embedding_size])
                encoder_embedding_inputs = [
                    tf.nn.embedding_lookup(encoder_embedding_table, encoder_input)
                    for encoder_input in encoder_inputs
                ]

            # 2. Run the RNN over the embedded inputs.
            with tf.variable_scope("rnn"):
                encoder_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=rnn_num_units)
                # encoder_output: list with one [N,rnn_num_units] Tensor per time step
                # encoder_state: state after the last time step, [N,rnn_num_units]
                encoder_output, encoder_state = tf.nn.static_rnn(
                    cell=encoder_cell,
                    inputs=encoder_embedding_inputs,
                    dtype=tf.float32  # used to build the (zero) initial state
                )

                # Merge the per-step encoder outputs into one tensor for the attention:
                # list([N,E], ...) --> list([N,1,E], ...) --> [N,T,E]
                encoder_output = tf.concat([tf.expand_dims(eo, 1) for eo in encoder_output], axis=1)

        # II. Decoder
        with tf.variable_scope("decoder"):
            # 1. Embed the input word ids: each [N] tensor becomes [N,embedding_size].
            with tf.variable_scope("embedding"):
                decoder_embedding_table = tf.get_variable(name="embedding_table",
                                                          shape=[decoder_vocab_size, embedding_size])
                decoder_embedding_inputs = [
                    tf.nn.embedding_lookup(decoder_embedding_table, decoder_input)
                    for decoder_input in decoder_inputs
                ]

            # From the second step on, inference turns the previous output into a word id,
            # which needs the projection parameters; fail early with a clear message.
            if (not is_training and len(decoder_embedding_inputs) > 1
                    and (project_output_weight is None or project_output_bias is None)):
                raise ValueError("project_output_weight and project_output_bias are required "
                                 "when is_training is False")

            # 2. Decode step by step.
            with tf.variable_scope("rnn"):
                decoder_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=rnn_num_units)
                state = encoder_state  # the final encoder state is the initial decoder state
                outputs = []  # decoder output of every time step
                pre_output = None  # output of the previous step, only read at inference

                # Attention vector for the first step; there are no previous weights yet.
                attention_state, pre_distribution = attention(
                    query=state,
                    keys=encoder_output,
                    values=encoder_output,
                    pre_attention_distribution=None
                )

                for idx, _inp in enumerate(decoder_embedding_inputs):
                    if not is_training and idx > 0:
                        # Inference: every step but the first consumes the previous prediction.
                        # a. project the previous output to logits, [N,decoder_vocab_size]
                        logits = tf.nn.xw_plus_b(pre_output, project_output_weight, project_output_bias)
                        # b. id with the highest score, [N]
                        decoder_input = tf.argmax(logits, -1)
                        # c. embed the predicted id
                        _inp = tf.nn.embedding_lookup(decoder_embedding_table, decoder_input)

                    # The cell input is the step input concatenated with the attention vector.
                    _inp = tf.concat([_inp, attention_state], axis=-1)

                    # Advance the RNN by one step.
                    output, state = decoder_cell(_inp, state)

                    # Attention for the new state; it is used twice: concatenated with the
                    # current output, and as part of the next step's input.
                    attention_state, pre_distribution = attention(
                        query=state,
                        keys=encoder_output,
                        values=encoder_output,
                        pre_attention_distribution=pre_distribution
                    )

                    output = tf.concat([output, attention_state], axis=-1)
                    outputs.append(output)

                    # Remembered for the next step's prediction (only read at inference).
                    pre_output = output

    return outputs


def train():
    """
    Build the training graph of the seq2seq attention model and return its loss.

    Creates 5 encoder placeholders ("encoder_0" .. "encoder_4") and 6 decoder
    placeholders ("decoder_0" .. "decoder_5"), builds the network with
    build_interface(is_training=True), projects every decoder output to vocabulary
    logits, prints the list of logits tensors and sums the per-step mean
    cross-entropy. Running the graph (session, optimizer) is left as a TODO.

    :return: loss Tensor, the sum over the decoder time steps of the batch-mean
        sparse softmax cross-entropy
    """
    vocab_size = 10000
    embedding_size = 128
    rnn_num_units = 256

    # 1. Encoder inputs: 5 time steps, each a batch of word ids.
    encoder_inputs = [
        tf.placeholder(dtype=tf.int32, shape=[None], name="encoder_{}".format(idx))
        for idx in range(5)
    ]

    # 2. Decoder inputs and targets: 5 real steps plus the leading GO step.
    decoder_placeholders = [
        tf.placeholder(dtype=tf.int32, shape=[None], name='decoder_{}'.format(idx))
        for idx in range(6)
    ]
    decoder_inputs = decoder_placeholders[:-1]  # GO + the first 4 target words
    decoder_targets = decoder_placeholders[1:]  # the 5 target words

    # 3. Forward network.
    # Each decoder output is the RNN output concatenated with the attention vector,
    # so the projection takes 2 * rnn_num_units features.
    project_output_weight = tf.get_variable("w", shape=[2 * rnn_num_units, vocab_size])
    project_output_bias = tf.get_variable("b", shape=[vocab_size])
    decoder_rnn_outputs = build_interface(
        encoder_inputs=encoder_inputs,
        encoder_vocab_size=vocab_size,
        decoder_inputs=decoder_inputs,
        decoder_vocab_size=vocab_size,
        embedding_size=embedding_size,
        rnn_num_units=rnn_num_units,
        is_training=True,
        project_output_weight=project_output_weight,
        project_output_bias=project_output_bias
    )

    # 4. Project every decoder output to a score per word:
    # [N, 2 * rnn_num_units] ---> [N, vocab_size]
    project_logits = []
    for decoder_rnn_output in decoder_rnn_outputs:
        logits = tf.nn.xw_plus_b(decoder_rnn_output, project_output_weight, project_output_bias)
        project_logits.append(logits)
    print("最终形状:\n{}".format(project_logits))

    # 5. Loss: sum over the time steps of the batch-mean cross-entropy.
    loss = 0
    for logits, targets in zip(project_logits, decoder_targets):
        _loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, logits=logits)
        loss = loss + tf.reduce_mean(_loss)

    # TODO: the optimizer and the training loop are left for the reader to implement.
    #
    # How to feed one training pair, e.g.
    #     [百, 柳, 报, 春, 兆] ----> [千, 花, 传, 欢, 乐]
    # 0. Assume the encoder and the decoder share one vocabulary:
    #        PAD --> 0, GO --> 1, 百 --> 2, 柳 --> 3, 报 --> 4, 春 --> 5, 兆 --> 6,
    #        千 --> 7, 花 --> 8, 传 --> 9, 欢 --> 10, 乐 --> 11, ...
    # 1. Convert the words to ids:
    #        [2,3,4,5,6] ----> [7,8,9,10,11]
    # 2. Prepend GO on the decoder side (the encoder takes 5 ids, the decoder
    #    placeholders take 6):
    #        [2,3,4,5,6] ----> [1,7,8,9,10,11]
    # 3. A batch of N samples is processed the same way, giving
    #        X:[N,5]  Y:[N,6]
    #    which are transposed so that the first axis is the time step:
    #        X:[5,N]  Y:[6,N]
    # 4. Build the feed_dict:
    #        feed_dict = {}
    #        for encoder_input, x in zip(encoder_inputs, X):
    #            feed_dict[encoder_input] = x
    #        for decoder_placeholder, y in zip(decoder_placeholders, Y):
    #            feed_dict[decoder_placeholder] = y
    return loss


def prediction():
    """
    Build the inference graph of the seq2seq attention model and return its predictions.

    Creates 5 encoder placeholders ("encoder_0" .. "encoder_4") and 6 decoder
    placeholders ("decoder_0" .. "decoder_5"), builds the network with
    build_interface(is_training=False), projects every decoder output to vocabulary
    logits, takes the argmax per step and prints the list of prediction tensors.
    Running the graph (session, restoring weights) is left as a TODO.

    :return: list with one Tensor of predicted word ids per decoder time step
    """
    vocab_size = 10000
    embedding_size = 128
    rnn_num_units = 256

    # 1. Encoder inputs: 5 time steps, each a batch of word ids.
    encoder_inputs = [
        tf.placeholder(dtype=tf.int32, shape=[None], name="encoder_{}".format(idx))
        for idx in range(5)
    ]

    # 2. Decoder placeholders, laid out exactly as in train(). With is_training=False
    # build_interface feeds its own previous prediction from the second step on, so
    # only decoder_placeholders[0] (the GO id) is expected to need a value.
    decoder_placeholders = [
        tf.placeholder(dtype=tf.int32, shape=[None], name='decoder_{}'.format(idx))
        for idx in range(6)
    ]
    decoder_inputs = decoder_placeholders[:-1]  # one entry per decoding step (5 steps)

    # 3. Forward network.
    # Each decoder output is the RNN output concatenated with the attention vector,
    # so the projection takes 2 * rnn_num_units features.
    project_output_weight = tf.get_variable("w", shape=[2 * rnn_num_units, vocab_size])
    project_output_bias = tf.get_variable("b", shape=[vocab_size])
    decoder_rnn_outputs = build_interface(
        encoder_inputs=encoder_inputs,
        encoder_vocab_size=vocab_size,
        decoder_inputs=decoder_inputs,
        decoder_vocab_size=vocab_size,
        embedding_size=embedding_size,
        rnn_num_units=rnn_num_units,
        is_training=False,
        project_output_weight=project_output_weight,
        project_output_bias=project_output_bias
    )

    # 4. Project every decoder output to a score per word and keep the best id:
    # [N, 2 * rnn_num_units] ---> [N, vocab_size] ---> [N]
    project_predictions = []
    for decoder_rnn_output in decoder_rnn_outputs:
        logits = tf.nn.xw_plus_b(decoder_rnn_output, project_output_weight, project_output_bias)
        project_predictions.append(tf.argmax(logits, -1))

    print("最终形状:\n{}".format(project_predictions))

    # TODO: running the inference is left for the reader to implement.
    #
    # How to feed one input, e.g.
    #     [百, 柳, 报, 春, 兆] ----> []
    # 0. Assume the encoder and the decoder share one vocabulary:
    #        PAD --> 0, GO --> 1, 百 --> 2, 柳 --> 3, 报 --> 4, 春 --> 5, 兆 --> 6,
    #        千 --> 7, 花 --> 8, 传 --> 9, 欢 --> 10, 乐 --> 11, ...
    # 1. Convert the words to ids:
    #        [2,3,4,5,6] ----> []
    # 2. The decoder side only gets the GO token:
    #        [2,3,4,5,6] ----> [1]
    # 3. A batch of N samples is processed the same way, giving
    #        X:[N,5]  Y:[N,1]
    #    which are transposed so that the first axis is the time step:
    #        X:[5,N]  Y:[1,N]
    # 4. Build the feed_dict:
    #        feed_dict = {}
    #        for encoder_input, x in zip(encoder_inputs, X):
    #            feed_dict[encoder_input] = x
    #        # only the first decoder step has to be provided when decoding
    #        feed_dict[decoder_placeholders[0]] = Y[0]
    return project_predictions


# Script entry point: running the file directly calls train(), which only builds
# the training graph (no session is run).
if __name__ == '__main__':
    train()
yongquanfengjie
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
attention

# -- encoding:utf-8 --"""原始数据:[百, 柳, 报, 春, 兆] ----> [千，花，传，欢，乐]模型训练的时候,将原始数据进行转换:编码器输入: [百, 柳, 报, 春, 兆]解码器输入: [GO, 千，花，传，欢]解码器输出(实际值): [千，花，传，欢，乐]"""import tensorflow as tfdef build_interface(encoder_inputs, encoder_vocab_size,
复制链接

扫一扫