MindSpore check-in, day 11: understanding the padding code in the Transformer

Modify the training data and walk through how the padding is computed.
#!tar -xzvf *.gz
## train.en
Several women wait outside in a city.
A old man having a beer alone.
Several women wait outside in a city.
## train.de
Mehrere Frauen warten in einer Stadt im Freien.
Ein alter Mann, der allein ein Bier trinkt.
Mehrere Frauen warten in einer Stadt im Freien.
#val.en
A person on a snowmobile in mid jump.
A woman sits at a dark bar.
A person on a snowmobile in mid jump.
#val.de
Eine Person auf einem Schneemobil mitten im Sprung.
Eine Frau sitzt an einer dunklen Bar.
Eine Person auf einem Schneemobil mitten im Sprung.
#test.en
A man in an orange hat starring at something.
People are fixing the roof of a house.
A man in an orange hat starring at something.
#test.de
Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
Leute Reparieren das Dach eines Hauses.
Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
train_path ="/home/nginx/work/data/train"
valid_path ="/home/nginx/work/data/valid"
test_path ="/home/nginx/work/data/test"

##tar -xzvf *.gz
import re
import os
class Multi30K():
    """Multi30K数据集加载器
    
    加载Multi30K数据集并处理为一个Python迭代对象。
    
    """
    def __init__(self, path):
        self.data = self._load(path)
        
    def _load(self, path):
        def tokenize(text):
            # Tokenize the sentence and lowercase every token
            text = text.rstrip()
            return [tok.lower() for tok in re.findall(r'\w+|[^\w\s]', text)]
        
        # Read the Multi30K files and tokenize each line
        members = {i.split('.')[-1]: i for i in os.listdir(path)}
        de_path = os.path.join(path, members['de1'])
        en_path = os.path.join(path, members['en1'])
        with open(de_path, 'r', encoding='utf-8') as de_file:
            de = de_file.readlines()[:-1]
            de = [tokenize(i) for i in de]
        with open(en_path, 'r', encoding='utf-8') as en_file:
            en = en_file.readlines()[:-1]
            en = [tokenize(i) for i in en]

        return list(zip(de, en))
        
    def __getitem__(self, idx):
        return self.data[idx]
    
    def __len__(self):
        return len(self.data)
train_dataset = Multi30K(train_path)
len(train_dataset)  #### readlines()[:-1] in _load drops the last line, so only 2 of the 3 sentences remain
2
valid_dataset=  Multi30K(valid_path)
len(valid_dataset)
2
test_dataset = Multi30K(test_path)  ##### alternatively rename the files to test.en1 / test.de1 so the `members` lookup finds them
len(test_dataset)
2
train_dataset, valid_dataset, test_dataset = Multi30K(train_path), Multi30K(valid_path), Multi30K(test_path)
train_dataset
<__main__.Multi30K at 0xfffe106da1c0>
for de, en in test_dataset:
    print(f'de = {de}')
    print(f'en = {en}')
    break
de = ['ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.']
en = ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']
class Vocab:
    """通过词频字典,构建词典"""

    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']

    def __init__(self, word_count_dict, min_freq=1):
        self.word2idx = {}
        for idx, tok in enumerate(self.special_tokens):
            self.word2idx[tok] = idx

        # Filter out low-frequency tokens and assign a numeric index to each remaining token
        filted_dict = {
            w: c
            for w, c in word_count_dict.items() if c >= min_freq
        }
        for w, _ in filted_dict.items():
            self.word2idx[w] = len(self.word2idx)

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        self.bos_idx = self.word2idx['<bos>']  # special token: beginning of sequence
        self.eos_idx = self.word2idx['<eos>']  # special token: end of sequence
        self.pad_idx = self.word2idx['<pad>']  # special token: padding
        self.unk_idx = self.word2idx['<unk>']  # special token: low-frequency or unseen tokens

    def _word2idx(self, word):
        """单词映射至数字索引"""
        if word not in self.word2idx:
            return self.unk_idx
        return self.word2idx[word]

    def _idx2word(self, idx):
        """数字索引映射至单词"""
        if idx not in self.idx2word:
            raise ValueError('input index is not in vocabulary.')
        return self.idx2word[idx]

    def encode(self, word_or_list):
        """将单个单词或单词数组映射至单个数字索引或数字索引数组"""
        if isinstance(word_or_list, list):
            return [self._word2idx(i) for i in word_or_list]
        return self._word2idx(word_or_list)

    def decode(self, idx_or_list):
        """将单个数字索引或数字索引数组映射至单个单词或单词数组"""
        if isinstance(idx_or_list, list):
            return [self._idx2word(i) for i in idx_or_list]
        return self._idx2word(idx_or_list)

    def __len__(self):
        return len(self.word2idx)
from collections import Counter, OrderedDict

def build_vocab(dataset):
    de_words, en_words = [], []
    for de, en in dataset:
        de_words.extend(de)
        en_words.extend(en)

    de_count_dict = OrderedDict(sorted(Counter(de_words).items(), key=lambda t: t[1], reverse=True))
    en_count_dict = OrderedDict(sorted(Counter(en_words).items(), key=lambda t: t[1], reverse=True))

    return Vocab(de_count_dict, min_freq=1), Vocab(en_count_dict, min_freq=1)
de_vocab, en_vocab = build_vocab(train_dataset)   #### the vocabularies are built from the training set only
print('Unique tokens in de vocabulary:', len(de_vocab))
Unique tokens in de vocabulary: 21
de_vocab
<__main__.Vocab at 0xfffe18995c70>
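A quick sanity check of the vocabulary (a sketch; the exact indices depend on how this tiny training set is sorted by frequency):
# encode/decode round trip on tokens that occur in train.de; unseen words map to <unk>
tokens = ['ein', 'alter', 'mann', ',', 'der', 'allein', 'ein', 'bier', 'trinkt', '.']
ids = de_vocab.encode(tokens)        # tokens -> indices
print(ids)
print(de_vocab.decode(ids))          # indices -> tokens
print(de_vocab.encode('zzz'))        # word not in the vocabulary -> unk_idx (0)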
import mindspore

class Iterator():
    """创建数据迭代器"""
    def __init__(self, dataset, de_vocab, en_vocab, batch_size, max_len=32, drop_reminder=False):
        self.dataset = dataset
        self.de_vocab = de_vocab
        self.en_vocab = en_vocab

        self.batch_size = batch_size
        self.max_len = max_len
        self.drop_reminder = drop_reminder

        length = len(self.dataset) // batch_size
        # number of batches; only count a partial batch when the dataset size is not divisible by batch_size
        self.len = length if drop_reminder else length + (1 if len(self.dataset) % batch_size else 0)

    def __call__(self):
        def pad(idx_list, vocab, max_len):
            """统一序列长度,并记录有效长度"""
            idx_pad_list, idx_len = [], []
            # If the sequence is longer than max_len - 2 the excess is dropped; if shorter, it is filled with <pad>
            for i in idx_list:
                if len(i) > max_len - 2:
                    idx_pad_list.append(
                        [vocab.bos_idx] + i[:max_len-2] + [vocab.eos_idx]
                    )
                    idx_len.append(max_len)
                else:
                    idx_pad_list.append(
                        [vocab.bos_idx] + i + [vocab.eos_idx] + [vocab.pad_idx] * (max_len - len(i) - 2)
                    )
                    idx_len.append(len(i) + 2)
            return idx_pad_list, idx_len

        def sort_by_length(src, trg):
            """对德/英语的字段长度进行排序"""
            data = zip(src, trg)
            data = sorted(data, key=lambda t: len(t[0]), reverse=True)
            return zip(*list(data))

        def encode_and_pad(batch_data, max_len):
            """将批量中的文本数据转换为数字索引,并统一每个序列的长度"""
            # 将当前批量数据中的词元转化为索引
            src_data, trg_data = zip(*batch_data)
            src_idx = [self.de_vocab.encode(i) for i in src_data]
            trg_idx = [self.en_vocab.encode(i) for i in trg_data]

            # unify the sequence lengths
            src_idx, trg_idx = sort_by_length(src_idx, trg_idx)
            src_idx_pad, src_len = pad(src_idx, self.de_vocab, max_len)
            trg_idx_pad, _ = pad(trg_idx, self.en_vocab, max_len)

            return src_idx_pad, src_len, trg_idx_pad

        for i in range(self.len):
            # slice out the current batch
            if i == self.len - 1 and not self.drop_reminder:
                batch_data = self.dataset[i * self.batch_size:]
            else:
                batch_data = self.dataset[i * self.batch_size: (i+1) * self.batch_size]

            src_idx, src_len, trg_idx = encode_and_pad(batch_data, self.max_len)
            # convert the index lists to tensors
            yield mindspore.Tensor(src_idx, mindspore.int32), \
                mindspore.Tensor(src_len, mindspore.int32), \
                mindspore.Tensor(trg_idx, mindspore.int32)

    def __len__(self):
        return self.len
len(train_dataset) // 1
2
len(train_dataset)
2
train_iterator = Iterator(train_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_reminder=True)  ### max_len is the maximum length of a single sequence (including <bos>/<eos>), not the number of sentences
valid_iterator = Iterator(valid_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_reminder=False)
test_iterator = Iterator(test_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_reminder=False)
train_iterator
<__main__.Iterator at 0xffff98030b20>
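Pulling one batch out of the iterator (a sketch): with batch_size=1 and max_len=20 the index tensors should be (1, 20), and src_len holds the valid length including <bos>/<eos>.
for src_idx, src_len, trg_idx in train_iterator():
    print(src_idx.shape, trg_idx.shape)   # expected: (1, 20) (1, 20)
    print(src_len)                        # valid source length, e.g. [11]
    break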
len(de_vocab)
21
len(en_vocab)
17
de_vocab.pad_idx
1
en_vocab.pad_idx  ## every token gets a numeric index; pad_idx is the value assigned to <pad>
1
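To see exactly what pad() does, the same computation can be replayed by hand on the first training pair (a sketch, using max_len=20 as above); the result should match the enc_inputs tensor built further below.
max_len = 20
de_tokens, _ = train_dataset[0]
src_idx = de_vocab.encode(de_tokens)                 # tokens -> indices (9 tokens here)
# shorter than max_len - 2: <bos> + sequence + <eos> + as many <pad> as needed
padded = [de_vocab.bos_idx] + src_idx + [de_vocab.eos_idx] \
         + [de_vocab.pad_idx] * (max_len - len(src_idx) - 2)
valid_len = len(src_idx) + 2                         # valid length = tokens + <bos>/<eos>
print(padded)       # e.g. [2, 6, 7, ..., 3, 1, 1, ..., 1] -- the trailing 1s are <pad>
print(valid_len)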

import mindspore
from mindspore import nn
from mindspore import ops
from mindspore import Tensor
from mindspore import dtype as mstype


class ScaledDotProductAttention(nn.Cell):
    def __init__(self, dropout_p=0.):
        super().__init__()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1-dropout_p)
        self.sqrt = ops.Sqrt()


    def construct(self, query, key, value, attn_mask=None):
        """scaled dot product attention"""
        # compute the scaling factor
        embed_size = query.shape[-1]
        scaling_factor = self.sqrt(Tensor(embed_size, mstype.float32))
        
        # attention scores
        # dot product of query and key, scaled by the scaling factor
        attn = ops.matmul(query, key.swapaxes(-2, -1) / scaling_factor)

        # attention mask
        if attn_mask is not None:
            attn = attn.masked_fill(attn_mask, -1e9)
        
        # softmax keeps the attention weights in [0, 1]
        attn = self.softmax(attn)

        # dropout
        attn = self.dropout(attn)

        # weight the values with the attention
        output = ops.matmul(attn, value)  ## (QK^T) V

        return (output, attn)
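The effect of the mask inside the attention can be seen on a toy score vector (a sketch): masked positions are filled with -1e9, so after softmax their weight is effectively 0.
scores = Tensor([[1.0, 2.0, 3.0]], mstype.float32)
mask = Tensor([[False, False, True]], mstype.bool_)
print(nn.Softmax()(scores.masked_fill(mask, -1e9)))   # the last weight is ~0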

def get_attn_pad_mask(seq_q, seq_k, pad_idx):
    """注意力掩码:识别序列中的<pad>占位符

    Args:
        seq_q (Tensor): query序列,shape = [batch size, query len]
        seq_k (Tensor): key序列,shape = [batch size, key len]
        pad_idx (Tensor): key序列<pad>占位符对应的数字索引
    """
    batch_size, len_q = seq_q.shape
    batch_size, len_k = seq_k.shape

    # positions whose element equals the <pad> index become True in the mask
    # pad_attn_mask: [batch_size, key_len]
    pad_attn_mask = ops.equal(seq_k, pad_idx)

    # add an extra dimension
    # pad_attn_mask: [batch_size, 1, key_len]
    pad_attn_mask = pad_attn_mask.expand_dims(1)
    # broadcast the mask to [batch_size, query_len, key_len]
    pad_attn_mask = ops.broadcast_to(pad_attn_mask, (batch_size, len_q, len_k))

    return pad_attn_mask
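A toy example of the padding mask (a sketch): with pad_idx=1 the last two positions of the key are <pad>, so the last two columns of every row are True.
toy = Tensor([[5, 6, 1, 1]], mstype.int32)
toy_mask = get_attn_pad_mask(toy, toy, 1)
print(toy_mask.shape)   # (1, 4, 4)
print(toy_mask[0])      # columns 2 and 3 are True in every row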
class MultiHeadAttention(nn.Cell):
    def __init__(self, d_model, d_k, n_heads, dropout_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_k
        self.W_Q = nn.Dense(d_model, d_k * n_heads)
        self.W_K = nn.Dense(d_model, d_k * n_heads)
        self.W_V = nn.Dense(d_model, d_k * n_heads)
        self.W_O = nn.Dense(n_heads * d_k, d_model)
        self.attention = ScaledDotProductAttention(dropout_p=dropout_p)

    def construct(self, query, key, value, attn_mask):
        """
        query: [batch_size, len_q, d_model]
        key: [batch_size, len_k, d_model]
        value: [batch_size, len_k, d_model]
        attn_mask: [batch_size, seq_len, seq_len]
        """

        batch_size = query.shape[0]

        # project query, key and value and split them into separate heads
        # q_s: [batch_size, len_q, n_heads, d_k]
        # k_s: [batch_size, len_k, n_heads, d_k]
        # v_s: [batch_size, len_k, n_heads, d_k]
        q_s = self.W_Q(query).view(batch_size, -1, self.n_heads, self.d_k)
        k_s = self.W_K(key).view(batch_size, -1, self.n_heads, self.d_k)
        v_s = self.W_V(value).view(batch_size, -1, self.n_heads, self.d_k)

        # transpose so that the head dimension comes before the sequence dimension
        # q_s: [batch_size, n_heads, len_q, d_k]
        # k_s: [batch_size, n_heads, len_k, d_k]
        # v_s: [batch_size, n_heads, len_k, d_k]
        q_s = q_s.transpose((0, 2, 1, 3))  ### becomes [batch_size, n_heads, len_q, d_k]
        k_s = k_s.transpose((0, 2, 1, 3))
        v_s = v_s.transpose((0, 2, 1, 3))

        # attn_mask must match the dimensions of q_s, k_s, v_s
        # attn_mask: [batch_size, n_heads, seq_len, seq_len]
        attn_mask = attn_mask.expand_dims(1)
        attn_mask = ops.tile(attn_mask, (1, self.n_heads, 1, 1))

        # attention scores for every head
        # context: [batch_size, n_heads, len_q, d_k]
        # attn: [batch_size, n_heads, len_q, len_k]
        context, attn = self.attention(q_s, k_s, v_s, attn_mask)

        # concatenate
        # context: [batch_size, len_q, n_heads * d_k]
        context = context.transpose((0, 2, 1, 3)).view((batch_size, -1, self.n_heads * self.d_k))

        # multiply by W_O
        # output: [batch_size, len_q, n_heads * d_k]
        output = self.W_O(context)

        return output, attn
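A quick shape check of the multi-head attention (a sketch with small toy dimensions, not the model's actual configuration):
seq = Tensor([[5, 6, 7, 1]], mstype.int32)            # last position is <pad> (index 1)
pad_mask = get_attn_pad_mask(seq, seq, 1)             # (1, 4, 4)
mha = MultiHeadAttention(d_model=16, d_k=8, n_heads=2, dropout_p=0.)
x = ops.ones((1, 4, 16), mindspore.float32)
out, attn = mha(x, x, x, pad_mask)
print(out.shape, attn.shape)   # (1, 4, 16) (1, 2, 4, 4)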
from mindspore import numpy as mnp

class PositionalEncoding(nn.Cell):
    """位置编码"""

    def __init__(self, d_model, dropout_p=0.1, max_len=100):
        super().__init__()
        self.dropout = nn.Dropout(1 - dropout_p)

        # position table
        # pe: [max_len, d_model]
        self.pe = ops.Zeros()((max_len, d_model), mstype.float32)

        # pos: [max_len, 1]
        # angle: [d_model/2, ]
        # pos/angle: [max len, d_model/2]
        pos = mnp.arange(0, max_len, dtype=mstype.float32).view((-1, 1))
        angle = ops.pow(10000.0, mnp.arange(0, d_model, 2, dtype=mstype.float32)/d_model)
        
        # pe: [max len, d_model]
        self.pe[:, 0::2] = ops.sin(pos/angle)
        self.pe[:, 1::2] = ops.cos(pos/angle)

    def construct(self, x):
        batch_size = x.shape[0]

        # broadcast
        # pe: [batch_size, max_len, d_model]
        pe = self.pe.expand_dims(0)
        pe = ops.broadcast_to(pe, (batch_size, -1, -1))

        # truncate the positional encoding to the length of x
        # x: [batch_size, seq_len, d_model]
        x = x + pe[:, :x.shape[1], :]
        return self.dropout(x)
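A small shape check of the positional encoding (a sketch with toy sizes): adding the encoding does not change the shape of the input.
pe_demo = PositionalEncoding(d_model=16, dropout_p=0., max_len=50)
x = ops.ones((2, 10, 16), mindspore.float32)
print(pe_demo(x).shape)   # (2, 10, 16)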
class PoswiseFeedForward(nn.Cell):
    def __init__(self, d_ff, d_model, dropout_p=0.):
        super().__init__()
        self.linear1 = nn.Dense(d_model, d_ff)
        self.linear2 = nn.Dense(d_ff, d_model)
        self.dropout = nn.Dropout(1-dropout_p)
        self.relu = nn.ReLU()

    def construct(self, x):
        """前馈神经网络
        x: [batch_size, seq_len, d_model]
        """
        # x: [batch_size, seq_len, d_ff]
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        # x: [batch_size, seq_len, d_model]
        output = self.linear2(x)
        return output
class AddNorm(nn.Cell):
    def __init__(self, d_model, dropout_p=0.):
        super().__init__()
        self.layer_norm = nn.LayerNorm((d_model, ), epsilon=1e-5)
        self.dropout = nn.Dropout(1-dropout_p)
    
    def construct(self, x, residual):
        return self.layer_norm(self.dropout(x) + residual)
class EncoderLayer(nn.Cell):
    def __init__(self, d_model, n_heads, d_ff, dropout_p=0.):
        super().__init__()
        d_k = d_model // n_heads
        if d_k * n_heads != d_model:
            raise ValueError(f"The `d_model` {d_model} can not be divisible by `num_heads` {n_heads}.")
        self.enc_self_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.pos_ffn = PoswiseFeedForward(d_ff, d_model, dropout_p)
        self.add_norm1 = AddNorm(d_model, dropout_p)
        self.add_norm2 = AddNorm(d_model, dropout_p)
        
    def construct(self, enc_inputs, enc_self_attn_mask):
        """
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]
        """
        residual = enc_inputs

        # multi-head attention
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)

        # add & norm
        enc_outputs = self.add_norm1(enc_outputs, residual)
        residual = enc_outputs

        # feed-forward
        enc_outputs = self.pos_ffn(enc_outputs)

        # add & norm
        enc_outputs = self.add_norm2(enc_outputs, residual)

        return enc_outputs, attn
class Encoder(nn.Cell):
    def __init__(self, src_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, dropout_p)
        self.layers = nn.CellList([EncoderLayer(d_model, n_heads, d_ff, dropout_p) for _ in range(n_layers)])
        self.scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))

        
    def construct(self, enc_inputs, src_pad_idx):
        """enc_inputs : [batch_size, src_len]
        """
        # embed the inputs and add positional information
        # enc_outputs: [batch_size, src_len, d_model]
        enc_outputs = self.src_emb(enc_inputs.astype(mstype.int32))
        enc_outputs = self.pos_emb(enc_outputs * self.scaling_factor)

        # padding mask for the inputs
        # enc_self_attn_mask: [batch_size, src_len, src_len]
        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs, src_pad_idx)

        # stack the encoder layers
        # enc_outputs: [batch_size, src_len, d_model]
        # enc_self_attns: [batch_size, n_heads, src_len, src_len]
        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns
def get_attn_subsequent_mask(seq_q, seq_k):
    """生成时间掩码,使decoder在第t时刻只能看到序列的前t-1个元素
    
    Args:
        seq_q (Tensor): query序列,shape = [batch size, len_q]
        seq_k (Tensor): key序列,shape = [batch size, len_k]
    """
    batch_size, len_q = seq_q.shape
    batch_size, len_k = seq_k.shape
    # upper-triangular matrix: 1 above the diagonal marks the future positions to be masked
    # subsequent_mask: [batch size, len_q, len_k]
    ones = ops.ones((batch_size, len_q, len_k), mindspore.float32)
    subsequent_mask = mnp.triu(ones, k=1)
    return subsequent_mask
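A toy version of the causal mask (a sketch): for a length-4 sequence the result is a 4x4 matrix with 1 above the diagonal, i.e. position t may only see positions 0..t.
toy = Tensor([[2, 5, 6, 3]], mstype.int32)
print(get_attn_subsequent_mask(toy, toy)[0])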
class DecoderLayer(nn.Cell):
    def __init__(self, d_model, n_heads, d_ff, dropout_p=0.):
        super().__init__()
        d_k = d_model // n_heads
        if d_k * n_heads != d_model:
            raise ValueError(f"The `d_model` {d_model} can not be divisible by `num_heads` {n_heads}.")
        self.dec_self_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.dec_enc_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.pos_ffn = PoswiseFeedForward(d_ff, d_model, dropout_p)
        self.add_norm1 = AddNorm(d_model, dropout_p)
        self.add_norm2 = AddNorm(d_model, dropout_p)
        self.add_norm3 = AddNorm(d_model, dropout_p)
        
    def construct(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """
        dec_inputs: [batch_size, trg_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, trg_len, trg_len]
        dec_enc_attn_mask: [batch_size, trg_len, src_len]
        """
        residual = dec_inputs
    
        # decoder multi-head attention
        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)

        # add & norm
        dec_outputs = self.add_norm1(dec_outputs, residual)
        residual = dec_outputs

        # encoder-decoder multi-head attention        
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)  ## dec_outputs is the query; enc_outputs supply key and value

        # add & norm
        dec_outputs = self.add_norm2(dec_outputs, residual)
        residual = dec_outputs

        # feed-forward
        dec_outputs = self.pos_ffn(dec_outputs)

        # add & norm
        dec_outputs = self.add_norm3(dec_outputs, residual)

        return dec_outputs, dec_self_attn, dec_enc_attn
class Decoder(nn.Cell):
    def __init__(self, trg_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.):
        super().__init__()
        self.trg_emb = nn.Embedding(trg_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, dropout_p)
        self.layers = nn.CellList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
        self.projection = nn.Dense(d_model, trg_vocab_size)
        self.scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))      
        
    def construct(self, dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx):
        """
        dec_inputs: [batch_size, trg_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]
        """
        # embed the inputs and add positional information
        # dec_outputs: [batch_size, trg_len, d_model]
        dec_outputs = self.trg_emb(dec_inputs.astype(mstype.int32))
        dec_outputs = self.pos_emb(dec_outputs * self.scaling_factor)

        # self-attention mask for the decoder
        # dec_self_attn_mask: [batch_size, trg_len, trg_len]
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, trg_pad_idx)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs, dec_inputs)
        dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        # padding mask for the encoder-decoder attention
        # dec_enc_attn_mask: [batch_size, trg_len, src_len]
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, src_pad_idx)

        # stack the decoder layers
        # dec_outputs: [batch_size, trg_len, d_model]
        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)

        # final linear projection
        # dec_outputs: [batch_size, trg_len, trg_vocab_size]
        dec_outputs = self.projection(dec_outputs)
        return dec_outputs, dec_self_attns, dec_enc_attns
class Transformer(nn.Cell):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def construct(self, enc_inputs, dec_inputs, src_pad_idx, trg_pad_idx):
        """
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, trg_len]
        """
        # encoder: produces the representation of the source sequence
        # enc_ouputs: [batch_size, src_len, d_model]
        enc_outputs, enc_self_attns = self.encoder(enc_inputs, src_pad_idx)

        # decoder
        # de_outputs: [batch_size, trg_len, trg_vocab_size]
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx)

        # decoder logits
        # dec_logits: [batch_size * trg_len, trg_vocab_size]
        dec_logits = dec_outputs.view((-1, dec_outputs.shape[-1]))
        print("shape:",dec_logits.shape)  ###输出的其中一个是输出的目标最大长度trg_vocab_size

        return dec_logits, enc_self_attns, dec_self_attns, dec_enc_attns
        

Breaking the code apart

# vocabulary
src_vocab_size = len(de_vocab)
trg_vocab_size = len(en_vocab)
src_pad_idx = de_vocab.pad_idx
trg_pad_idx = en_vocab.pad_idx

# hyper-parameters
d_model = 512
d_ff = 2048
n_layers = 1
n_heads = 2

# instantiate the model
encoder = Encoder(src_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.1)
decoder = Decoder(trg_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.1)
model = Transformer(encoder, decoder)   ### the final projection maps d_model=512 to trg_vocab_size=17
src_vocab_size,trg_vocab_size,src_pad_idx,trg_pad_idx
(21, 17, 1, 1)
encoder 





Encoder<
  (src_emb): Embedding<vocab_size=21, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
  (pos_emb): PositionalEncoding<
    (dropout): Dropout<keep_prob=0.9>
    >
  (layers): CellList<
    (0): EncoderLayer<
      (enc_self_attn): MultiHeadAttention<
        (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
        (attention): ScaledDotProductAttention<
          (softmax): Softmax<>
          (dropout): Dropout<keep_prob=0.9>
          >
        >
      (pos_ffn): PoswiseFeedForward<
        (linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
        (linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
        (dropout): Dropout<keep_prob=0.9>
        (relu): ReLU<>
        >
      (add_norm1): AddNorm<
        (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
        (dropout): Dropout<keep_prob=0.9>
        >
      (add_norm2): AddNorm<
        (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
        (dropout): Dropout<keep_prob=0.9>
        >
      >
    >
  >
decoder





Decoder<
  (trg_emb): Embedding<vocab_size=17, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
  (pos_emb): PositionalEncoding<
    (dropout): Dropout<keep_prob=0.9>
    >
  (layers): CellList<
    (0): DecoderLayer<
      (dec_self_attn): MultiHeadAttention<
        (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
        (attention): ScaledDotProductAttention<
          (softmax): Softmax<>
          (dropout): Dropout<keep_prob=1.0>
          >
        >
      (dec_enc_attn): MultiHeadAttention<
        (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
        (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
        (attention): ScaledDotProductAttention<
          (softmax): Softmax<>
          (dropout): Dropout<keep_prob=1.0>
          >
        >
      (pos_ffn): PoswiseFeedForward<
        (linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
        (linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
        (dropout): Dropout<keep_prob=1.0>
        (relu): ReLU<>
        >
      (add_norm1): AddNorm<
        (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
        (dropout): Dropout<keep_prob=1.0>
        >
      (add_norm2): AddNorm<
        (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
        (dropout): Dropout<keep_prob=1.0>
        >
      (add_norm3): AddNorm<
        (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
        (dropout): Dropout<keep_prob=1.0>
        >
      >
    >
  (projection): Dense<input_channels=512, output_channels=17, has_bias=True>
  >
model 





Transformer<
  (encoder): Encoder<
    (src_emb): Embedding<vocab_size=21, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
    (pos_emb): PositionalEncoding<
      (dropout): Dropout<keep_prob=0.9>
      >
    (layers): CellList<
      (0): EncoderLayer<
        (enc_self_attn): MultiHeadAttention<
          (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
          (attention): ScaledDotProductAttention<
            (softmax): Softmax<>
            (dropout): Dropout<keep_prob=0.9>
            >
          >
        (pos_ffn): PoswiseFeedForward<
          (linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
          (linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
          (dropout): Dropout<keep_prob=0.9>
          (relu): ReLU<>
          >
        (add_norm1): AddNorm<
          (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
          (dropout): Dropout<keep_prob=0.9>
          >
        (add_norm2): AddNorm<
          (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
          (dropout): Dropout<keep_prob=0.9>
          >
        >
      >
    >
  (decoder): Decoder<
    (trg_emb): Embedding<vocab_size=17, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
    (pos_emb): PositionalEncoding<
      (dropout): Dropout<keep_prob=0.9>
      >
    (layers): CellList<
      (0): DecoderLayer<
        (dec_self_attn): MultiHeadAttention<
          (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
          (attention): ScaledDotProductAttention<
            (softmax): Softmax<>
            (dropout): Dropout<keep_prob=1.0>
            >
          >
        (dec_enc_attn): MultiHeadAttention<
          (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
          (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
          (attention): ScaledDotProductAttention<
            (softmax): Softmax<>
            (dropout): Dropout<keep_prob=1.0>
            >
          >
        (pos_ffn): PoswiseFeedForward<
          (linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
          (linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
          (dropout): Dropout<keep_prob=1.0>
          (relu): ReLU<>
          >
        (add_norm1): AddNorm<
          (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
          (dropout): Dropout<keep_prob=1.0>
          >
        (add_norm2): AddNorm<
          (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
          (dropout): Dropout<keep_prob=1.0>
          >
        (add_norm3): AddNorm<
          (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gammaParameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
          (dropout): Dropout<keep_prob=1.0>
          >
        >
      >
    (projection): Dense<input_channels=512, output_channels=17, has_bias=True>
    >
  >
trg_pad_idx
1
model.trainable_params()
[Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.enc_self_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.pos_ffn.linear1.weight, shape=(2048, 512), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.pos_ffn.linear1.bias, shape=(2048,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.pos_ffn.linear2.weight, shape=(512, 2048), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.pos_ffn.linear2.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_self_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.dec_enc_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.pos_ffn.linear1.weight, shape=(2048, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.pos_ffn.linear1.bias, shape=(2048,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.pos_ffn.linear2.weight, shape=(512, 2048), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.pos_ffn.linear2.bias, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.projection.weight, shape=(17, 512), dtype=Float32, requires_grad=True),
 Parameter (name=decoder.projection.bias, shape=(17,), dtype=Float32, requires_grad=True)]
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.0001)
loss_fn
CrossEntropyLoss<>
optimizer
Adam<>
# import mindspore as ms
# from mindspore.common.initializer import Normal
# # build dummy inputs
# enc_inputs = ms.Tensor(shape=(1, 13,512,512), dtype=ms.float32, init=Normal())
# dec_inputs =  ms.Tensor(shape=(1, 12,512,512), dtype=ms.float32, init=Normal()) # show the data shapes


# print("enc_inputs shape:", enc_inputs.shape)
# print("dec_inputs shape:", dec_inputs.shape)
## batch_size=1 was already set when building the data iterators
def forward(enc_inputs, dec_inputs):
    """前向网络
    enc_inputs: [batch_size, src_len]
    dec_inputs: [batch_size, trg_len]
    """
    # during training the decoder input should not contain the final <eos> token of the target sequence
    # logits: [batch_size * (trg_len - 1), trg_vocab_size]
    logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)  ## enc_inputs: [batch, src_len], dec_inputs[:, :-1]: [batch, trg_len-1]; both already contain the padding tokens, and the pad indices are (1, 1)
    
    # the targets should not contain the leading <bos> token
    # targets: [batch_size * (trg_len -1), ]
    targets = dec_inputs[:, 1:].view(-1)
    loss = loss_fn(logits, targets)

    return loss
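How ignore_index behaves can be checked on a toy batch (a sketch with made-up logits): target positions equal to trg_pad_idx (=1) do not contribute to the averaged loss.
import numpy as np
toy_logits = Tensor(np.random.randn(3, 17), mstype.float32)   # 3 time steps, 17 classes
toy_targets = Tensor([5, 1, 1], mstype.int32)                 # the last two are <pad>
print(loss_fn(toy_logits, toy_targets))    # only the first position enters the mean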

Breaking it down

import mindspore as ms

# concrete values for the tensor
values = [2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# build the tensor with mindspore.Tensor
enc_inputs = ms.Tensor([values], dtype=ms.int32)

print(enc_inputs)
# concrete values for the tensor
values = [2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# build the tensor with mindspore.Tensor
dec_inputs = ms.Tensor([values], dtype=ms.int32)

print( dec_inputs)
[[ 2  6  7  8  9 10 11 12 13  4  3  1  1  1  1  1  1  1  1  1]]
[[ 2  6  7  8  9 10  4 11  5  3  1  1  1  1  1  1  1  1  1  1]]
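Decoding the two index tensors back to tokens (a sketch) should confirm that they correspond to the first training pair, wrapped in <bos>/<eos> and filled up with <pad>:
print(de_vocab.decode(enc_inputs[0].asnumpy().tolist()))
print(en_vocab.decode(dec_inputs[0].asnumpy().tolist()))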
logits1, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], 1, 1)  ## 17 is the target (en) vocab size and 21 is the source (de) vocab size of the embeddings; max_len=20
# logits: [batch_size * (trg_len - 1), trg_vocab_size]  1*(20-1),17
logits1
shape: (19, 17)





Tensor(shape=[19, 17], dtype=Float32, value=
[[-2.55366063e+00, -3.06832147e+00, -2.64111972e+00 ... -2.12405175e-01, -6.74118578e-01, -5.54979622e-01],
 [-2.59403634e+00, -3.07656002e+00, -2.21387792e+00 ...  6.27510190e-01, -1.68930933e-01, -2.08749816e-01],
 [-2.14952374e+00, -2.65785956e+00, -2.27898407e+00 ...  8.72464657e-01,  2.22310439e-01, -5.46112001e-01],
 ...
 [-2.10627747e+00, -1.17115605e+00, -2.49626946e+00 ...  1.63479745e-01,  2.89909244e-02, -3.14286537e-02],
 [-2.41592431e+00, -1.02976418e+00, -2.43828940e+00 ...  3.43136370e-01,  7.33953714e-03,  3.92905682e-01],
 [-2.60901284e+00, -9.56778646e-01, -2.45280385e+00 ... -1.38613194e-01, -9.00381804e-03,  2.95851976e-01]])
trg_vocab_size,d_model,n_heads, d_ff,n_layers
(17, 512, 2, 2048, 1)
dropout_p=0.
trg_emb = nn.Embedding(trg_vocab_size, d_model)
pos_emb = PositionalEncoding(d_model, dropout_p)
layers = nn.CellList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
projection = nn.Dense(d_model, trg_vocab_size)
scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))    
dec_outputs = trg_emb(dec_inputs.astype(mstype.int32))
dec_outputs = pos_emb(dec_outputs * scaling_factor)
dec_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[-3.29979628e-01,  8.65685344e-01, -1.53060444e-02 ...  8.94590139e-01, -2.27691412e-01,  1.21550345e+00],
  [ 1.27103972e+00,  4.10478264e-01,  8.03606927e-01 ...  1.20142996e+00, -1.58704035e-02,  9.82034385e-01],
  [ 1.01256967e+00, -6.59448624e-01,  1.21936429e+00 ...  1.03363061e+00,  1.61111519e-01,  1.06760633e+00],
  ...
  [-5.15642047e-01, -4.91233796e-01, -6.98207080e-01 ...  1.34653473e+00, -3.68010491e-01,  1.06946480e+00],
  [-3.05232048e-01,  4.44246233e-01, -1.05705857e+00 ...  1.34653449e+00, -3.67906809e-01,  1.06946468e+00],
  [ 5.95632434e-01,  7.72634208e-01, -5.58417439e-01 ...  1.34653425e+00, -3.67803156e-01,  1.06946445e+00]]])
dec_inputs
Tensor(shape=[1, 20], dtype=Int32, value=
[[2, 6, 7 ... 1, 1, 1]])
dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, trg_pad_idx)
dec_self_attn_pad_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  ...
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True]]])
dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs, dec_inputs)
dec_self_attn_subsequent_mask
Tensor(shape=[1, 20, 20], dtype=Float32, value=
[[[ 0.00000000e+00,  1.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
  [ 0.00000000e+00,  0.00000000e+00,  1.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
  [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  1.00000000e+00,  1.00000000e+00,  1.00000000e+00],
  ...
  [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
  [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
  [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]])
import numpy as np
import pandas as pd
# convert the MindSpore tensor to a NumPy array
np_mask = dec_self_attn_subsequent_mask.asnumpy()

# turn the NumPy array into a pandas DataFrame for easier viewing
df_mask = pd.DataFrame(np_mask[0])

# print the DataFrame to inspect it
#print(df_mask)

# in a Jupyter notebook, display() renders it as a nicely formatted table
display(df_mask)
display(df_mask) renders a 20x20 upper-triangular matrix: row i contains 0.0 in columns 0..i and 1.0 everywhere above the diagonal, i.e. each position may only attend to itself and earlier positions.
dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
dec_self_attn_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False,  True,  True ...  True,  True,  True],
  [False, False,  True ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  ...
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True]]])
import numpy as np
import pandas as pd
# convert the MindSpore tensor to a NumPy array
np_mask = dec_self_attn_mask.asnumpy()

# turn the NumPy array into a pandas DataFrame for easier viewing
df_mask = pd.DataFrame(np_mask[0])

# print the DataFrame to inspect it
#print(df_mask)

# in a Jupyter notebook, display() renders it as a nicely formatted table
display(df_mask)
display(df_mask) renders the combined 20x20 mask: rows 0-9 follow the causal pattern (row i is False up to column i and True afterwards), while rows 10-19, the <pad> positions of the query, are False for columns 0-9 and True for columns 10-19. Columns 10-19 are True in every row because those key positions are <pad>.
dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, src_pad_idx)
dec_enc_attn_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  ...
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True],
  [False, False, False ...  True,  True,  True]]])
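Unlike the self-attention mask, dec_enc_attn_mask depends only on which positions of enc_inputs are <pad>: every query row is identical, and a column is True exactly when the corresponding source token is padding. A NumPy restatement of that idea (the notebook's real get_attn_pad_mask is defined earlier; the ids below are hypothetical and mimic a source sentence with 11 valid tokens):

import numpy as np

def attn_pad_mask_np(seq_q, seq_k, pad_idx):
    """NumPy restatement of the padding-mask idea:
    shape [batch, len_q, len_k], True where the key token is <pad>."""
    batch, len_q = seq_q.shape
    _, len_k = seq_k.shape
    mask = (seq_k == pad_idx)[:, np.newaxis, :]        # [batch, 1, len_k]
    return np.broadcast_to(mask, (batch, len_q, len_k))

# Hypothetical ids: 11 real source tokens then <pad>=1, and 20 target positions
enc = np.array([[2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3] + [1] * 9])
dec = np.ones((1, 20), dtype=int)
print(attn_pad_mask_np(dec, enc, pad_idx=1)[0, 0])     # columns 11..19 are True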
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_enc_attn_mask.asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x20 Boolean table of dec_enc_attn_mask: every row is identical, with columns 0-10 False (valid source tokens) and columns 11-19 True (source padding positions).]
dec_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[-3.29979628e-01,  8.65685344e-01, -1.53060444e-02 ...  8.94590139e-01, -2.27691412e-01,  1.21550345e+00],
  [ 1.27103972e+00,  4.10478264e-01,  8.03606927e-01 ...  1.20142996e+00, -1.58704035e-02,  9.82034385e-01],
  [ 1.01256967e+00, -6.59448624e-01,  1.21936429e+00 ...  1.03363061e+00,  1.61111519e-01,  1.06760633e+00],
  ...
  [-5.15642047e-01, -4.91233796e-01, -6.98207080e-01 ...  1.34653473e+00, -3.68010491e-01,  1.06946480e+00],
  [-3.05232048e-01,  4.44246233e-01, -1.05705857e+00 ...  1.34653449e+00, -3.67906809e-01,  1.06946468e+00],
  [ 5.95632434e-01,  7.72634208e-01, -5.58417439e-01 ...  1.34653425e+00, -3.67803156e-01,  1.06946445e+00]]])
dec_outputs = projection(dec_outputs)
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
  [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
  [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
  ...
  [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
  [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
  [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
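Judging by the shapes, projection maps the decoder's d_model=512 features to one score per target-vocabulary token (trg_vocab_size=17). A minimal sketch of a layer with the same interface, assuming a plain nn.Dense (the notebook builds its own projection layer earlier):

import numpy as np
import mindspore as ms
import mindspore.nn as nn

d_model, trg_vocab_size = 512, 17
# A stand-in projection with the same input/output shapes as the one used above
projection_demo = nn.Dense(d_model, trg_vocab_size)

x = ms.Tensor(np.random.randn(1, 20, d_model), ms.float32)
print(projection_demo(x).shape)   # (1, 20, 17): one score per target-vocabulary token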
# #        # Convert the inputs to embeddings and add positional information
# #         # dec_outputs: [batch_size, trg_len, d_model]
# #         dec_outputs = self.trg_emb(dec_inputs.astype(mstype.int32))
# #         dec_outputs = self.pos_emb(dec_outputs * self.scaling_factor)

# #         # Self-attention mask inside the decoder
# #         # dec_self_attn_mask: [batch_size, trg_len, trg_len]
# #         dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, trg_pad_idx)
# #         dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs, dec_inputs)
# #         dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

# #         # Padding mask for the encoder-decoder attention
# #         # dec_enc_attn_mask: [batch_size, trg_len, src_len]
# #         dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, src_pad_idx)



# # Stack the decoder layers
# # dec_outputs: [batch_size, trg_len, d_model]
# dec_self_attns, dec_enc_attns = [], []
# for layer in layers:
#     dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
#     dec_self_attns.append(dec_self_attn)
#     dec_enc_attns.append(dec_enc_attn)

# # Linear projection layer
# # dec_outputs: [batch_size, trg_len, trg_vocab_size]
# dec_outputs = projection(dec_outputs)
# #return dec_outputs, dec_self_attns, dec_enc_attns
#     ### the dec_outputs below still goes through a further shape transformation
    
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
  [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
  [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
  ...
  [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
  [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
  [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_outputs.asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x17 table of the projected dec_outputs (one row per target position, one column per target-vocabulary index); the values are the same as in the tensor printed above.]
src_emb = nn.Embedding(src_vocab_size, d_model)
pos_emb = PositionalEncoding(d_model, dropout_p)

enc_outputs = src_emb(enc_inputs.astype(mstype.int32))
enc_outputs = pos_emb(enc_outputs * scaling_factor)
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:00.481.768 [mindspore/nn/layer/basic.py:173] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
enc_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[ 2.38232508e-01,  1.23017490e+00,  9.87986699e-02 ...  1.39036012e+00, -2.15340674e-01,  6.75387204e-01],
  [ 6.48513019e-01,  3.48451197e-01,  1.02472794e+00 ...  9.24613714e-01, -3.20152938e-01,  8.93187225e-01],
  [ 8.02199662e-01, -9.74555016e-02,  8.44442010e-01 ...  1.00469291e+00,  1.54901832e-01,  1.37700772e+00],
  ...
  [-1.18655479e+00,  1.24188036e-01, -7.51270711e-01 ...  1.32025051e+00,  1.47339404e-01,  5.98086953e-01],
  [-9.76144791e-01,  1.05966806e+00, -1.11012220e+00 ...  1.32025039e+00,  1.47443071e-01,  5.98086774e-01],
  [-7.52802789e-02,  1.38805604e+00, -6.11481130e-01 ...  1.32025015e+00,  1.47546738e-01,  5.98086536e-01]]])
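The embedding output is multiplied by scaling_factor before the positional encoding is added. In the standard Transformer this factor is sqrt(d_model); assuming the notebook defined it that way earlier, it would look like the sketch below:

import numpy as np
import mindspore as ms

d_model = 512
# Assumed definition (standard Transformer): scale the embeddings by sqrt(d_model)
# so they have a magnitude comparable to the positional encoding that gets added.
scaling_factor = ms.Tensor(np.sqrt(d_model), ms.float32)
print(scaling_factor)   # ~22.63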
layers
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.275.680 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.278.104 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.279.221 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.280.636 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.281.643 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.282.647 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.





CellList<
  (0): DecoderLayer<
    (dec_self_attn): MultiHeadAttention<
      (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
      (attention): ScaledDotProductAttention<
        (softmax): Softmax<>
        (dropout): Dropout<keep_prob=1.0>
        >
      >
    (dec_enc_attn): MultiHeadAttention<
      (W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
      (W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
      (attention): ScaledDotProductAttention<
        (softmax): Softmax<>
        (dropout): Dropout<keep_prob=1.0>
        >
      >
    (pos_ffn): PoswiseFeedForward<
      (linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
      (linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
      (dropout): Dropout<keep_prob=1.0>
      (relu): ReLU<>
      >
    (add_norm1): AddNorm<
      (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
      (dropout): Dropout<keep_prob=1.0>
      >
    (add_norm2): AddNorm<
      (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
      (dropout): Dropout<keep_prob=1.0>
      >
    (add_norm3): AddNorm<
      (layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
      (dropout): Dropout<keep_prob=1.0>
      >
    >
  >
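Each DecoderLayer printed above contains masked self-attention, encoder-decoder attention, a position-wise feed-forward network and three AddNorm blocks. Its forward pass presumably runs them in exactly that order, with the two masks built above routed to the two attention modules. The sketch below only illustrates that order; the call signatures are assumed for illustration, not the notebook's exact implementation:

def decoder_layer_forward(dec_inputs, enc_outputs,
                          dec_self_attn_mask, dec_enc_attn_mask,
                          dec_self_attn, dec_enc_attn, pos_ffn,
                          add_norm1, add_norm2, add_norm3):
    """Sketch of the sub-layer order suggested by the printed DecoderLayer.
    The residual connections live inside the AddNorm blocks."""
    # 1. masked self-attention over the (shifted) target sequence
    attn_out, self_attn_w = dec_self_attn(dec_inputs, dec_inputs, dec_inputs,
                                          dec_self_attn_mask)
    x = add_norm1(attn_out, dec_inputs)

    # 2. encoder-decoder attention: queries from the decoder, keys/values from the encoder
    attn_out, enc_attn_w = dec_enc_attn(x, enc_outputs, enc_outputs,
                                        dec_enc_attn_mask)
    x = add_norm2(attn_out, x)

    # 3. position-wise feed-forward network
    x = add_norm3(pos_ffn(x), x)
    return x, self_attn_w, enc_attn_w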
# dec_self_attns, dec_enc_attns = [], []
# for layer in layers:
#     dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
#     dec_self_attns.append(dec_self_attn)
#     dec_enc_attns.append(dec_enc_attn)
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
  [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
  [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
  ...
  [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
  [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
  [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_outputs.asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output: the same 20x17 table of the projected dec_outputs as shown above.]
dec_self_attn ##[:, 0, :, :]
Tensor(shape=[1, 2, 20, 20], dtype=Float32, value=
[[[[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 4.88311976e-01,  5.11688054e-01,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 3.26110035e-01,  3.29261810e-01,  3.44628096e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   ...
   [ 1.19124077e-01,  1.04904100e-01,  1.12861328e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 1.16319194e-01,  1.04303002e-01,  1.15375683e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 1.13259226e-01,  1.03208810e-01,  1.16634019e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],
  [[ 1.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 5.10011137e-01,  4.89988834e-01,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 3.24341774e-01,  2.97710985e-01,  3.77947241e-01 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   ...
   [ 7.01457039e-02,  5.64711951e-02,  8.03415105e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 7.39139169e-02,  5.84961846e-02,  8.06718767e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 7.74101689e-02,  6.16681278e-02,  8.33512843e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]]])
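Two quick checks on the attention weights (shape [batch, n_heads=2, trg_len, trg_len]): every query row should still be a probability distribution over the un-masked keys, and every position that dec_self_attn_mask marks as True should carry (numerically) zero weight, because the mask fills those scores with a large negative value before the softmax. A small verification sketch:

import numpy as np

attn = dec_self_attn.asnumpy()            # [batch, n_heads, trg_len, trg_len]
mask = dec_self_attn_mask.asnumpy()       # [batch, trg_len, trg_len]

# Every query row is still a probability distribution over the un-masked keys
print(np.allclose(attn.sum(axis=-1), 1.0, atol=1e-5))

# Masked positions end up with weights that are numerically zero
for head in range(attn.shape[1]):
    print(np.allclose(attn[:, head][mask], 0.0))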
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_self_attn[:, 0, :, :].asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x20 attention-weight table for self-attention head 0: row i (i <= 9) has non-zero weights only on columns 0..i, rows 10-19 only on columns 0-9; each row sums to 1 and the padded key columns 10-19 are all 0.]
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_self_attn[:, 1, :, :].asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x20 attention-weight table for self-attention head 1: same masking pattern as head 0 (non-zero weights only on columns 0..min(i, 9), rows summing to 1, columns 10-19 all 0) with different weight values.]
dec_enc_attn
Tensor(shape=[1, 2, 20, 20], dtype=Float32, value=
[[[[ 1.01425938e-01,  9.65336189e-02,  8.71985629e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 9.84132364e-02,  9.79700983e-02,  8.79131854e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 1.00768864e-01,  9.35822800e-02,  8.65975544e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   ...
   [ 9.67069641e-02,  8.96922722e-02,  8.70323330e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 9.73924100e-02,  8.99903551e-02,  8.55390504e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 9.82568413e-02,  9.09924284e-02,  8.41078386e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]],
  [[ 7.43333548e-02,  8.84491727e-02,  8.89878944e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 6.88009188e-02,  8.48135203e-02,  8.68554562e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 7.22721964e-02,  8.68267119e-02,  8.65649804e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   ...
   [ 7.10219890e-02,  7.84036592e-02,  8.31521899e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 7.41301030e-02,  8.02771002e-02,  8.50360170e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
   [ 7.73167536e-02,  8.33537877e-02,  8.75786170e-02 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]]])
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_enc_attn[:, 0, :, :].asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x20 attention-weight table for encoder-decoder attention head 0: every row has non-zero weights only on columns 0-10 (the valid source tokens) and sums to 1; columns 11-19 (source padding) are 0.]
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_enc_attn[:, 1, :, :].asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask[0])

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x20 attention-weight table for encoder-decoder attention head 1: same pattern as head 0 (non-zero weights on columns 0-10 only, rows summing to 1, columns 11-19 all 0) with different weight values.]

dec_self_attns
[]
dec_enc_attns
[]
#                 # encoder: outputs a tensor representing the source-sequence information
#         # enc_outputs: [batch_size, src_len, d_model]
#         enc_outputs, enc_self_attns = self.encoder(enc_inputs, src_pad_idx)

#         # decoder
#         # dec_outputs: [batch_size, trg_len, trg_vocab_size]
#         dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx)

    
    # dec_logits = dec_outputs.view((-1, dec_outputs.shape[-1]))
        # print("shape:",dec_logits.shape)  ### the last dimension is trg_vocab_size
        
        
####### corresponds to the block below

#     # During training the decoder input should not contain the last token <eos> of the target sequence
#     # logits: [batch_size * (trg_len - 1), trg_vocab_size]
#     logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)  ## logits = dec_logits
    
#     # The prediction target should not contain the first token <bos> of the target sequence
#     # targets: [batch_size * (trg_len - 1), ]
#     targets = dec_inputs[:, 1:].view(-1)
#     loss = loss_fn(logits, targets)
dec_outputs  ## after the projection layer, the last dimension is trg_vocab_size (17), no longer d_model (512)
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
  [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
  [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
  ...
  [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
  [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
  [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
# enc_outputs, enc_self_attns = encoder(enc_inputs, src_pad_idx)
# dec_outputs, dec_self_attns, dec_enc_attns = decoder(dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx)
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
  [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
  [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
  ...
  [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
  [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
  [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
dec_logits = dec_outputs.view((-1, dec_outputs.shape[-1]))
dec_logits
Tensor(shape=[20, 17], dtype=Float32, value=
[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ...  5.64422965e-01,  1.06194824e-01, -2.42545847e-02],
 [-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ...  7.33686745e-01, -4.93338741e-02,  5.19632623e-02],
 [-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ...  8.07045281e-01,  3.27018559e-01,  7.60697350e-02],
 ...
 [-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ...  5.49952924e-01,  5.42376600e-02, -3.17106813e-01],
 [-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ...  5.41904747e-01,  2.55204104e-02, -2.68386006e-01],
 [-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ...  6.10271156e-01, -1.14318877e-01, -3.05555671e-01]])
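view((-1, dec_outputs.shape[-1])) simply merges the batch and target-length axes, so every decoding position becomes one row of vocabulary scores for the loss. A shape-only sketch with hypothetical data:

import numpy as np

batch_size, trg_len, trg_vocab_size = 1, 20, 17
fake_logits = np.zeros((batch_size, trg_len, trg_vocab_size), dtype=np.float32)

# view((-1, vocab)) merges the batch and length axes: every decoding position
# becomes one row of vocabulary scores for the cross-entropy loss.
flat = fake_logits.reshape(-1, trg_vocab_size)
print(flat.shape)   # (20, 17) here; (batch_size * trg_len, trg_vocab_size) in general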
import numpy as np
import pandas as pd
# Convert the MindSpore tensor to a NumPy array
np_mask = dec_logits.asnumpy()

# Wrap the NumPy array in a Pandas DataFrame for visualization
df_mask = pd.DataFrame(np_mask)

# Uncomment to print the DataFrame as plain text
#print(df_mask)

# In a Jupyter notebook, display() renders the DataFrame as a nicely formatted table
display(df_mask)
[display(df_mask) output, 20x17 table of dec_logits (one row per flattened target position, one column per target-vocabulary index); the values are identical to the projected dec_outputs shown above.]
print("shape:",dec_logits.shape)  ###输出的其中一个是输出的目标最大长度trg_vocab_size
shape: (20, 17)
logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)
logits
shape: (19, 17)





Tensor(shape=[19, 17], dtype=Float32, value=
[[-2.90044618e+00, -2.73945975e+00, -2.43362784e+00 ...  3.80752683e-01, -5.63995481e-01, -5.36451638e-01],
 [-2.33331084e+00, -2.51542068e+00, -2.80998254e+00 ...  6.92990720e-01,  1.48079768e-02, -2.17762724e-01],
 [-2.44148254e+00, -2.42424273e+00, -2.58729625e+00 ...  5.25278568e-01, -2.94933200e-01, -4.10319030e-01],
 ...
 [-2.09368324e+00, -9.18468475e-01, -2.58029604e+00 ...  3.18956435e-01,  3.11222553e-01, -9.76157486e-02],
 [-2.36811519e+00, -1.12266421e+00, -2.77283788e+00 ... -3.81676793e-01,  5.28824031e-02,  2.79054910e-01],
 [-2.15698123e+00, -1.33755660e+00, -2.65571284e+00 ...  1.98437542e-01,  2.83312023e-01,  5.97692244e-02]])
targets = dec_inputs[:, 1:].view(-1)
targets
Tensor(shape=[19], dtype=Int32, value= [ 6,  7,  8,  9, 10,  4, 11,  5,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
loss = loss_fn(logits, targets)
loss
Tensor(shape=[], dtype=Float32, value= 1.97173)
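So the teacher-forcing offset works out: logits are produced from dec_inputs[:, :-1] and compared against targets = dec_inputs[:, 1:], i.e. each position learns to predict the next token. A minimal stand-alone sketch with a plain nn.CrossEntropyLoss standing in for the notebook's loss_fn (the logits below are hypothetical random values; whether <pad> positions are ignored depends on how loss_fn was actually constructed earlier):

import numpy as np
import mindspore as ms
import mindspore.nn as nn

# Hypothetical shifted pair, mirroring the shapes above: 19 positions, 17-way vocabulary
rng = np.random.default_rng(0)
demo_logits = ms.Tensor(rng.standard_normal((19, 17)).astype(np.float32))
demo_targets = ms.Tensor(np.array([6, 7, 8, 9, 10, 4, 11, 5, 3] + [1] * 10, dtype=np.int32))

# Plain cross-entropy as a stand-in for the notebook's loss_fn
demo_loss_fn = nn.CrossEntropyLoss()
print(demo_loss_fn(demo_logits, demo_targets))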



print("yangge  mindspore 打卡第十一天  trainsformer之padding过程 2024-07-01")
yangge  mindspore 打卡第十一天  trainsformer之padding过程 2024-07-01


# Backpropagation: compute the loss value and the gradients
grad_fn = ops.value_and_grad(forward, None, optimizer.parameters)
grad_fn
<function mindspore.ops.composite.base._Grad.__call__.<locals>.after_grad(*args, **kwargs)>
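grad_fn differentiates the forward function defined earlier in the notebook with respect to optimizer.parameters and returns (loss, grads). Based on the commented-out lines above, that forward presumably looks roughly like this sketch (not the notebook's exact code):

# Sketch of the `forward` closure that grad_fn differentiates
# (the notebook defines the real one earlier; this mirrors the commented-out lines above).
def forward_sketch(enc_inputs, dec_inputs):
    # Teacher forcing: feed the target without its last token,
    # compare against the target without its first (<bos>) token.
    logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)
    targets = dec_inputs[:, 1:].view(-1)
    return loss_fn(logits, targets)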
# logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx) 
# Logic for a single training step
def train_step(enc_inputs, dec_inputs):
    # Backward pass: obtain the loss and the gradients
    loss, grads = grad_fn(enc_inputs, dec_inputs)
    # Update the weights
    optimizer(grads)
    return loss
def train(iterator, epoch=0):
    model.set_train(True)
    num_batches = len(iterator)
    total_loss = 0  # accumulated training loss over all batches
    total_steps = 0  # number of training steps

    with tqdm(total=num_batches) as t:
        t.set_description(f'EpochT: {epoch}')
        for src, src_len, trg in iterator():
            print("src:",src)
            print(src.shape)
            print("src_len:",src_len)
            print("------")
            print("trg:",trg)
            print(trg.shape)
            
            # Compute the loss for the current batch
            loss = train_step(src, trg)  ### grad_fn = ops.value_and_grad(forward ...): takes the encoder and decoder inputs
            total_loss += loss.asnumpy()
            print("loss---->:",loss)
            total_steps += 1
            # Current average loss
            curr_loss = total_loss / total_steps
            t.set_postfix({'lossT': f'{curr_loss:.2f}'})
            t.update(1)

    return total_loss / total_steps
def evaluate(iterator):
    model.set_train(False)
    num_batches = len(iterator)
    total_loss = 0  # accumulated loss over all batches
    total_steps = 0  # number of evaluation steps

    with tqdm(total=num_batches) as t:
        for src, _, trg in iterator():
            # Compute the loss for the current batch
            loss = forward(src, trg)  #### calls forward directly to get the loss ---- logits: [batch_size * (trg_len - 1), trg_vocab_size]
            total_loss += loss.asnumpy()
            total_steps += 1
            # Current average loss
            curr_loss = total_loss / total_steps
            t.set_postfix({'loss': f'{curr_loss:.2f}'})
            t.update(1)

    return total_loss / total_steps
cache_dir="./"
from download import download
from pathlib import Path
from tqdm import tqdm
import os

from mindspore import save_checkpoint

num_epochs = 2  # number of training epochs
best_valid_loss = float('inf')  # best validation loss so far
ckpt_file_name = os.path.join(cache_dir, 'transformer.ckpt')  # path for saving the model


for i in range(num_epochs):
    # Train the model and update the network weights
    train_loss = train(train_iterator, i)
    # Validate the model after the weights have been updated
    #valid_loss = evaluate(valid_iterator)
    
    # # Save the model whenever it achieves the best validation loss so far
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     save_checkpoint(model, ckpt_file_name)
EpochT: 0: 100%|██████████| 2/2 [00:00<00:00, 12.03it/s, lossT=2.18]


src: [[ 2  6  7  8  9 10 11 12 13  4  3  1  1  1  1  1  1  1  1  1]]
(1, 20)
src_len: [11]
------
trg: [[ 2  6  7  8  9 10  4 11  5  3  1  1  1  1  1  1  1  1  1  1]]
(1, 20)
shape: (19, 17)
loss---->: 2.2212398
src: [[ 2  5 14 15 16 17 18  5 19 20  4  3  1  1  1  1  1  1  1  1]]
(1, 20)
src_len: [12]
------
trg: [[ 2  4 12 13 14  4 15 16  5  3  1  1  1  1  1  1  1  1  1  1]]
(1, 20)
shape: (19, 17)
loss---->: 2.1410027


EpochT: 1:   0%|          | 0/2 [00:00<?, ?it/s]

src: [[ 2  6  7  8  9 10 11 12 13  4  3  1  1  1  1  1  1  1  1  1]]
(1, 20)
src_len: [11]
------
trg: [[ 2  6  7  8  9 10  4 11  5  3  1  1  1  1  1  1  1  1  1  1]]
(1, 20)
shape: (19, 17)


EpochT: 1: 100%|██████████| 2/2 [00:00<00:00, 12.09it/s, lossT=1.96]

loss---->: 2.0625696
src: [[ 2  5 14 15 16 17 18  5 19 20  4  3  1  1  1  1  1  1  1  1]]
(1, 20)
src_len: [12]
------
trg: [[ 2  4 12 13 14  4 15 16  5  3  1  1  1  1  1  1  1  1  1  1]]
(1, 20)
shape: (19, 17)
loss---->: 1.8615117
import mindspore as ms

# Define the concrete values of the tensor
values = [2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Create the tensor with mindspore.Tensor
SRC = ms.Tensor([values], dtype=ms.int32)

print(SRC)
# Define the concrete values of the tensor
values = [2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# Create the tensor with mindspore.Tensor
TRG = ms.Tensor([values], dtype=ms.int32)

print(TRG)
[[ 2  6  7  8  9 10 11 12 13  4  3  1  1  1  1  1  1  1  1  1]]
SRC.shape
(1, 20)

[[ 2  6  7  8  9 10  4 11  5  3  1  1  1  1  1  1  1  1  1  1]]
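SRC and TRG rebuild the first training pair as tensors, presumably to try the trained model by hand. Below is a hedged greedy-decoding sketch that could be used with SRC; it assumes the same model(enc, dec, src_pad_idx, trg_pad_idx) call signature as in training, batch size 1, and the special-token indices <pad>=1, <bos>=2, <eos>=3 used throughout the notebook:

import mindspore as ms
import mindspore.ops as ops

def greedy_decode_sketch(model, src, max_len=20, bos_idx=2, eos_idx=3,
                         src_pad_idx=1, trg_pad_idx=1):
    # Start with only <bos> and append the most likely next token step by step
    dec = ms.Tensor([[bos_idx]], ms.int32)
    for _ in range(max_len - 1):
        logits, _, _, _ = model(src, dec, src_pad_idx, trg_pad_idx)
        # logits: [dec_len, trg_vocab_size] for batch size 1; take the last position
        next_token = int(logits[-1].argmax().asnumpy())
        dec = ops.concat((dec, ms.Tensor([[next_token]], ms.int32)), axis=1)
        if next_token == eos_idx:
            break
    return dec

# e.g. greedy_decode_sketch(model, SRC)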
