MindSpore check-in, day 11: understanding the padding code in the Transformer
The training data is modified (each split is cut down to a handful of sentence pairs) so that the padding computation can be traced step by step.
Sample sentences from the training split (English lines, then the German references):

Several women wait outside in a city.
A old man having a beer alone.
Several women wait outside in a city.

Mehrere Frauen warten in einer Stadt im Freien.
Ein alter Mann, der allein ein Bier trinkt.
Mehrere Frauen warten in einer Stadt im Freien.

From the validation split:

A person on a snowmobile in mid jump.
A woman sits at a dark bar.
A person on a snowmobile in mid jump.

Eine Person auf einem Schneemobil mitten im Sprung.
Eine Frau sitzt an einer dunklen Bar.
Eine Person auf einem Schneemobil mitten im Sprung.

From the test split:

A man in an orange hat starring at something.
People are fixing the roof of a house.
A man in an orange hat starring at something.

Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
Leute Reparieren das Dach eines Hauses.
Ein Mann mit einem orangefarbenen Hut, der etwas anstarrt.
train_path = "/home/nginx/work/data/train"
valid_path = "/home/nginx/work/data/valid"
test_path = "/home/nginx/work/data/test"

import re
import os

class Multi30K():
    """Multi30K dataset loader.

    Loads the Multi30K data files and wraps them as an indexable Python object.
    """
    def __init__(self, path):
        self.data = self._load(path)

    def _load(self, path):
        def tokenize(text):
            # Lower-case and split words from punctuation.
            text = text.rstrip()
            return [tok.lower() for tok in re.findall(r'\w+|[^\w\s]', text)]

        # Map file extension -> file name; the modified data files end in .de1/.en1.
        members = {i.split('.')[-1]: i for i in os.listdir(path)}
        de_path = os.path.join(path, members['de1'])
        en_path = os.path.join(path, members['en1'])

        with open(de_path, 'r', encoding='utf-8') as de_file:
            de = de_file.readlines()[:-1]  # drop the last line of the file
            de = [tokenize(i) for i in de]
        with open(en_path, 'r', encoding='utf-8') as en_file:
            en = en_file.readlines()[:-1]
            en = [tokenize(i) for i in en]
        return list(zip(de, en))

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)
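As a quick check of the tokenizer: the regex \w+|[^\w\s] splits words from punctuation before everything is lower-cased. A minimal standalone sketch, using one of the test sentences above:

import re

text = "A man in an orange hat starring at something."
print([tok.lower() for tok in re.findall(r'\w+|[^\w\s]', text)])
# ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']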
train_dataset = Multi30K(train_path)
len(train_dataset)
2
valid_dataset = Multi30K(valid_path)
len(valid_dataset)
2
test_dataset = Multi30K(test_path)
len(test_dataset)
2
train_dataset, valid_dataset, test_dataset = Multi30K(train_path), Multi30K(valid_path), Multi30K(test_path)
train_dataset
<__main__.Multi30K at 0xfffe106da1c0>
for de, en in test_dataset:
    print(f'de = {de}')
    print(f'en = {en}')
    break
de = ['ein', 'mann', 'mit', 'einem', 'orangefarbenen', 'hut', ',', 'der', 'etwas', 'anstarrt', '.']
en = ['a', 'man', 'in', 'an', 'orange', 'hat', 'starring', 'at', 'something', '.']
class Vocab:
    """Build the vocabulary from a word-frequency dictionary."""
    special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']

    def __init__(self, word_count_dict, min_freq=1):
        self.word2idx = {}
        # The four special tokens always occupy indices 0-3.
        for idx, tok in enumerate(self.special_tokens):
            self.word2idx[tok] = idx

        filtered_dict = {
            w: c
            for w, c in word_count_dict.items() if c >= min_freq
        }
        for w, _ in filtered_dict.items():
            self.word2idx[w] = len(self.word2idx)

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        self.bos_idx = self.word2idx['<bos>']
        self.eos_idx = self.word2idx['<eos>']
        self.pad_idx = self.word2idx['<pad>']
        self.unk_idx = self.word2idx['<unk>']

    def _word2idx(self, word):
        """Map a word to its numeric index; unknown words map to <unk>."""
        if word not in self.word2idx:
            return self.unk_idx
        return self.word2idx[word]

    def _idx2word(self, idx):
        """Map a numeric index back to its word."""
        if idx not in self.idx2word:
            raise ValueError('input index is not in vocabulary.')
        return self.idx2word[idx]

    def encode(self, word_or_list):
        """Map a single word or a list of words to indices."""
        if isinstance(word_or_list, list):
            return [self._word2idx(i) for i in word_or_list]
        return self._word2idx(word_or_list)

    def decode(self, idx_or_list):
        """Map a single index or a list of indices back to words."""
        if isinstance(idx_or_list, list):
            return [self._idx2word(i) for i in idx_or_list]
        return self._idx2word(idx_or_list)

    def __len__(self):
        return len(self.word2idx)
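A minimal sketch of how Vocab behaves, using a hypothetical toy frequency dict: the special tokens take indices 0-3, so the first real word lands on index 4, and unseen words fall back to <unk> (index 0).

toy_vocab = Vocab({'hello': 3, 'world': 2}, min_freq=1)
print(toy_vocab.encode(['hello', 'world', 'mindspore']))  # [4, 5, 0] -- 'mindspore' maps to <unk>
print(toy_vocab.decode([2, 4, 5, 3]))                     # ['<bos>', 'hello', 'world', '<eos>']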
from collections import Counter, OrderedDict

def build_vocab(dataset):
    de_words, en_words = [], []
    for de, en in dataset:
        de_words.extend(de)
        en_words.extend(en)
    de_count_dict = OrderedDict(sorted(Counter(de_words).items(), key=lambda t: t[1], reverse=True))
    en_count_dict = OrderedDict(sorted(Counter(en_words).items(), key=lambda t: t[1], reverse=True))
    return Vocab(de_count_dict, min_freq=1), Vocab(en_count_dict, min_freq=1)
de_vocab, en_vocab = build_vocab(train_dataset)
print('Unique tokens in de vocabulary:', len(de_vocab))
Unique tokens in de vocabulary: 21
de_vocab
<__main__.Vocab at 0xfffe18995c70>
import mindspore

class Iterator():
    """Data iterator that batches, encodes and pads the dataset."""
    def __init__(self, dataset, de_vocab, en_vocab, batch_size, max_len=32, drop_remainder=False):
        self.dataset = dataset
        self.de_vocab = de_vocab
        self.en_vocab = en_vocab

        self.batch_size = batch_size
        self.max_len = max_len
        self.drop_remainder = drop_remainder

        length = len(self.dataset) // batch_size
        self.len = length if drop_remainder else length + 1

    def __call__(self):
        def pad(idx_list, vocab, max_len):
            """Unify the sequence lengths and record each valid length."""
            idx_pad_list, idx_len = [], []
            for i in idx_list:
                if len(i) > max_len - 2:
                    # Reserve two slots for <bos>/<eos> and truncate the rest.
                    idx_pad_list.append(
                        [vocab.bos_idx] + i[:max_len - 2] + [vocab.eos_idx]
                    )
                    idx_len.append(max_len)
                else:
                    # Right-pad short sequences with <pad> up to max_len.
                    idx_pad_list.append(
                        [vocab.bos_idx] + i + [vocab.eos_idx] + [vocab.pad_idx] * (max_len - len(i) - 2)
                    )
                    idx_len.append(len(i) + 2)
            return idx_pad_list, idx_len

        def sort_by_length(src, trg):
            """Sort the German/English pairs by source length (descending)."""
            data = zip(src, trg)
            data = sorted(data, key=lambda t: len(t[0]), reverse=True)
            return zip(*list(data))

        def encode_and_pad(batch_data, max_len):
            """Convert a batch of text to indices and unify the sequence lengths."""
            src_data, trg_data = zip(*batch_data)
            src_idx = [self.de_vocab.encode(i) for i in src_data]
            trg_idx = [self.en_vocab.encode(i) for i in trg_data]

            src_idx, trg_idx = sort_by_length(src_idx, trg_idx)
            src_idx_pad, src_len = pad(src_idx, self.de_vocab, max_len)
            trg_idx_pad, _ = pad(trg_idx, self.en_vocab, max_len)

            return src_idx_pad, src_len, trg_idx_pad

        for i in range(self.len):
            if i == self.len - 1 and not self.drop_remainder:
                batch_data = self.dataset[i * self.batch_size:]
            else:
                batch_data = self.dataset[i * self.batch_size: (i + 1) * self.batch_size]

            src_idx, src_len, trg_idx = encode_and_pad(batch_data, self.max_len)

            yield mindspore.Tensor(src_idx, mindspore.int32), \
                mindspore.Tensor(src_len, mindspore.int32), \
                mindspore.Tensor(trg_idx, mindspore.int32)

    def __len__(self):
        return self.len
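Since pad is nested inside __call__, here is a standalone sketch of the same padding rule, traced on toy index lists (assuming the special-token layout above: <unk>=0, <pad>=1, <bos>=2, <eos>=3):

def pad_one(idx, bos_idx=2, eos_idx=3, pad_idx=1, max_len=7):
    # Same rule as pad above: two slots are reserved for <bos>/<eos>;
    # long sequences are truncated, short ones are right-padded with <pad>.
    if len(idx) > max_len - 2:
        return [bos_idx] + idx[:max_len - 2] + [eos_idx], max_len
    return [bos_idx] + idx + [eos_idx] + [pad_idx] * (max_len - len(idx) - 2), len(idx) + 2

print(pad_one([5, 6, 7]))            # ([2, 5, 6, 7, 3, 1, 1], 5)
print(pad_one([5, 6, 7, 8, 9, 10]))  # ([2, 5, 6, 7, 8, 9, 3], 7) -- truncated to max_len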
len(train_dataset) // 1
2
len(train_dataset)
2
train_iterator = Iterator(train_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_remainder=True)
valid_iterator = Iterator(valid_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_remainder=False)
test_iterator = Iterator(test_dataset, de_vocab, en_vocab, batch_size=1, max_len=20, drop_remainder=False)
train_iterator
<__main__.Iterator at 0xffff98030b20>
len(de_vocab)
21
len(en_vocab)
17
de_vocab.pad_idx
1
en_vocab.pad_idx
1
import mindspore
from mindspore import nn
from mindspore import ops
from mindspore import Tensor
from mindspore import dtype as mstype

class ScaledDotProductAttention(nn.Cell):
    def __init__(self, dropout_p=0.):
        super().__init__()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1 - dropout_p)  # keep_prob style; the source of the deprecation warnings below
        self.sqrt = ops.Sqrt()

    def construct(self, query, key, value, attn_mask=None):
        """Scaled dot-product attention."""
        # Scale the attention scores by sqrt(d_k).
        embed_size = query.shape[-1]
        scaling_factor = self.sqrt(Tensor(embed_size, mstype.float32))
        attn = ops.matmul(query, key.swapaxes(-2, -1) / scaling_factor)
        # Fill masked (<pad>/future) positions with a large negative value so
        # that softmax assigns them ~0 weight.
        if attn_mask is not None:
            attn = attn.masked_fill(attn_mask, -1e9)
        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = ops.matmul(attn, value)
        return (output, attn)
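The masked_fill with -1e9 is what actually removes masked positions from the attention: after softmax they receive essentially zero weight. A minimal standalone sketch:

import mindspore
from mindspore import nn, Tensor

scores = Tensor([[1.0, 2.0, 3.0]], mindspore.float32)
mask = Tensor([[False, False, True]])    # pretend the last key is <pad>
masked = scores.masked_fill(mask, -1e9)
print(nn.Softmax()(masked))              # ~[[0.27, 0.73, 0.00]]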
def get_attn_pad_mask(seq_q, seq_k, pad_idx):
    """Attention mask that flags <pad> placeholders in the key sequence.

    Args:
        seq_q (Tensor): query sequence, shape = [batch_size, query_len]
        seq_k (Tensor): key sequence, shape = [batch_size, key_len]
        pad_idx (int): index of the <pad> token in the key vocabulary
    """
    batch_size, len_q = seq_q.shape
    batch_size, len_k = seq_k.shape
    # True wherever the key is <pad>; broadcast to one row per query position.
    pad_attn_mask = ops.equal(seq_k, pad_idx)
    pad_attn_mask = pad_attn_mask.expand_dims(1)
    pad_attn_mask = ops.broadcast_to(pad_attn_mask, (batch_size, len_q, len_k))
    return pad_attn_mask
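A toy run (a sketch reusing the imports above; a length-4 sequence whose last two tokens are <pad>, index 1) shows that every query row hides the same <pad> key columns:

seq = Tensor([[5, 6, 1, 1]], mstype.int32)
print(get_attn_pad_mask(seq, seq, 1))
# shape (1, 4, 4): each of the 4 query rows is [False, False, True, True],
# i.e. True exactly in the two <pad> key columns.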
class MultiHeadAttention(nn.Cell):
    def __init__(self, d_model, d_k, n_heads, dropout_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_k
        self.W_Q = nn.Dense(d_model, d_k * n_heads)
        self.W_K = nn.Dense(d_model, d_k * n_heads)
        self.W_V = nn.Dense(d_model, d_k * n_heads)
        self.W_O = nn.Dense(n_heads * d_k, d_model)
        self.attention = ScaledDotProductAttention(dropout_p=dropout_p)

    def construct(self, query, key, value, attn_mask):
        """
        query: [batch_size, len_q, d_model]
        key: [batch_size, len_k, d_model]
        value: [batch_size, len_k, d_model]
        attn_mask: [batch_size, seq_len, seq_len]
        """
        batch_size = query.shape[0]

        # Project and split into heads: [batch_size, n_heads, len, d_k]
        q_s = self.W_Q(query).view(batch_size, -1, self.n_heads, self.d_k)
        k_s = self.W_K(key).view(batch_size, -1, self.n_heads, self.d_k)
        v_s = self.W_V(value).view(batch_size, -1, self.n_heads, self.d_k)

        q_s = q_s.transpose((0, 2, 1, 3))
        k_s = k_s.transpose((0, 2, 1, 3))
        v_s = v_s.transpose((0, 2, 1, 3))

        # Replicate the mask for every head.
        attn_mask = attn_mask.expand_dims(1)
        attn_mask = ops.tile(attn_mask, (1, self.n_heads, 1, 1))

        context, attn = self.attention(q_s, k_s, v_s, attn_mask)

        # Concatenate the heads and project back to d_model.
        context = context.transpose((0, 2, 1, 3)).view((batch_size, -1, self.n_heads * self.d_k))
        output = self.W_O(context)
        return output, attn
from mindspore import numpy as mnp

class PositionalEncoding(nn.Cell):
    """Sinusoidal positional encoding."""
    def __init__(self, d_model, dropout_p=0.1, max_len=100):
        super().__init__()
        self.dropout = nn.Dropout(1 - dropout_p)

        self.pe = ops.Zeros()((max_len, d_model), mstype.float32)

        pos = mnp.arange(0, max_len, dtype=mstype.float32).view((-1, 1))
        angle = ops.pow(10000.0, mnp.arange(0, d_model, 2, dtype=mstype.float32) / d_model)

        # Even columns hold sin, odd columns hold cos.
        self.pe[:, 0::2] = ops.sin(pos / angle)
        self.pe[:, 1::2] = ops.cos(pos / angle)

    def construct(self, x):
        batch_size = x.shape[0]

        pe = self.pe.expand_dims(0)
        pe = ops.broadcast_to(pe, (batch_size, -1, -1))

        x = x + pe[:, :x.shape[1], :]
        return self.dropout(x)
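The code above implements the standard sinusoidal encoding PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)). A quick numpy cross-check of the same computation (a sketch with tiny dimensions):

import numpy as np

d_model, max_len = 4, 3
pos = np.arange(max_len, dtype=np.float32)[:, None]            # (3, 1) column of positions
angle = np.power(10000.0, np.arange(0, d_model, 2) / d_model)  # 10000^(2i/d_model)
pe = np.zeros((max_len, d_model), dtype=np.float32)
pe[:, 0::2] = np.sin(pos / angle)
pe[:, 1::2] = np.cos(pos / angle)
print(pe)  # row 0 is [0, 1, 0, 1]; later rows interleave sin/cos at different frequencies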
class PoswiseFeedForward(nn.Cell):
    def __init__(self, d_ff, d_model, dropout_p=0.):
        super().__init__()
        self.linear1 = nn.Dense(d_model, d_ff)
        self.linear2 = nn.Dense(d_ff, d_model)
        self.dropout = nn.Dropout(1 - dropout_p)
        self.relu = nn.ReLU()

    def construct(self, x):
        """Position-wise feed-forward network.

        x: [batch_size, seq_len, d_model]
        """
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        output = self.linear2(x)
        return output
class AddNorm(nn.Cell):
    def __init__(self, d_model, dropout_p=0.):
        super().__init__()
        self.layer_norm = nn.LayerNorm((d_model,), epsilon=1e-5)
        self.dropout = nn.Dropout(1 - dropout_p)

    def construct(self, x, residual):
        return self.layer_norm(self.dropout(x) + residual)
class EncoderLayer(nn.Cell):
    def __init__(self, d_model, n_heads, d_ff, dropout_p=0.):
        super().__init__()
        d_k = d_model // n_heads
        if d_k * n_heads != d_model:
            raise ValueError(f"`d_model` {d_model} is not divisible by `n_heads` {n_heads}.")
        self.enc_self_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.pos_ffn = PoswiseFeedForward(d_ff, d_model, dropout_p)
        self.add_norm1 = AddNorm(d_model, dropout_p)
        self.add_norm2 = AddNorm(d_model, dropout_p)

    def construct(self, enc_inputs, enc_self_attn_mask):
        """
        enc_inputs: [batch_size, src_len, d_model]
        enc_self_attn_mask: [batch_size, src_len, src_len]
        """
        residual = enc_inputs

        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask)
        enc_outputs = self.add_norm1(enc_outputs, residual)

        residual = enc_outputs
        enc_outputs = self.pos_ffn(enc_outputs)
        enc_outputs = self.add_norm2(enc_outputs, residual)

        return enc_outputs, attn
class Encoder(nn.Cell):
    def __init__(self, src_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.):
        super().__init__()
        self.src_emb = nn.Embedding(src_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, dropout_p)
        self.layers = nn.CellList([EncoderLayer(d_model, n_heads, d_ff, dropout_p) for _ in range(n_layers)])
        self.scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))

    def construct(self, enc_inputs, src_pad_idx):
        """enc_inputs: [batch_size, src_len]"""
        enc_outputs = self.src_emb(enc_inputs.astype(mstype.int32))
        enc_outputs = self.pos_emb(enc_outputs * self.scaling_factor)

        enc_self_attn_mask = get_attn_pad_mask(enc_inputs, enc_inputs, src_pad_idx)

        enc_self_attns = []
        for layer in self.layers:
            enc_outputs, enc_self_attn = layer(enc_outputs, enc_self_attn_mask)
            enc_self_attns.append(enc_self_attn)
        return enc_outputs, enc_self_attns
def get_attn_subsequent_mask(seq_q, seq_k):
    """Causal (look-ahead) mask: at step t the decoder may only attend to
    positions up to and including t, never to later positions.

    Args:
        seq_q (Tensor): query sequence, shape = [batch_size, len_q]
        seq_k (Tensor): key sequence, shape = [batch_size, len_k]
    """
    batch_size, len_q = seq_q.shape
    batch_size, len_k = seq_k.shape
    # Strictly upper-triangular matrix of ones: a 1 marks a future position.
    ones = ops.ones((batch_size, len_q, len_k), mindspore.float32)
    subsequent_mask = mnp.triu(ones, k=1)
    return subsequent_mask
class DecoderLayer(nn.Cell):
    def __init__(self, d_model, n_heads, d_ff, dropout_p=0.):
        super().__init__()
        d_k = d_model // n_heads
        if d_k * n_heads != d_model:
            raise ValueError(f"`d_model` {d_model} is not divisible by `n_heads` {n_heads}.")
        self.dec_self_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.dec_enc_attn = MultiHeadAttention(d_model, d_k, n_heads, dropout_p)
        self.pos_ffn = PoswiseFeedForward(d_ff, d_model, dropout_p)
        self.add_norm1 = AddNorm(d_model, dropout_p)
        self.add_norm2 = AddNorm(d_model, dropout_p)
        self.add_norm3 = AddNorm(d_model, dropout_p)

    def construct(self, dec_inputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask):
        """
        dec_inputs: [batch_size, trg_len, d_model]
        enc_outputs: [batch_size, src_len, d_model]
        dec_self_attn_mask: [batch_size, trg_len, trg_len]
        dec_enc_attn_mask: [batch_size, trg_len, src_len]
        """
        residual = dec_inputs

        dec_outputs, dec_self_attn = self.dec_self_attn(dec_inputs, dec_inputs, dec_inputs, dec_self_attn_mask)
        dec_outputs = self.add_norm1(dec_outputs, residual)

        residual = dec_outputs
        dec_outputs, dec_enc_attn = self.dec_enc_attn(dec_outputs, enc_outputs, enc_outputs, dec_enc_attn_mask)
        dec_outputs = self.add_norm2(dec_outputs, residual)

        residual = dec_outputs
        dec_outputs = self.pos_ffn(dec_outputs)
        dec_outputs = self.add_norm3(dec_outputs, residual)

        return dec_outputs, dec_self_attn, dec_enc_attn
class Decoder(nn.Cell):
    def __init__(self, trg_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.):
        super().__init__()
        self.trg_emb = nn.Embedding(trg_vocab_size, d_model)
        self.pos_emb = PositionalEncoding(d_model, dropout_p)
        # Note: dropout_p is not forwarded to DecoderLayer here, so the decoder
        # sublayers fall back to dropout_p=0. -- this is why their Dropout prints
        # as keep_prob=1.0 further below.
        self.layers = nn.CellList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
        self.projection = nn.Dense(d_model, trg_vocab_size)
        self.scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))

    def construct(self, dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx):
        """
        dec_inputs: [batch_size, trg_len]
        enc_inputs: [batch_size, src_len]
        enc_outputs: [batch_size, src_len, d_model]
        """
        dec_outputs = self.trg_emb(dec_inputs.astype(mstype.int32))
        dec_outputs = self.pos_emb(dec_outputs * self.scaling_factor)

        # Self-attention mask: <pad> mask OR causal mask.
        dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, trg_pad_idx)
        dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs, dec_inputs)
        dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)

        # Encoder-decoder attention only needs to hide the <pad> keys of the source.
        dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, src_pad_idx)

        dec_self_attns, dec_enc_attns = [], []
        for layer in self.layers:
            dec_outputs, dec_self_attn, dec_enc_attn = layer(dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
            dec_self_attns.append(dec_self_attn)
            dec_enc_attns.append(dec_enc_attn)

        dec_outputs = self.projection(dec_outputs)
        return dec_outputs, dec_self_attns, dec_enc_attns
class Transformer(nn.Cell):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def construct(self, enc_inputs, dec_inputs, src_pad_idx, trg_pad_idx):
        """
        enc_inputs: [batch_size, src_len]
        dec_inputs: [batch_size, trg_len]
        """
        enc_outputs, enc_self_attns = self.encoder(enc_inputs, src_pad_idx)
        dec_outputs, dec_self_attns, dec_enc_attns = self.decoder(dec_inputs, enc_inputs, enc_outputs, src_pad_idx, trg_pad_idx)
        # Flatten to [batch_size * trg_len, trg_vocab_size] for CrossEntropyLoss.
        dec_logits = dec_outputs.view((-1, dec_outputs.shape[-1]))
        print("shape:", dec_logits.shape)
        return dec_logits, enc_self_attns, dec_self_attns, dec_enc_attns
Breaking down the code
src_vocab_size = len(de_vocab)
trg_vocab_size = len(en_vocab)
src_pad_idx = de_vocab.pad_idx
trg_pad_idx = en_vocab.pad_idx

d_model = 512
d_ff = 2048
n_layers = 1
n_heads = 2

encoder = Encoder(src_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.1)
decoder = Decoder(trg_vocab_size, d_model, n_heads, d_ff, n_layers, dropout_p=0.1)
model = Transformer(encoder, decoder)
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-02:59:12.455.085 [mindspore/nn/layer/basic.py:173] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead. (the same warning is emitted 12 times, once per Dropout layer; duplicates omitted)
src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx
(21, 17, 1, 1)
encoder
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-02:59:12.848.909 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead. (repeated 5 times; duplicates omitted)
Encoder<
(src_emb): Embedding<vocab_size=21, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
(pos_emb): PositionalEncoding<
(dropout): Dropout<keep_prob=0.9>
>
(layers): CellList<
(0): EncoderLayer<
(enc_self_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=0.9>
>
>
(pos_ffn): PoswiseFeedForward<
(linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
(linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
(dropout): Dropout<keep_prob=0.9>
(relu): ReLU<>
>
(add_norm1): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=0.9>
>
(add_norm2): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=0.9>
>
>
>
>
decoder
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-02:59:13.189.660 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead. (repeated 7 times; duplicates omitted)
Decoder<
(trg_emb): Embedding<vocab_size=17, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
(pos_emb): PositionalEncoding<
(dropout): Dropout<keep_prob=0.9>
>
(layers): CellList<
(0): DecoderLayer<
(dec_self_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(dec_enc_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(pos_ffn): PoswiseFeedForward<
(linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
(linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
(dropout): Dropout<keep_prob=1.0>
(relu): ReLU<>
>
(add_norm1): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm2): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm3): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
>
>
(projection): Dense<input_channels=512, output_channels=17, has_bias=True>
>
model
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-02:59:13.413.623 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead. (repeated 12 times; duplicates omitted)
Transformer<
(encoder): Encoder<
(src_emb): Embedding<vocab_size=21, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
(pos_emb): PositionalEncoding<
(dropout): Dropout<keep_prob=0.9>
>
(layers): CellList<
(0): EncoderLayer<
(enc_self_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=0.9>
>
>
(pos_ffn): PoswiseFeedForward<
(linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
(linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
(dropout): Dropout<keep_prob=0.9>
(relu): ReLU<>
>
(add_norm1): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=0.9>
>
(add_norm2): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=0.9>
>
>
>
>
(decoder): Decoder<
(trg_emb): Embedding<vocab_size=17, embedding_size=512, use_one_hot=False, embedding_table=Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
(pos_emb): PositionalEncoding<
(dropout): Dropout<keep_prob=0.9>
>
(layers): CellList<
(0): DecoderLayer<
(dec_self_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(dec_enc_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(pos_ffn): PoswiseFeedForward<
(linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
(linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
(dropout): Dropout<keep_prob=1.0>
(relu): ReLU<>
>
(add_norm1): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm2): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm3): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
>
>
(projection): Dense<input_channels=512, output_channels=17, has_bias=True>
>
>
trg_pad_idx
1
model.trainable_params()
[Parameter (name=encoder.src_emb.embedding_table, shape=(21, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.enc_self_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.pos_ffn.linear1.weight, shape=(2048, 512), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.pos_ffn.linear1.bias, shape=(2048,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.pos_ffn.linear2.weight, shape=(512, 2048), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.pos_ffn.linear2.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=encoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.trg_emb.embedding_table, shape=(17, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_self_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_Q.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_Q.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_K.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_K.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_V.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_V.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_O.weight, shape=(512, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.dec_enc_attn.W_O.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.pos_ffn.linear1.weight, shape=(2048, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.pos_ffn.linear1.bias, shape=(2048,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.pos_ffn.linear2.weight, shape=(512, 2048), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.pos_ffn.linear2.bias, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.layers.0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True),
Parameter (name=decoder.projection.weight, shape=(17, 512), dtype=Float32, requires_grad=True),
Parameter (name=decoder.projection.bias, shape=(17,), dtype=Float32, requires_grad=True)]
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.0001)
loss_fn
CrossEntropyLoss<>
optimizer
Adam<>
def forward(enc_inputs, dec_inputs):
    """Forward pass.

    enc_inputs: [batch_size, src_len]
    dec_inputs: [batch_size, trg_len]
    """
    # Teacher forcing: feed the target without its last token...
    logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)
    # ...and compare against the target without its first token.
    targets = dec_inputs[:, 1:].view(-1)
    loss = loss_fn(logits, targets)
    return loss
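The slicing here is the usual one-token shift for teacher forcing: the decoder is fed the target without its last token and asked to predict the target without its first token. A toy trace (a sketch, with <bos>=2, <eos>=3, <pad>=1):

import mindspore as ms

dec = ms.Tensor([[2, 6, 7, 3, 1]], ms.int32)  # <bos> w1 w2 <eos> <pad>
print(dec[:, :-1])  # [[2 6 7 3]]  -> what the decoder sees
print(dec[:, 1:])   # [[6 7 3 1]]  -> what the loss compares against (<pad> ignored via ignore_index)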
Step-by-step breakdown
import mindspore as ms

values = [2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]
enc_inputs = ms.Tensor([values], dtype=ms.int32)
print(enc_inputs)

values = [2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
dec_inputs = ms.Tensor([values], dtype=ms.int32)
print(dec_inputs)
[[ 2 6 7 8 9 10 11 12 13 4 3 1 1 1 1 1 1 1 1 1]]
[[ 2 6 7 8 9 10 4 11 5 3 1 1 1 1 1 1 1 1 1 1]]
logits1, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], 1, 1)
logits1
shape: (19, 17)
Tensor(shape=[19, 17], dtype=Float32, value=
[[-2.55366063e+00, -3.06832147e+00, -2.64111972e+00 ... -2.12405175e-01, -6.74118578e-01, -5.54979622e-01],
[-2.59403634e+00, -3.07656002e+00, -2.21387792e+00 ... 6.27510190e-01, -1.68930933e-01, -2.08749816e-01],
[-2.14952374e+00, -2.65785956e+00, -2.27898407e+00 ... 8.72464657e-01, 2.22310439e-01, -5.46112001e-01],
...
[-2.10627747e+00, -1.17115605e+00, -2.49626946e+00 ... 1.63479745e-01, 2.89909244e-02, -3.14286537e-02],
[-2.41592431e+00, -1.02976418e+00, -2.43828940e+00 ... 3.43136370e-01, 7.33953714e-03, 3.92905682e-01],
[-2.60901284e+00, -9.56778646e-01, -2.45280385e+00 ... -1.38613194e-01, -9.00381804e-03, 2.95851976e-01]])
trg_vocab_size, d_model, n_heads, d_ff, n_layers
(17, 512, 2, 2048, 1)
dropout_p = 0.
trg_emb = nn.Embedding(trg_vocab_size, d_model)
pos_emb = PositionalEncoding(d_model, dropout_p)
layers = nn.CellList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
projection = nn.Dense(d_model, trg_vocab_size)
scaling_factor = ops.Sqrt()(Tensor(d_model, mstype.float32))
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:16:32.154.289 [mindspore/nn/layer/basic.py:173] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead. (repeated 7 times; duplicates omitted)
dec_outputs = trg_emb(dec_inputs.astype(mstype.int32))
dec_outputs = pos_emb(dec_outputs * scaling_factor)
dec_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[-3.29979628e-01, 8.65685344e-01, -1.53060444e-02 ... 8.94590139e-01, -2.27691412e-01, 1.21550345e+00],
[ 1.27103972e+00, 4.10478264e-01, 8.03606927e-01 ... 1.20142996e+00, -1.58704035e-02, 9.82034385e-01],
[ 1.01256967e+00, -6.59448624e-01, 1.21936429e+00 ... 1.03363061e+00, 1.61111519e-01, 1.06760633e+00],
...
[-5.15642047e-01, -4.91233796e-01, -6.98207080e-01 ... 1.34653473e+00, -3.68010491e-01, 1.06946480e+00],
[-3.05232048e-01, 4.44246233e-01, -1.05705857e+00 ... 1.34653449e+00, -3.67906809e-01, 1.06946468e+00],
[ 5.95632434e-01, 7.72634208e-01, -5.58417439e-01 ... 1.34653425e+00, -3.67803156e-01, 1.06946445e+00]]])
dec_inputs
Tensor(shape=[1, 20], dtype=Int32, value=
[[2, 6, 7 ... 1, 1, 1]])
dec_self_attn_pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, trg_pad_idx)
dec_self_attn_pad_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
...
[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
[False, False, False ... True, True, True]]])
dec_self_attn_subsequent_mask = get_attn_subsequent_mask(dec_inputs, dec_inputs)
dec_self_attn_subsequent_mask
Tensor(shape=[1, 20, 20], dtype=Float32, value=
[[[ 0.00000000e+00, 1.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
[ 0.00000000e+00, 0.00000000e+00, 1.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
...
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 0.00000000e+00, 1.00000000e+00, 1.00000000e+00],
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
[ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]])
import numpy as np
import pandas as pd

np_mask = dec_self_attn_subsequent_mask.asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)

      0    1    2   ...   17   18   19
0   0.0  1.0  1.0  ...  1.0  1.0  1.0
1   0.0  0.0  1.0  ...  1.0  1.0  1.0
2   0.0  0.0  0.0  ...  1.0  1.0  1.0
...
17  0.0  0.0  0.0  ...  0.0  1.0  1.0
18  0.0  0.0  0.0  ...  0.0  0.0  1.0
19  0.0  0.0  0.0  ...  0.0  0.0  0.0

(20x20 strictly upper-triangular matrix: row i is 0.0 up to column i and 1.0 afterwards, so every 1.0 marks a future position.)
dec_self_attn_mask = ops.gt((dec_self_attn_pad_mask + dec_self_attn_subsequent_mask), 0)
dec_self_attn_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False, True, True ... True, True, True],
[False, False, True ... True, True, True],
[False, False, False ... True, True, True],
...
[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
[False, False, False ... True, True, True]]])
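Note that ops.gt(pad_mask + subsequent_mask, 0) is simply an elementwise logical OR: a position is blocked if it is a <pad> key or a future position. A quick numpy sketch:

import numpy as np

pad = np.array([0., 0., 1.])     # <pad> at the last key
causal = np.array([0., 1., 1.])  # future positions for the first query
print((pad + causal) > 0)         # [False  True  True]
print(np.logical_or(pad, causal)) # identical result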
import numpy as np
import pandas as pd

np_mask = dec_self_attn_mask.asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)

       0      1      2    ...     17    18    19
0   False   True   True   ...   True  True  True
1   False  False   True   ...   True  True  True
2   False  False  False   ...   True  True  True
...
17  False  False  False   ...   True  True  True
18  False  False  False   ...   True  True  True
19  False  False  False   ...   True  True  True

(20x20 matrix: entry (i, j) is True iff j > i, a future position, or j >= 10, a <pad> key -- dec_inputs holds its 10 real tokens in positions 0-9.)
dec_enc_attn_mask = get_attn_pad_mask(dec_inputs, enc_inputs, src_pad_idx)
dec_enc_attn_mask
Tensor(shape=[1, 20, 20], dtype=Bool, value=
[[[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
...
[False, False, False ... True, True, True],
[False, False, False ... True, True, True],
[False, False, False ... True, True, True]]])
import numpy as np
import pandas as pd

np_mask = dec_enc_attn_mask.asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)

       0      1      2    ...     17    18    19
0   False  False  False   ...   True  True  True
1   False  False  False   ...   True  True  True
...
19  False  False  False   ...   True  True  True

(All 20 rows are identical: False in columns 0-10, True in columns 11-19 -- every decoder position hides exactly the 9 <pad> keys of enc_inputs, which has 11 real tokens.)
dec_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[-3.29979628e-01, 8.65685344e-01, -1.53060444e-02 ... 8.94590139e-01, -2.27691412e-01, 1.21550345e+00],
[ 1.27103972e+00, 4.10478264e-01, 8.03606927e-01 ... 1.20142996e+00, -1.58704035e-02, 9.82034385e-01],
[ 1.01256967e+00, -6.59448624e-01, 1.21936429e+00 ... 1.03363061e+00, 1.61111519e-01, 1.06760633e+00],
...
[-5.15642047e-01, -4.91233796e-01, -6.98207080e-01 ... 1.34653473e+00, -3.68010491e-01, 1.06946480e+00],
[-3.05232048e-01, 4.44246233e-01, -1.05705857e+00 ... 1.34653449e+00, -3.67906809e-01, 1.06946468e+00],
[ 5.95632434e-01, 7.72634208e-01, -5.58417439e-01 ... 1.34653425e+00, -3.67803156e-01, 1.06946445e+00]]])
dec_outputs = projection(dec_outputs)
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ... 5.64422965e-01, 1.06194824e-01, -2.42545847e-02],
[-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ... 7.33686745e-01, -4.93338741e-02, 5.19632623e-02],
[-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ... 8.07045281e-01, 3.27018559e-01, 7.60697350e-02],
...
[-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ... 5.49952924e-01, 5.42376600e-02, -3.17106813e-01],
[-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ... 5.41904747e-01, 2.55204104e-02, -2.68386006e-01],
[-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ... 6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
import numpy as np
import pandas as pd

np_mask = dec_outputs.asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)

          0         1         2   ...        14        15        16
0  -0.826425 -1.204101 -0.128440  ...  0.564423  0.106195 -0.024255
1  -0.916367 -1.430734 -0.050929  ...  0.733687 -0.049334  0.051963
2  -0.917246 -1.348835 -0.036162  ...  0.807045  0.327019  0.076070
...
17 -0.743719 -0.748091 -0.424298  ...  0.549953  0.054238 -0.317107
18 -0.717170 -0.746447 -0.370600  ...  0.541905  0.025520 -0.268386
19 -0.696063 -0.701602 -0.398488  ...  0.610271 -0.114319 -0.305556

(20x17 DataFrame: one row of logits per target position, one column per target-vocabulary index; middle rows and columns abbreviated.)
src_emb = nn.Embedding(src_vocab_size, d_model)
pos_emb = PositionalEncoding(d_model, dropout_p)
enc_outputs = src_emb(enc_inputs.astype(mstype.int32))
enc_outputs = pos_emb(enc_outputs * scaling_factor)
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:00.481.768 [mindspore/nn/layer/basic.py:173] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
enc_outputs
Tensor(shape=[1, 20, 512], dtype=Float32, value=
[[[ 2.38232508e-01, 1.23017490e+00, 9.87986699e-02 ... 1.39036012e+00, -2.15340674e-01, 6.75387204e-01],
[ 6.48513019e-01, 3.48451197e-01, 1.02472794e+00 ... 9.24613714e-01, -3.20152938e-01, 8.93187225e-01],
[ 8.02199662e-01, -9.74555016e-02, 8.44442010e-01 ... 1.00469291e+00, 1.54901832e-01, 1.37700772e+00],
...
[-1.18655479e+00, 1.24188036e-01, -7.51270711e-01 ... 1.32025051e+00, 1.47339404e-01, 5.98086953e-01],
[-9.76144791e-01, 1.05966806e+00, -1.11012220e+00 ... 1.32025039e+00, 1.47443071e-01, 5.98086774e-01],
[-7.52802789e-02, 1.38805604e+00, -6.11481130e-01 ... 1.32025015e+00, 1.47546738e-01, 5.98086536e-01]]])
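The cell above is the standard Transformer input pipeline: token embedding, scaling, then positional encoding. For reference, a minimal NumPy sketch of the sinusoidal encoding and the scaling factor, assuming scaling_factor is sqrt(d_model) as in the original Transformer paper (the notebook's PositionalEncoding may differ in detail):

import numpy as np

def sinusoidal_pe(max_len, d_model):
    # standard sinusoidal positional encoding, shape (max_len, d_model)
    pos = np.arange(max_len)[:, None]             # (max_len, 1)
    i = np.arange(0, d_model, 2)[None, :]         # even dimension indices
    angle = pos / np.power(10000.0, i / d_model)
    pe = np.zeros((max_len, d_model), dtype=np.float32)
    pe[:, 0::2] = np.sin(angle)
    pe[:, 1::2] = np.cos(angle)
    return pe

scaling_factor = np.sqrt(512)   # sqrt(d_model): keeps embedding magnitudes comparable to the PE
pe = sinusoidal_pe(20, 512)     # one row per position of the length-20 padded sequence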
layers
[WARNING] ME(5356:281473540106544,MainProcess):2024-07-01-04:17:09.275.680 [mindspore/nn/layer/basic.py:199] For Dropout, this parameter `keep_prob` will be deprecated, please use `p` instead.
(The same keep_prob deprecation warning is printed five more times here, once for each remaining Dropout created inside the decoder layer.)
CellList<
(0): DecoderLayer<
(dec_self_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(dec_enc_attn): MultiHeadAttention<
(W_Q): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_K): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_V): Dense<input_channels=512, output_channels=512, has_bias=True>
(W_O): Dense<input_channels=512, output_channels=512, has_bias=True>
(attention): ScaledDotProductAttention<
(softmax): Softmax<>
(dropout): Dropout<keep_prob=1.0>
>
>
(pos_ffn): PoswiseFeedForward<
(linear1): Dense<input_channels=512, output_channels=2048, has_bias=True>
(linear2): Dense<input_channels=2048, output_channels=512, has_bias=True>
(dropout): Dropout<keep_prob=1.0>
(relu): ReLU<>
>
(add_norm1): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm1.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm1.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm2): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm2.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm2.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
(add_norm3): AddNorm<
(layer_norm): LayerNorm<normalized_shape=(512,), begin_norm_axis=-1, begin_params_axis=-1, gamma=Parameter (name=0.add_norm3.layer_norm.gamma, shape=(512,), dtype=Float32, requires_grad=True), beta=Parameter (name=0.add_norm3.layer_norm.beta, shape=(512,), dtype=Float32, requires_grad=True)>
(dropout): Dropout<keep_prob=1.0>
>
>
>
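The CellList holds the decoder layers. In the decoder's construct they are presumably applied in sequence, each layer returning its output plus the two attention maps inspected below; a sketch under that assumed signature (not quoted from the notebook):

dec_self_attns, dec_enc_attns = [], []
for layer in layers:
    # assumed signature: (x, enc_out, self_mask, cross_mask) -> (x, self_attn, cross_attn)
    dec_outputs, dec_self_attn, dec_enc_attn = layer(
        dec_outputs, enc_outputs, dec_self_attn_mask, dec_enc_attn_mask)
    dec_self_attns.append(dec_self_attn)
    dec_enc_attns.append(dec_enc_attn)

This would also explain why dec_self_attns and dec_enc_attns print as empty lists further down: in this manual walkthrough the layer appears to have been invoked directly, without the collecting loop.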
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ... 5.64422965e-01, 1.06194824e-01, -2.42545847e-02],
[-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ... 7.33686745e-01, -4.93338741e-02, 5.19632623e-02],
[-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ... 8.07045281e-01, 3.27018559e-01, 7.60697350e-02],
...
[-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ... 5.49952924e-01, 5.42376600e-02, -3.17106813e-01],
[-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ... 5.41904747e-01, 2.55204104e-02, -2.68386006e-01],
[-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ... 6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
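Note the last dimension is 17, the size of the toy target vocabulary, not d_model: at this point dec_outputs already holds per-token logits, which implies a final projection head somewhere along these lines (a hedged sketch; dec_hidden is a hypothetical name):

projection = nn.Dense(d_model, trg_vocab_size)   # trg_vocab_size == 17 in this toy run
dec_outputs = projection(dec_hidden)             # (batch, seq_len, trg_vocab_size)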
dec_self_attn
Tensor(shape=[1, 2, 20, 20], dtype=Float32, value=
[[[[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 4.88311976e-01, 5.11688054e-01, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 3.26110035e-01, 3.29261810e-01, 3.44628096e-01 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
...
[ 1.19124077e-01, 1.04904100e-01, 1.12861328e-01 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 1.16319194e-01, 1.04303002e-01, 1.15375683e-01 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 1.13259226e-01, 1.03208810e-01, 1.16634019e-01 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
[[ 1.00000000e+00, 0.00000000e+00, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 5.10011137e-01, 4.89988834e-01, 0.00000000e+00 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 3.24341774e-01, 2.97710985e-01, 3.77947241e-01 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
...
[ 7.01457039e-02, 5.64711951e-02, 8.03415105e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 7.39139169e-02, 5.84961846e-02, 8.06718767e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 7.74101689e-02, 6.16681278e-02, 8.33512843e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]]])
import numpy as np
import pandas as pd

# head 0 of the decoder self-attention weights: slice (batch, head, query, key)
np_mask = dec_self_attn[:, 0, :, :].asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)
           0         1         2         3  ...   16   17   18   19
0   1.000000  0.000000  0.000000  0.000000  ...  0.0  0.0  0.0  0.0
1   0.488312  0.511688  0.000000  0.000000  ...  0.0  0.0  0.0  0.0
2   0.326110  0.329262  0.344628  0.000000  ...  0.0  0.0  0.0  0.0
3   0.259163  0.259082  0.257921  0.223834  ...  0.0  0.0  0.0  0.0
..       ...       ...       ...       ...  ...  ...  ...  ...  ...
19  0.113259  0.103209  0.116634  0.100394  ...  0.0  0.0  0.0  0.0

[20 rows x 20 columns]

Reading this table: each row is lower triangular because the look-ahead mask hides future positions, and columns 10-19 are zero in every row because the target padding mask hides the <pad> tokens; each row's weights therefore sum to 1 over the visible real tokens only.
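That lower-triangular-plus-zero-columns shape is exactly the combined mask at work. A minimal NumPy sketch of how the look-ahead mask and the target padding mask jointly produce it (pad index 1 and the token ids mirror this toy batch; the notebook's own mask-building code may differ):

import numpy as np

trg = np.array([2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
pad_idx = 1
L = len(trg)

look_ahead = np.triu(np.ones((L, L), dtype=bool), k=1)    # True above the diagonal: future positions
pad_mask = (trg == pad_idx)[None, :].repeat(L, axis=0)    # True wherever the key token is <pad>
mask = look_ahead | pad_mask                              # block future OR padding keys

scores = np.random.randn(L, L).astype(np.float32)         # stand-in for QK^T / sqrt(d_k)
scores[mask] = -1e9                                       # masked scores become effectively -inf
attn = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)
# attn is lower triangular in columns 0-9 and ~0 in columns 10-19, matching the table above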
import numpy as np
import pandas as pd

# head 1 of the decoder self-attention weights
np_mask = dec_self_attn[:, 1, :, :].asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)
           0         1         2         3  ...   16   17   18   19
0   1.000000  0.000000  0.000000  0.000000  ...  0.0  0.0  0.0  0.0
1   0.510011  0.489989  0.000000  0.000000  ...  0.0  0.0  0.0  0.0
2   0.324342  0.297711  0.377947  0.000000  ...  0.0  0.0  0.0  0.0
3   0.211180  0.212840  0.267973  0.308007  ...  0.0  0.0  0.0  0.0
..       ...       ...       ...       ...  ...  ...  ...  ...  ...
19  0.077410  0.061668  0.083351  0.098499  ...  0.0  0.0  0.0  0.0

[20 rows x 20 columns]

Head 1 obeys the same look-ahead-plus-padding pattern; only the learned attention weights differ.
dec_enc_attn
Tensor(shape=[1, 2, 20, 20], dtype=Float32, value=
[[[[ 1.01425938e-01, 9.65336189e-02, 8.71985629e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 9.84132364e-02, 9.79700983e-02, 8.79131854e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 1.00768864e-01, 9.35822800e-02, 8.65975544e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
...
[ 9.67069641e-02, 8.96922722e-02, 8.70323330e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 9.73924100e-02, 8.99903551e-02, 8.55390504e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 9.82568413e-02, 9.09924284e-02, 8.41078386e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
[[ 7.43333548e-02, 8.84491727e-02, 8.89878944e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 6.88009188e-02, 8.48135203e-02, 8.68554562e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 7.22721964e-02, 8.68267119e-02, 8.65649804e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
...
[ 7.10219890e-02, 7.84036592e-02, 8.31521899e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 7.41301030e-02, 8.02771002e-02, 8.50360170e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[ 7.73167536e-02, 8.33537877e-02, 8.75786170e-02 ... 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]]])
import numpy as np
import pandas as pd

# head 0 of the decoder-encoder (cross) attention weights
np_mask = dec_enc_attn[:, 0, :, :].asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)
           0         1         2         3  ...   16   17   18   19
0   0.101426  0.096534  0.087199  0.089802  ...  0.0  0.0  0.0  0.0
1   0.098413  0.097970  0.087913  0.090324  ...  0.0  0.0  0.0  0.0
2   0.100769  0.093582  0.086598  0.086864  ...  0.0  0.0  0.0  0.0
..       ...       ...       ...       ...  ...  ...  ...  ...  ...
19  0.098257  0.090992  0.084108  0.094317  ...  0.0  0.0  0.0  0.0

[20 rows x 20 columns]

There is no look-ahead constraint in cross-attention: every decoder position spreads its weight over all 11 real source tokens (columns 0-10), while the source <pad> positions (columns 11-19) are zeroed by the encoder-decoder padding mask, as sketched below.
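A matching sketch for the cross-attention mask, built only from the source padding (same assumptions as the self-attention sketch above):

import numpy as np

src = np.array([2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1])
pad_idx = 1
L_q, L_k = 20, len(src)                                  # decoder positions x encoder positions

mask = (src == pad_idx)[None, :].repeat(L_q, axis=0)     # block only the <pad> source keys

scores = np.random.randn(L_q, L_k).astype(np.float32)
scores[mask] = -1e9
attn = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)
# every row spreads its weight over columns 0-10 and puts ~0 on columns 11-19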
import numpy as np
import pandas as pd

# head 1 of the decoder-encoder (cross) attention weights
np_mask = dec_enc_attn[:, 1, :, :].asnumpy()
df_mask = pd.DataFrame(np_mask[0])
display(df_mask)
           0         1         2         3  ...   16   17   18   19
0   0.074333  0.088449  0.088988  0.099735  ...  0.0  0.0  0.0  0.0
1   0.068801  0.084814  0.086855  0.098526  ...  0.0  0.0  0.0  0.0
2   0.072272  0.086827  0.086565  0.086657  ...  0.0  0.0  0.0  0.0
..       ...       ...       ...       ...  ...  ...  ...  ...  ...
19  0.077317  0.083354  0.087579  0.093792  ...  0.0  0.0  0.0  0.0

[20 rows x 20 columns]
dec_self_attns
[]
dec_enc_attns
[]
dec_outputs
Tensor(shape=[1, 20, 17], dtype=Float32, value=
[[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ... 5.64422965e-01, 1.06194824e-01, -2.42545847e-02],
[-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ... 7.33686745e-01, -4.93338741e-02, 5.19632623e-02],
[-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ... 8.07045281e-01, 3.27018559e-01, 7.60697350e-02],
...
[-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ... 5.49952924e-01, 5.42376600e-02, -3.17106813e-01],
[-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ... 5.41904747e-01, 2.55204104e-02, -2.68386006e-01],
[-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ... 6.10271156e-01, -1.14318877e-01, -3.05555671e-01]]])
dec_logits = dec_outputs.view((-1, dec_outputs.shape[-1]))
dec_logits
Tensor(shape=[20, 17], dtype=Float32, value=
[[-8.26425076e-01, -1.20410120e+00, -1.28439963e-01 ... 5.64422965e-01, 1.06194824e-01, -2.42545847e-02],
[-9.16367471e-01, -1.43073356e+00, -5.09288907e-02 ... 7.33686745e-01, -4.93338741e-02, 5.19632623e-02],
[-9.17246103e-01, -1.34883523e+00, -3.61622870e-02 ... 8.07045281e-01, 3.27018559e-01, 7.60697350e-02],
...
[-7.43718505e-01, -7.48091042e-01, -4.24297601e-01 ... 5.49952924e-01, 5.42376600e-02, -3.17106813e-01],
[-7.17169762e-01, -7.46447027e-01, -3.70600104e-01 ... 5.41904747e-01, 2.55204104e-02, -2.68386006e-01],
[-6.96062922e-01, -7.01602280e-01, -3.98487747e-01 ... 6.10271156e-01, -1.14318877e-01, -3.05555671e-01]])
import numpy as np
import pandas as pd

# the flattened logits: one row of 17 vocabulary scores per target position
np_mask = dec_logits.asnumpy()
df_mask = pd.DataFrame(np_mask)
display(df_mask)
           0         1         2         3  ...        14        15        16
0  -0.826425 -1.204101 -0.128440  0.356329  ...  0.564423  0.106195 -0.024255
1  -0.916367 -1.430734 -0.050929  0.669196  ...  0.733687 -0.049334  0.051963
2  -0.917246 -1.348835 -0.036162  0.319181  ...  0.807045  0.327019  0.076070
..       ...       ...       ...       ...  ...       ...       ...       ...
19 -0.696063 -0.701602 -0.398488  0.221775  ...  0.610271 -0.114319 -0.305556

[20 rows x 17 columns]

(The values are identical to the dec_outputs DataFrame above; the view only removed the batch dimension.)
print ( "shape:" , dec_logits. shape)
shape: (20, 17)
logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)
logits
shape: (19, 17)
Tensor(shape=[19, 17], dtype=Float32, value=
[[-2.90044618e+00, -2.73945975e+00, -2.43362784e+00 ... 3.80752683e-01, -5.63995481e-01, -5.36451638e-01],
[-2.33331084e+00, -2.51542068e+00, -2.80998254e+00 ... 6.92990720e-01, 1.48079768e-02, -2.17762724e-01],
[-2.44148254e+00, -2.42424273e+00, -2.58729625e+00 ... 5.25278568e-01, -2.94933200e-01, -4.10319030e-01],
...
[-2.09368324e+00, -9.18468475e-01, -2.58029604e+00 ... 3.18956435e-01, 3.11222553e-01, -9.76157486e-02],
[-2.36811519e+00, -1.12266421e+00, -2.77283788e+00 ... -3.81676793e-01, 5.28824031e-02, 2.79054910e-01],
[-2.15698123e+00, -1.33755660e+00, -2.65571284e+00 ... 1.98437542e-01, 2.83312023e-01, 5.97692244e-02]])
targets = dec_inputs[:, 1:].view(-1)
targets
Tensor(shape=[19], dtype=Int32, value= [ 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
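The slicing implements teacher forcing: the decoder consumed dec_inputs[:, :-1], and each logits row is scored against the next token, dec_inputs[:, 1:]. A self-contained check with this batch's target sequence:

import mindspore as ms

dec_inputs = ms.Tensor([[2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], ms.int32)
dec_in = dec_inputs[:, :-1]             # decoder input: drop the last token -> 19 positions
targets = dec_inputs[:, 1:].view(-1)    # labels: drop <bos> and flatten    -> 19 labels
print(targets)                          # [ 6  7  8  9 10  4 11  5  3  1  1  1  1  1  1  1  1  1  1]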
loss = loss_fn(logits, targets)
loss
Tensor(shape=[], dtype=Float32, value= 1.97173)
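For the loss of about 1.97 to reflect only the real tokens, loss_fn must skip the padding positions; it is presumably a cross-entropy configured with the pad index ignored, along these lines (trg_pad_idx == 1 here):

import mindspore.nn as nn

trg_pad_idx = 1
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)   # <pad> targets contribute no loss or gradient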
print ( "yangge mindspore 打卡第十一天 trainsformer之padding过程 2024-07-01" )
yangge mindspore 打卡第十一天 trainsformer之padding过程 2024-07-01
grad_fn = ops.value_and_grad(forward, None, optimizer.parameters)
grad_fn
<function mindspore.ops.composite.base._Grad.__call__.<locals>.after_grad(*args, **kwargs)>
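value_and_grad differentiates a forward function that returns the loss. Given the calls shown above, forward presumably looks like this sketch:

def forward(enc_inputs, dec_inputs):
    # teacher forcing: feed dec_inputs[:, :-1], score against the shifted targets
    logits, _, _, _ = model(enc_inputs, dec_inputs[:, :-1], src_pad_idx, trg_pad_idx)
    targets = dec_inputs[:, 1:].view(-1)
    return loss_fn(logits, targets)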
def train_step(enc_inputs, dec_inputs):
    loss, grads = grad_fn(enc_inputs, dec_inputs)
    optimizer(grads)  # apply the gradients
    return loss
def train(iterator, epoch=0):
    model.set_train(True)
    num_batches = len(iterator)
    total_loss = 0
    total_steps = 0

    with tqdm(total=num_batches) as t:
        t.set_description(f'EpochT: {epoch}')
        for src, src_len, trg in iterator():
            print("src:", src)
            print(src.shape)
            print("src_len:", src_len)
            print("------")
            print("trg:", trg)
            print(trg.shape)
            loss = train_step(src, trg)
            total_loss += loss.asnumpy()
            print("loss---->:", loss)
            total_steps += 1
            curr_loss = total_loss / total_steps
            t.set_postfix({'lossT': f'{curr_loss:.2f}'})
            t.update(1)

    return total_loss / total_steps
def evaluate(iterator):
    model.set_train(False)
    num_batches = len(iterator)
    total_loss = 0
    total_steps = 0

    with tqdm(total=num_batches) as t:
        for src, _, trg in iterator():
            loss = forward(src, trg)
            total_loss += loss.asnumpy()
            total_steps += 1
            curr_loss = total_loss / total_steps
            t.set_postfix({'loss': f'{curr_loss:.2f}'})
            t.update(1)

    return total_loss / total_steps
cache_dir= "./"
from download import download
from pathlib import Path
from tqdm import tqdm
import os
from mindspore import save_checkpoint
num_epochs = 2
best_valid_loss = float ( 'inf' )
ckpt_file_name = os. path. join( cache_dir, 'transformer.ckpt' )
for i in range ( num_epochs) :
train_loss = train( train_iterator, i)
EpochT: 0: 100%|██████████| 2/2 [00:00<00:00, 12.03it/s, lossT=2.18]
src: [[ 2 6 7 8 9 10 11 12 13 4 3 1 1 1 1 1 1 1 1 1]]
(1, 20)
src_len: [11]
------
trg: [[ 2 6 7 8 9 10 4 11 5 3 1 1 1 1 1 1 1 1 1 1]]
(1, 20)
shape: (19, 17)
loss---->: 2.2212398
src: [[ 2 5 14 15 16 17 18 5 19 20 4 3 1 1 1 1 1 1 1 1]]
(1, 20)
src_len: [12]
------
trg: [[ 2 4 12 13 14 4 15 16 5 3 1 1 1 1 1 1 1 1 1 1]]
(1, 20)
shape: (19, 17)
loss---->: 2.1410027
EpochT: 1: 0%| | 0/2 [00:00<?, ?it/s]
src: [[ 2 6 7 8 9 10 11 12 13 4 3 1 1 1 1 1 1 1 1 1]]
(1, 20)
src_len: [11]
------
trg: [[ 2 6 7 8 9 10 4 11 5 3 1 1 1 1 1 1 1 1 1 1]]
(1, 20)
shape: (19, 17)
EpochT: 1: 100%|██████████| 2/2 [00:00<00:00, 12.09it/s, lossT=1.96]
loss---->: 2.0625696
src: [[ 2 5 14 15 16 17 18 5 19 20 4 3 1 1 1 1 1 1 1 1]]
(1, 20)
src_len: [12]
------
trg: [[ 2 4 12 13 14 4 15 16 5 3 1 1 1 1 1 1 1 1 1 1]]
(1, 20)
shape: (19, 17)
loss---->: 1.8615117
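best_valid_loss, ckpt_file_name and save_checkpoint were set up above, but the loop as run only trains. The presumably intended epoch body also validates and checkpoints, roughly as follows (valid_iterator is an assumption, built like train_iterator):

for i in range(num_epochs):
    train_loss = train(train_iterator, i)
    valid_loss = evaluate(valid_iterator)
    if valid_loss < best_valid_loss:          # keep the best model seen so far
        best_valid_loss = valid_loss
        save_checkpoint(model, ckpt_file_name)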
import mindspore as ms

values = [2, 6, 7, 8, 9, 10, 11, 12, 13, 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]
SRC = ms.Tensor([values], dtype=ms.int32)
print(SRC)

values = [2, 6, 7, 8, 9, 10, 4, 11, 5, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
TRG = ms.Tensor([values], dtype=ms.int32)
print(TRG)
[[ 2 6 7 8 9 10 11 12 13 4 3 1 1 1 1 1 1 1 1 1]]
SRC.shape
(1, 20)
[[ 2 6 7 8 9 10 4 11 5 3 1 1 1 1 1 1 1 1 1 1]]
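With SRC and TRG rebuilt by hand, the padding masks that drove every table above can be checked directly; a minimal sketch (pad index 1 throughout this notebook):

pad_idx = 1
src_pad_mask = (SRC == pad_idx)              # True at the 9 padded source positions
trg_pad_mask = (TRG == pad_idx)              # True at the 10 padded target positions
print(src_pad_mask.astype(ms.int32))
# expected: [[0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1]]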