Transformer Implementation for NLP

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import numpy as np
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
for i in mpl,np,pd,sklearn,tf,keras:
    print(i.__name__,i.__version__)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
import tensorflow_datasets as tfds
examples, info = tfds.load(name = 'ted_hrlr_translate/pt_to_en',shuffle_files=True, with_info=True,as_supervised=True)
train_examples, val_examples = examples['train'], examples['validation']
print(info)
tfds.core.DatasetInfo(
    name='ted_hrlr_translate',
    version=0.0.1,
    description='Data sets derived from TED talk transcripts for comparing similar language pairs
where one is high resource and the other is low resource.
',
    urls=['https://github.com/neulab/word-embeddings-for-nmt'],
    features=Translation({
        'en': Text(shape=(), dtype=tf.string),
        'pt': Text(shape=(), dtype=tf.string),
    }),
    total_num_examples=54781,
    splits={
        'test': 1803,
        'train': 51785,
        'validation': 1193,
    },
    supervised_keys=('pt', 'en'),
    citation="""@inproceedings{Ye2018WordEmbeddings,
      author  = {Ye, Qi and Devendra, Sachan and Matthieu, Felix and Sarguna, Padmanabhan and Graham, Neubig},
      title   = {When and Why are pre-trained word embeddings useful for Neural Machine Translation},
      booktitle = {HLT-NAACL},
      year    = {2018},
      }""",
    redistribution_info=,
)
for pt,en in train_examples.take(5):
    print(pt.numpy())
    print(en.numpy())
    print()
b'os astr\xc3\xb3nomos acreditam que cada estrela da gal\xc3\xa1xia tem um planeta , e especulam que at\xc3\xa9 um quinto deles tem um planeta do tipo da terra que poder\xc3\xa1 ter vida , mas ainda n\xc3\xa3o vimos nenhum deles .'
b"astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them ."

b'o problema \xc3\xa9 que nunca vivi l\xc3\xa1 um \xc3\xbanico dia .'
b"except , i 've never lived one day of my life there ."

b'agora aqui temos imagens sendo extra\xc3\xaddas em tempo real diretamente do feed ,'
b'now here are live images being pulled straight from the feed .'

b'agora : um , dois , tr\xc3\xaas , vai .'
b'so : one , two , three , go .'

b'eventualmente , vamos ver se teremos todos os sentidos humanos empregues , e se vamos ter meios para viver a hist\xc3\xb3ria qualquer que seja a via escolhida .'
b'eventually , we can see if we will have all of our human senses employed , and we will have agency to live the story in any path we choose .'

Convert the data into subword format

tokenizer_en = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt,en in train_examples),target_vocab_size=2**13)
tokenizer_pt = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=2**13)
en_tokenizer = tokenizer_en
pt_tokenizer = tokenizer_pt
sample_string = 'hello world, tensorflow 2'
tokenizer_string  = en_tokenizer.encode(sample_string)
tokenizer_string
[3222, 439, 150, 7345, 1378, 2824, 2370, 7881]
origin_string = en_tokenizer.decode(tokenizer_string)
origin_string
'hello world, tensorflow 2'
assert origin_string == sample_string
for token in tokenizer_string:
    print('{}-->"{}"'.format(token,en_tokenizer.decode([token])))
3222-->"hell"
439-->"o "
150-->"world"
7345-->", "
1378-->"ten"
2824-->"sor"
2370-->"flow "
7881-->"2"

Add the start and end tokens

buffer_size = 2000
batch_size = 64
max_lenth = 40

def encode_to_subword(pt_sentence, en_sentence):  # convert a sentence pair into subword id sequences
    pt_sequence = [pt_tokenizer.vocab_size] \
    + pt_tokenizer.encode(pt_sentence.numpy()) \
    + [pt_tokenizer.vocab_size + 1]
    en_sequence = [en_tokenizer.vocab_size] \
    + en_tokenizer.encode(en_sentence.numpy()) \
    + [en_tokenizer.vocab_size + 1]
    return pt_sequence, en_sequence

Filter out examples longer than 40 (max_lenth) tokens

def filter_by_max_lenth(pt,en):
    return tf.logical_and(tf.size(pt) <= max_lenth,
                          tf.size(en) <= max_lenth)

Wrap the Python function as TensorFlow op nodes

def tf_encode_to_subword(pt_sentence, en_sentence):
    return tf.py_function(encode_to_subword,  # the Python function defined above
                          [pt_sentence, en_sentence],
                          [tf.int64, tf.int64])

Build the training set

# run the conversion as a graph op via .map()
train_dataset = train_examples.map(tf_encode_to_subword)
# filter out overly long examples
train_dataset = train_dataset.filter(filter_by_max_lenth)
# cache the data to speed up reading
train_dataset = train_dataset.cache()
# shuffle and build padded batches
train_dataset = train_dataset.shuffle(
    buffer_size).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))  # padded_shapes has two components; each dimension is padded to the batch maximum
# prefetch data
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

Build the validation set

valid_dataset = val_examples.map(tf_encode_to_subword)
valid_dataset = valid_dataset.filter(
    filter_by_max_lenth).padded_batch(
    batch_size, padded_shapes=([-1], [-1]))
for pt_batch,en_batch in valid_dataset.take(5):
    print(pt_batch.shape,en_batch.shape)
(64, 40) (64, 40)
(64, 38) (64, 40)
(64, 40) (64, 40)
(64, 39) (64, 39)
(64, 37) (64, 38)
pt_batch, en_batch = next(iter(valid_dataset))
pt_batch, en_batch
(<tf.Tensor: id=209188, shape=(64, 40), dtype=int64, numpy=
 array([[8214, 1259,    5, ...,    0,    0,    0],
        [8214,  299,   13, ...,    0,    0,    0],
        [8214,   59,    8, ...,    0,    0,    0],
        ...,
        [8214,   95,    3, ...,    0,    0,    0],
        [8214, 5157,    1, ...,    0,    0,    0],
        [8214, 4479, 7990, ...,    0,    0,    0]], dtype=int64)>,
 <tf.Tensor: id=209189, shape=(64, 40), dtype=int64, numpy=
 array([[8087,   18,   12, ...,    0,    0,    0],
        [8087,  634,   30, ...,    0,    0,    0],
        [8087,   16,   13, ...,    0,    0,    0],
        ...,
        [8087,   12,   20, ...,    0,    0,    0],
        [8087,   17, 4981, ...,    0,    0,    0],
        [8087,   12, 5453, ...,    0,    0,    0]], dtype=int64)>)

2. Positional Embedding

The positional encoding vector is added to the word embedding, so embeddings of words at nearby positions end up closer to each other; however, this does not directly encode relative position.

The angle-based positional encoding is defined as:

$PE_{(pos, 2i)} = \sin(pos / 10000^{2i/d_{model}})$
$PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i/d_{model}})$

# PE(pos, 2i)   = sin(pos / 10000^(2i/d_model))
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
# pos.shape: [sentence_length, 1]
# i.shape:   [1, d_model]
# result.shape: [sentence_length, d_model]
# angle_rates.shape == i.shape
def get_angles(pos, i, d_model):  # pos: position of the word in the sentence; i: index within the embedding
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def get_position_embedding(sentence_length, d_model):  # d_model: length of the word embedding
    angle_rads = get_angles(np.arange(sentence_length)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :],
                            d_model)
    # sines.shape:   [sentence_length, d_model/2]  (every other column, so half the width)
    # cosines.shape: [sentence_length, d_model/2]
    sines = np.sin(angle_rads[:, 0::2])   # 0::2 takes the even indices 0, 2, 4, ...
    # the 2i+1 entries use cos
    cosines = np.cos(angle_rads[:, 1::2])
    # position_embedding.shape: [sentence_length, d_model]
    position_embedding = np.concatenate([sines, cosines], axis=-1)
    # position_embedding.shape: [1, sentence_length, d_model]
    position_embedding = position_embedding[np.newaxis, ...]
    
    return tf.cast(position_embedding, dtype=tf.float32)
position_embedding = get_position_embedding(50, 512)
position_embedding.shape
TensorShape([1, 50, 512])
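A quick check (a sketch, not in the original notebook): pairwise dot products between position vectors confirm that nearby positions receive more similar encodings than distant ones.

# Sketch: compare one position's encoding against a neighbor and a distant position
pe = position_embedding[0]                       # (50, 512)
sim = tf.matmul(pe, pe, transpose_b=True)        # (50, 50) pairwise dot products
print(sim[10, 11].numpy(), sim[10, 40].numpy())  # the neighbor's similarity is larger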
def plot_position_embedding(position_embedding):
    plt.pcolormesh(position_embedding[0], cmap='RdBu')
    plt.xlabel('Depth')
    plt.xlim((0, 512))
    plt.ylabel('Position')
    plt.colorbar()
    plt.show()  # the left and right halves are the 2i (sin) and 2i+1 (cos) features respectively

Visualize the positional embedding

plot_position_embedding(position_embedding)

(Figure: heatmap of the positional embedding, output_35_0.png)

3. Masking

# two masks are needed: 1. a padding mask; 2. a look-ahead mask, so a token can only attend to the tokens before it

#batch_data.shape: [batch_size, seq_len]
def create_padding_mask(batch_data):
    # mark the padding entries (value 0)
    padding_mask = tf.cast(tf.math.equal(batch_data, 0), tf.float32)  # elementwise equality test
    # [batch_size, 1, 1, seq_len]
    # expand dims so the mask can be applied to the attention matrix
    return padding_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)
x = tf.constant([[7,6,0,0,1],
                 [1,2,3,0,0],
                 [0,0,0,4,5]])
create_padding_mask(x)
<tf.Tensor: id=209203, shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]],


       [[[1., 1., 1., 0., 0.]]]], dtype=float32)>
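The two singleton dimensions let the same mask broadcast over every head and every query position; a minimal sketch (not in the original) of that broadcasting:

# Sketch: a (batch, 1, 1, seq_len) mask broadcasts against attention logits
# of shape (batch, num_heads, seq_len_q, seq_len_k)
logits = tf.zeros((3, 8, 5, 5))
masked = logits + create_padding_mask(x) * -1e9
print(masked.shape)  # (3, 8, 5, 5)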
# mask test
create_padding_mask([[1,2,0,0,3],[3,4,5,0,0]])
<tf.Tensor: id=209211, shape=(2, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]]], dtype=float32)>

The look-ahead mask is used to mask the tokens that have not yet been predicted.

This means that to predict the third word, only the first and second words are used.

To predict the fourth word, only the first, second, and third words are used, and so on.

#[[1, 2, 3],   # entry 1: a token's attention to itself; 2 and 3: its attention to the tokens after it
# [4, 5, 6],
# [7, 8, 9]]
# tf.linalg.band_part zeroes the upper triangle, and 1 minus that marks the future positions
def create_look_ahead_mask(size):
    # 1 - (the diagonal plus all lower diagonals; num_lower=-1 keeps them all)
    # this builds, for each timestep, a mask over the not-yet-predicted tokens
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)  # num_lower=-1: keep the lower triangle, zero the upper
    return mask  # (seq_len, seq_len)
create_look_ahead_mask(3)
<tf.Tensor: id=209219, shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [0., 0., 1.],
       [0., 0., 0.]], dtype=float32)>

4. Scaled Dot-Product Attention

Attention takes three inputs: Q (query), K (key), and V (value). It is computed as:
$Attention(Q, K, V) = \mathrm{softmax}_k\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

The dot-product attention is scaled by the square root of the depth d_k, because larger depths make the dot products larger, and with the softmax this leads to vanishing gradients.
For example, suppose Q and K have mean 0 and variance 1. Their matrix product then has mean 0 and variance d_k. We use the square root of d_k for the scaling (and not any other number) because it makes the matmul of Q and K again have mean 0 and variance 1.

Masked tokens have -1e9 (effectively negative infinity) added to their logits, so after the softmax their weight is 0 and they have no influence on the other tokens.
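The variance claim can be checked numerically; a minimal sketch (not part of the original notebook), assuming standard-normal entries:

# Sketch: for q, k ~ N(0, 1), the dot product q . k has variance ~ d_k;
# dividing by sqrt(d_k) restores a variance of ~ 1
d_k = 512
q = np.random.randn(10000, d_k)
k = np.random.randn(10000, d_k)
dots = np.sum(q * k, axis=-1)
print(np.var(dots))                 # ~ 512
print(np.var(dots / np.sqrt(d_k)))  # ~ 1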

q:    shape == (..., seq_len_q, depth)
k:    shape == (..., seq_len_k, depth)
v:    shape == (..., seq_len_v, depth_v)
seq_len_k == seq_len_v
mask: shape == (..., seq_len_q, seq_len_k)

returns:

output: the weighted sum
attention_weights: the attention weights
def scaled_dot_product_attention(q, k, v, mask):
    # multiply query and key to get the matching scores
    # matmul_qk.shape: (..., seq_len_q, seq_len_k)
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # transpose the second matrix
    
    # scale by dk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)  # dk is the last dimension of k (the depth)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)  # logits: scores before the softmax
    # scaled_attention_logits.shape: (..., seq_len_q, seq_len_k)
    
    # masking
    if mask is not None:
        # push masked positions toward 0 after the softmax
        scaled_attention_logits += (mask * -1e9)
    
    # attention_weights.shape: (..., seq_len_q, seq_len_k)
    # the softmax yields the attention weights
    attention_weights = tf.nn.softmax(
        scaled_attention_logits, axis=-1)
    
    # multiply the attention weights by the values
    # seq_len_k == seq_len_v
    # output.shape: (..., seq_len_q, depth_v)
    output = tf.matmul(attention_weights, v)
    
    return output, attention_weights

Helper function for testing

def print_scaled_dot_product_attention(q, k, v):
    temp_out, temp_att = scaled_dot_product_attention(
    q, k, v, None)
    print('attention weight:')
    print(temp_att)
    print('output:')
    print(temp_out)


np.set_printoptions(suppress=True)  # suppress: avoid scientific notation when printing floats
# precision: int, optional, number of digits after the decimal point (default 8)
# threshold: int, optional, how many entries to print for large arrays before eliding with ...

temp_k = tf.constant([[10,0,0],
                      [0,10,0], 
                      [0,0,10],
                      [0,0,10]], dtype=tf.float32)  # (4, 3)  k: shape == (..., seq_len_k, depth)

temp_v = tf.constant([[   1,0],
                      [  10,0],  # this row gets selected
                      [ 100,5],
                      [1000,6]], dtype=tf.float32)  # (4, 2)  v: shape == (..., seq_len_v, depth_v)
# attend to the 2nd key and return the corresponding value
temp_q1 = tf.constant([[0,10,0]], dtype=tf.float32)  # (1, 3)  q: shape == (..., seq_len_q, depth)
print_scaled_dot_product_attention(temp_q1, temp_k, temp_v)
# attention_weights.shape: (..., seq_len_q, seq_len_k) -> (1, 4)
# output.shape:            (..., seq_len_q, depth_v)   -> (1, 2)
attention weight:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
output:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)
# attend to the duplicated keys (the 3rd and 4th) and return the averaged values
temp_q2 = tf.constant([[0,0,10]], dtype=tf.float32) #(1,3)
print_scaled_dot_product_attention(temp_q2, temp_k, temp_v)
attention weight:
tf.Tensor([[0.  0.  0.5 0.5]], shape=(1, 4), dtype=float32)
output:
tf.Tensor([[550.    5.5]], shape=(1, 2), dtype=float32)
# attend to the 1st and 2nd keys and return the averaged values
temp_q3 = tf.constant([[10,10,0]], dtype=tf.float32)
print_scaled_dot_product_attention(temp_q3, temp_k, temp_v)
attention weight:
tf.Tensor([[0.5 0.5 0.  0. ]], shape=(1, 4), dtype=float32)
output:
tf.Tensor([[5.5 0. ]], shape=(1, 2), dtype=float32)
# feed all the queries q1, q2, q3 at once
temp_q4 = tf.constant([ [0, 10, 0],[0, 0, 10], [10, 10, 0]], dtype=tf.float32)  # (3, 3)
print_scaled_dot_product_attention(temp_q4, temp_k, temp_v)
attention weight:
tf.Tensor(
[[0.  1.  0.  0. ]
 [0.  0.  0.5 0.5]
 [0.5 0.5 0.  0. ]], shape=(3, 4), dtype=float32)
output:
tf.Tensor(
[[ 10.    0. ]
 [550.    5.5]
 [  5.5   0. ]], shape=(3, 2), dtype=float32)

5. Multi-Head Attention

(Figure: multi-head attention, https://www.tensorflow.org/images/tutorials/transformer/multi_head_attention.png)

Multi-head attention consists of four parts:

  • linear layers and splitting into heads
  • scaled dot-product attention
  • concatenation of the heads
  • a final linear layer

Each multi-head attention block takes three inputs: Q (query), K (key), and V (value). They are passed through the first linear layers and split into multiple heads.

Note: the mask must be applied inside the dot-product attention, and the multi-head output needs tf.transpose to rearrange its dimensions.

Rather than using a single attention head, Q, K, and V are split into multiple heads because this lets the model jointly attend to information from different representation subspaces. After the split, each head has a reduced dimension, so the total computation cost is the same as single-head attention with full dimensionality.

'''
x-> Wq0-> q0
x-> Wk0-> k0
x-> Wv0-> v0

# in practice
q-> Wq0-> q0
k-> Wk0-> k0
v-> Wv0-> v0

q -> Wq0-> Q -> split -> q0,q1,q2...
'''

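Since the split is only a reshape plus a transpose, merging the heads back recovers the original tensor exactly; a small sketch (toy shapes assumed for illustration):

# Sketch: splitting into heads and merging back is a lossless round trip
b, s, dm, h = 2, 5, 8, 4  # toy batch_size, seq_len, d_model, num_heads
dep = dm // h
x0 = tf.random.uniform((b, s, dm))
split = tf.transpose(tf.reshape(x0, (b, -1, h, dep)), perm=[0, 2, 1, 3])  # (2, 4, 5, 2)
merged = tf.reshape(tf.transpose(split, perm=[0, 2, 1, 3]), (b, -1, dm))  # (2, 5, 8)
print(tf.reduce_all(tf.equal(x0, merged)).numpy())  # True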
# build the multi-head attention layer
class MutilHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):  # d_model: model dimension, num_heads: number of heads
        super(MutilHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        
        # d_model must divide evenly across the heads
        assert self.d_model % self.num_heads == 0
        # dimension of each head after the split
        self.depth = self.d_model // self.num_heads
        
        self.WQ = tf.keras.layers.Dense(d_model)
        self.WK = tf.keras.layers.Dense(d_model)
        self.WV = tf.keras.layers.Dense(d_model)
        # final linear layer after concatenation
        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        # split into heads, moving the head dimension in front of seq_len
        # x.shape: [batch_size, seq_len, d_model]
        # d_model = num_heads * depth
        # x -> [batch_size, num_heads, seq_len, depth]
        x = tf.reshape(x, 
                       (batch_size, -1, self.num_heads, self.depth))  # -1: seq_len
        return tf.transpose(x, perm=[0, 2, 1, 3])  # swap seq_len and num_heads
    
    def call(self, q, k, v, mask):
        batch_size = tf.shape(q)[0]
        
        # linear layers before the split, projecting q, k, v
        q = self.WQ(q)  # (batch_size, seq_len_q, d_model)
        k = self.WK(k)  # (batch_size, seq_len_k, d_model), seq_len_k == seq_len_v
        v = self.WV(v)  # (batch_size, seq_len_v, d_model)
        
        # split into heads
        # q.shape: (batch_size, num_heads, seq_len_q, depth)
        q = self.split_heads(q, batch_size)
        # k.shape: (batch_size, num_heads, seq_len_k, depth)
        k = self.split_heads(k, batch_size)
        # v.shape: (batch_size, num_heads, seq_len_v, depth)
        v = self.split_heads(v, batch_size)
        # scaled_attention_outputs.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        
        # run the scaled dot-product attention
        scaled_attention_outputs, attention_weights = \
        scaled_dot_product_attention(q, k, v, mask)
        # move the head dimension back behind seq_len
        scaled_attention_outputs = tf.transpose(
            scaled_attention_outputs, perm = [0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        # merge the heads
        concat_attention = tf.reshape(scaled_attention_outputs, 
                                      (batch_size, -1, self.d_model))
        
        # output.shape: (batch_size, seq_len_q, d_model)
        # final dense layer
        output = self.dense(concat_attention)
        
        return output, attention_weights

Test

temp_mha = MutilHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 256))
output, att = temp_mha(y, y, y, mask=None)  # invoke the layer
print(output.shape, att.shape)
(1, 60, 512) (1, 8, 60, 60)

Point-wise feed-forward network

def feed_forward_network(d_model, dff):#dff:dim of feed_forward_network
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])
sample_fnn = feed_forward_network(512, 2048)
sample_fnn(tf.random.uniform((64, 50, 512))).shape
TensorShape([64, 50, 512])
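Since Dense acts on the last axis, the same weights are applied independently at every position, which is what "point-wise" means here; a small sketch (not in the original notebook):

# Sketch: the same kernel is applied at every sequence position
dense_demo = tf.keras.layers.Dense(4)
a = tf.random.uniform((1, 2, 8))
out_full = dense_demo(a)[0, 1]
out_single = dense_demo(a[:, 1:2, :])[0, 0]
print(np.allclose(out_full.numpy(), out_single.numpy()))  # True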

6. Encoder and Decoder

  • The input passes through N encoder layers, producing an output for every word/token in the sequence.
  • The decoder attends to the encoder's output together with its own input (self-attention) to predict the next word.

Encoder layer

Each encoder layer contains the following sublayers:

  • multi-head attention (with a padding mask)
  • a feed-forward network

Each sublayer has a residual connection around it, followed by a layer normalization. Residual connections help avoid the vanishing-gradient problem in deep networks.
The output of each sublayer is LayerNorm(x + Sublayer(x)), where the normalization is applied over the d_model-dimensional vectors. The Transformer stacks n such encoder layers.
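A one-line check (a sketch, not in the original) that LayerNormalization normalizes over the last (d_model) axis, i.e. each position independently:

# Sketch: per-position normalization over the feature axis
ln_demo = keras.layers.LayerNormalization(epsilon=1e-6)
normed = ln_demo(tf.random.uniform((2, 3, 4)) * 100.0)
print(tf.reduce_mean(normed, axis=-1).numpy())  # ~0 at every position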

class EncoderLayer(keras.layers.Layer):
    """
    x->self attention -> add&normalize & dropout
    -> feed_forward -> add& normalize & dropout
    """
    def __init__(self, d_model, num_heads, dff, rate = 0.1):
        super(EncoderLayer,self).__init__()
        self.mha = MutilHeadAttention(d_model,num_heads) #MutilHeadAttention
        self.ffn = feed_forward_network(d_model,dff)#feed forward network
        
        self.layer_norm1 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm2 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
    
    def call(self,x,training,encoder_padding_mask):
        # x.shape:(batch_size,seq_len,dim),dim = d_model
        # atten_output.shape:(batch_size,seq_len,d_model)
        attn_output,_ = self.mha(x,x,x,encoder_padding_mask)
        attn_output = self.dropout1(attn_output,training = training)
        out1 = self.layer_norm1(x + attn_output)
        # ffn.shape:(batch_size,seq_len,dim),dim = d_model
        # out2.shape:(batch_size,seq_len,d_model)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output,training = training)
        out2 = self.layer_norm2(out1 + ffn_output)
        
        return out2
sample_encoder_layer = EncoderLayer(512,8,2048)
sample_input = tf.random.uniform((64,50,512))
sample_output = sample_encoder_layer(sample_input,False,None)
print(sample_output.shape)
(64, 50, 512)
class DecoderLayer(keras.layers.Layer):
    """
    x->self attention -> add&normalize & dropout->
    out1->encoding_outputs->attention->  add& normalize & dropout->
    out2->feed_forward ->add& normalize & dropout->out3
    """
    def __init__(self, d_model, num_heads, dff, rate = 0.1):
        super(DecoderLayer,self).__init__()
        
        self.mha1 = MutilHeadAttention(d_model,num_heads) #MutilHeadAttention
        self.mha2 = MutilHeadAttention(d_model,num_heads)
        
        self.ffn = feed_forward_network(d_model,dff)#feed forward network
        
        self.layer_norm1 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm2 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        self.layer_norm3 = keras.layers.LayerNormalization(
            epsilon=1e-6)
        
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)
    
    def call(self,x,encoding_outputs,training,
             decoder_mask,encoder_decoder_padding_mask):
        # decoder_mask: the combination of the look_ahead_mask and the decoder_padding_mask
        # x.shape:(batch_size,seq_len,dim),dim = d_model
        # atten_output.shape:(batch_size,input_seq_len,d_model)
        
        #attn1,out1.shape: (batch_size,target_seq_len,d_model))
        attn1,attn_weights1 = self.mha1(x,x,x,decoder_mask)
        attn1 = self.dropout1(attn1,training = training)
        out1 = self.layer_norm1(attn1 + x)
        
        #attn2,out2.shape: (batch_size,target_seq_len,d_model))
        attn2,attn_weights2 = self.mha2(
            out1,encoding_outputs,encoding_outputs,
            encoder_decoder_padding_mask)
        attn2 = self.dropout2(attn2, training = training)
        out2 = self.layer_norm2(attn2 + out1)
        
        #ffn_output,out3.shape: (batch_size,target_seq_len,d_model))
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output,training = training)
        out3 = self.layer_norm3(ffn_output + out2)
        
        return out3, attn_weights1, attn_weights2
sample_decoder_layer = DecoderLayer(512,8,2048)
sample_decoder_input = tf.random.uniform((64,60,512))
sample_decoder_output, sample_attn_weights1, sample_attn_weights2 = sample_decoder_layer(
    sample_decoder_input,sample_output,False,None,None)
print(sample_decoder_output.shape)
print(sample_attn_weights1.shape)
print(sample_attn_weights2.shape)
(64, 60, 512)
(64, 8, 60, 60)
(64, 8, 60, 50)
class EncoderModel(keras.layers.Layer):
    def __init__(self, num_layers,input_vocab_size, max_lenth,
                 d_model,num_heads,dff,rate = 0.1):
        super(EncoderModel,self).__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.max_lenth = max_lenth
        
        self.embedding = keras.layers.Embedding(input_vocab_size,
                                                self.d_model)
        #  position_embedding.shape : (1,max_lenth,d_model)
        self.position_embedding = get_position_embedding(max_lenth,d_model)
        
        self.dropout = keras.layers.Dropout(rate)
        self.encoder_layers =[
            EncoderLayer(d_model,num_heads,dff,rate) 
            for _ in range(self.num_layers)]
        
    def call (self, x, training, encoder_padding_mask):
        # x.shape:(batch_size,input_seq_len)
        input_seq_len = tf.shape(x)[1]
        tf.debugging.assert_less_equal(
            input_seq_len , self.max_lenth,
            "input_seq_len should be less or equal to self.max_lenth ") 
        
        # x.shape:(batch_size,input_seq_len,d_model)
        x = self.embedding(x)
        # scale the embeddings by sqrt(d_model) so they are not drowned out by the positional encoding
        x *= tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        x += self.position_embedding[:,:input_seq_len,:]
        
        x = self.dropout(x, training = training)
        
        for i in range(self.num_layers):
            x = self.encoder_layers[i](x,training,
                                       encoder_padding_mask)
            
            # x.shape:(batch_size,input_seq_len,d_model)
        return x        
sample_encoder_model = EncoderModel(2,8500,max_lenth,
                                   512,8,2048)  # instantiate the encoder
sample_encoder_model_input = tf.random.uniform((64,37))
sample_encoder_model_output = sample_encoder_model(
    sample_encoder_model_input,False,encoder_padding_mask = None)  # call it on sample input
print(sample_encoder_model_output.shape)
(64, 37, 512)
class DecoderModel(keras.layers.Layer):
    def __init__(self, num_layers,target_vocab_size, max_lenth,
                 d_model,num_heads,dff,rate = 0.1):
        super(DecoderModel,self).__init__()
        self.num_layers = num_layers
        self.max_lenth = max_lenth
        self.d_model = d_model
        
        self.embedding = keras.layers.Embedding(target_vocab_size,
                                                d_model)
        #  position_embedding.shape : (1,max_lenth,d_model)
        self.position_embedding = get_position_embedding(max_lenth,
                                                         d_model)
        
        self.dropout = keras.layers.Dropout(rate)
        self.decoder_layers =[
            DecoderLayer(d_model, num_heads, dff, rate) 
            for _ in range(self.num_layers)]
        
    def call (self, x ,encoding_outputs,training, 
              decoder_mask, encoder_decoder_padding_mask):
        # x.shape:(batch_size,output_seq_len)
        output_seq_len = tf.shape(x)[1]
        tf.debugging.assert_less_equal(
            output_seq_len , self.max_lenth,
            "output_seq_len should be less or equal to self.max_lenth ")
        
        attention_weights = {}
        # x.shape:(batch_size,output_seq_len,d_model)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model,tf.float32))
        x += self.position_embedding[:,:output_seq_len,:]
        
        x = self.dropout(x, training = training)
        
        for i in range(self.num_layers):
            x, att1,att2 = self.decoder_layers[i](
                x,encoding_outputs,training,
                decoder_mask,encoder_decoder_padding_mask)
            attention_weights[
                'decoder_layer{}_att1'.format(i+1)] = att1
            attention_weights[
                'decoder_layer{}_att2'.format(i+1)] = att2
            # x.shape:(batch_size,output_seq_len,d_model)
        return x,attention_weights
sample_decoder_model = DecoderModel(2,8000,max_lenth,
                                   512,8,2048)  # instantiate the decoder
sample_decoder_model_input = tf.random.uniform((64,35))
sample_decoder_model_output, sample_decoder_model_att \
= sample_decoder_model(
    sample_decoder_model_input,
    sample_encoder_model_output,
    training = False,
    decoder_mask = None,
    encoder_decoder_padding_mask = None)  # call it on sample input
print(sample_decoder_model_output.shape)

for key in sample_decoder_model_att:
    print(sample_decoder_model_att[key].shape)  # look up the dict by key
(64, 35, 512)
(64, 8, 35, 35)
(64, 8, 35, 37)
(64, 8, 35, 35)
(64, 8, 35, 37)
sample_decoder_model_att.keys()
dict_keys(['decoder_layer1_att1', 'decoder_layer1_att2', 'decoder_layer2_att1', 'decoder_layer2_att2'])
sample_decoder_model_att['decoder_layer1_att1'].shape
TensorShape([64, 8, 35, 35])
for i,j in sample_decoder_model_att.items():
    print(j.shape)
(64, 8, 35, 35)
(64, 8, 35, 37)
(64, 8, 35, 35)
(64, 8, 35, 37)
class Transformer(keras.Model):
    def __init__(self, num_layers,input_vocab_size, target_vocab_size,
                 max_lenth, d_model, num_heads, dff, rate = 0.1):
        super(Transformer,self).__init__()
        
        self.encoder_model = EncoderModel(
            num_layers,input_vocab_size,max_lenth,
            d_model,num_heads,dff,rate)
        
        self.decoder_model = DecoderModel(
            num_layers,target_vocab_size,max_lenth,
            d_model,num_heads,dff,rate)
        
        self.final_layer = keras.layers.Dense(target_vocab_size)
        
    def call(self,inp,tar,training,encoder_padding_mask,
            decoder_mask,encoder_decoder_padding_mask):
        #encoding_outputs.shape: (batch_size,input_seq_len,d_model)
        encoding_outputs = self.encoder_model(
            inp,training,encoder_padding_mask)
        
        #decoding_outputs.shape: (batch_size,output_seq_len,d_model)
        decoding_outputs,attention_weights = self.decoder_model(
            tar,encoding_outputs,training,
            decoder_mask,encoder_decoder_padding_mask)
        
        #predictions.shape: (batch_size,output_seq_len,target_vocab_size)
        predictions = self.final_layer(decoding_outputs)
        
        return predictions,attention_weights
sample_transformer = Transformer(2,8500,8000,max_lenth,
                                 512,8,2048,rate=0.1)
temp_input = tf.random.uniform((64,26))
temp_target = tf.random.uniform((64,31))
predictions , attention_weights = sample_transformer(
    temp_input,temp_target,training = False,
    encoder_padding_mask = None,
    decoder_mask = None,
    encoder_decoder_padding_mask = None)
print(predictions.shape)
for key in attention_weights:
    print(attention_weights[key].shape)
(64, 31, 8000)
(64, 8, 31, 31)
(64, 8, 31, 26)
(64, 8, 31, 31)
(64, 8, 31, 26)
# 1 initializes model
# 2 define loss, optimizer, learning_rate schedule
# 3 train_step
# 4 train process
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = pt_tokenizer.vocab_size + 2
target_vocab_size = en_tokenizer.vocab_size + 2

drop_rate = 0.1

transformer = Transformer(num_layers,
                          input_vocab_size,
                          target_vocab_size,
                          max_lenth,d_model,
                          num_heads,dff,
                          drop_rate)
# lrate = (d_model ** -0.5) * min(step_num ** -0.5,
#                                 step_num * warmup_steps ** -1.5)
    
class CustomizedSchedule(keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self,d_model,warmup_steps = 4000):
        super (CustomizedSchedule,self).__init__()
        
        self.d_model = tf.cast(d_model,tf.float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self,step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step*(self.warmup_steps **(-1.5))
        arg3 = tf.math.rsqrt(self.d_model)
        
        return arg3*tf.math.minimum(arg1,arg2)

learning_rate = CustomizedSchedule(d_model)
optimizer = keras.optimizers.Adam(learning_rate,
                                  beta_1=0.9,
                                  beta_2=0.98,
                                  epsilon=1e-9)
temp_learning_rate_schedule = CustomizedSchedule(d_model)


plt.plot(temp_learning_rate_schedule(
    tf.range(40000,dtype=tf.float32)))
plt.xlabel("Train step")
plt.ylabel("Learning rate")
Text(0, 0.5, 'Learning rate')

(Figure: the warmup learning-rate schedule, output_84_1.png)
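As a sanity check (a sketch, not in the original notebook): the two branches of the min() cross at step == warmup_steps, where the schedule peaks at d_model ** -0.5 * warmup_steps ** -0.5.

# Sketch: peak learning rate at step == warmup_steps (4000)
print(temp_learning_rate_schedule(tf.constant(4000.0)).numpy())
print((128 ** -0.5) * (4000 ** -0.5))  # ~0.0014, should match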

loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits = True,reduction='none')

def loss_function(real,pred):
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_ = loss_object(real,pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)
    """
    Encoder :
     - encode_padding_mask :##作用在encoder layers上(self attention of EncoderLayer)
     Decoder:
     - look_ahead_mask #目标值的后半段还没有预测出来  #作用在decoder的 layers上 (self attention of DecoderLayer)
     - encoder_decoder_padding_mask  # 做 encoder decoder attention的时候,
     decoder的隐含状态与Encoder的每一个输出做attention,但Encoder上面有padding,
     不应该在Encoder上的padding花注意力,要把Encoder上的padding mask 掉
      ## look_ahead_mask 作用在Encoder和 Decoder之间的attention的layers上
     -decoder_padding_mask #也要把decoder上的padding mask掉  #作用在decoder的 layers上  (self attention of DecoderLayer)
     
     第一selfattention层上只接受一个mask的输入,但是有 look_ahead_mask ,decoder_padding_mask
     两个,所以要先合并,与操作
    """
'\nEncoder :\n - encode_padding_mask :##作用在encoder layers上(self attention of EncoderLayer)\n Decoder:\n - look_ahead_mask #目标值的后半段还没有预测出来  #作用在decoder的 layers上 (self attention of DecoderLayer)\n - encoder_decoder_padding_mask  # 做 encoder decoder attention的时候,\n decoder的隐含状态与Encoder的每一个输出做attention,但Encoder上面有padding,\n 不应该在Encoder上的padding花注意力,要把Encoder上的padding mask 掉\n  ## look_ahead_mask 作用在Encoder和 Decoder之间的attention的layers上\n -decoder_padding_mask #也要把decoder上的padding mask掉  #作用在decoder的 layers上  (self attention of DecoderLayer)\n \n 第一selfattention层上只接受一个mask的输入,但是有 look_ahead_mask ,decoder_padding_mask\n 两个,所以要先合并,与操作\n'
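A small sketch (not in the original) of that combination:

# Sketch: tf.maximum masks a position if EITHER mask would mask it
toy_tar = tf.constant([[1, 2, 3, 0, 0]])
pad = create_padding_mask(toy_tar)                   # (1, 1, 1, 5)
look = create_look_ahead_mask(tf.shape(toy_tar)[1])  # (5, 5)
print(tf.maximum(pad, look))                         # broadcasts to (1, 1, 5, 5)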
def create_masks(inp,tar):
    encoder_padding_mask = create_padding_mask(inp)
    encoder_decoder_padding_mask = create_padding_mask(inp)
    
    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    decoder_padding_mask = create_padding_mask(tar)
    decoder_mask = tf.maximum(decoder_padding_mask,
                              look_ahead_mask)
    #print(encoder_padding_mask.shape)
    #print(encoder_decoder_padding_mask.shape)
    #print(look_ahead_mask.shape)
    #print(decoder_padding_mask.shape)
    return encoder_padding_mask,decoder_mask,encoder_decoder_padding_mask
tem_inp ,tem_tar = iter(train_dataset.take(1)).next()
print(tem_inp.shape)
print(tem_tar.shape)
(64, 36)
(64, 37)
create_masks(tem_inp,tem_tar)
(<tf.Tensor: id=4703997, shape=(64, 1, 1, 36), dtype=float32, numpy=
 array([[[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        ...,
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]]], dtype=float32)>,
 <tf.Tensor: id=4704029, shape=(64, 1, 37, 37), dtype=float32, numpy=
 array([[[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        ...,
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 1., 1., ..., 1., 1., 1.],
          [0., 0., 1., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          ...,
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.],
          [0., 0., 0., ..., 1., 1., 1.]]]], dtype=float32)>,
 <tf.Tensor: id=4704004, shape=(64, 1, 1, 36), dtype=float32, numpy=
 array([[[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        ...,
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]],
 
 
        [[[0., 0., 0., ..., 1., 1., 1.]]]], dtype=float32)>)
train_loss = keras.metrics.Mean(name = 'train_loss')
train_accuracy = keras.metrics.SparseCategoricalAccuracy(
    name = 'train_accuracy')
def train_step(inp, tar):
    # teacher forcing: the decoder input is the target shifted right,
    # and the model learns to predict the next token at every position
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    
    encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask \
    = create_masks(inp, tar_inp)
    
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, True,
                                     encoder_padding_mask,
                                     decoder_mask,
                                     encoder_decoder_padding_mask)
        loss = loss_function(tar_real, predictions)
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(
        zip(gradients, transformer.trainable_variables))
    train_loss(loss)
    train_accuracy(tar_real, predictions)
epochs = 20
for epoch in range(epochs):
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (inp,tar)) in enumerate(train_dataset):
        train_step(inp,tar)
        if batch % 100 == 0:
            print ('Epoch{} Batch{} Loss {:.4f} Accuracy{:.4f}'.format(
                epoch + 1,batch, train_loss.result(), 
                train_accuracy.result()))
    
    print('Epoch {} Loss{:.4f} Accuracy {:.4f}'.format(
        epoch +1 ,train_loss.result(),train_accuracy.result()))
    print('Time take for 1 epoch: {} secs\n'.format(
        time.time() - start))
Epoch1 Batch0 Loss 3.1582 Accuracy0.0612
Epoch1 Batch100 Loss 2.9546 Accuracy0.0675
Epoch1 Batch200 Loss 2.8710 Accuracy0.0769
Epoch1 Batch300 Loss 2.7864 Accuracy0.0846
Epoch1 Batch400 Loss 2.7197 Accuracy0.0910
Epoch1 Batch500 Loss 2.6597 Accuracy0.0966
Epoch1 Batch600 Loss 2.6142 Accuracy0.1014
Epoch1 Batch700 Loss 2.5711 Accuracy0.1055
Epoch 1 Loss2.5702 Accuracy 0.1056
Time take for 1 epoch: 278.3582556247711 secs

Epoch2 Batch0 Loss 2.3368 Accuracy0.1402
Epoch2 Batch100 Loss 2.2700 Accuracy0.1360
Epoch2 Batch200 Loss 2.2701 Accuracy0.1379
Epoch2 Batch300 Loss 2.2473 Accuracy0.1390
Epoch2 Batch400 Loss 2.2216 Accuracy0.1403
Epoch2 Batch500 Loss 2.2048 Accuracy0.1419
Epoch2 Batch600 Loss 2.1930 Accuracy0.1435
Epoch2 Batch700 Loss 2.1752 Accuracy0.1445
Epoch 2 Loss2.1749 Accuracy 0.1445
Time take for 1 epoch: 251.6378345489502 secs

Epoch3 Batch0 Loss 1.9136 Accuracy0.1476
Epoch3 Batch100 Loss 2.0544 Accuracy0.1580
Epoch3 Batch200 Loss 2.0410 Accuracy0.1597
Epoch3 Batch300 Loss 2.0189 Accuracy0.1614
Epoch3 Batch400 Loss 1.9965 Accuracy0.1633
Epoch3 Batch500 Loss 1.9785 Accuracy0.1658
Epoch3 Batch600 Loss 1.9642 Accuracy0.1684
Epoch3 Batch700 Loss 1.9454 Accuracy0.1704
Epoch 3 Loss1.9449 Accuracy 0.1704
Time take for 1 epoch: 249.77115440368652 secs

Epoch4 Batch0 Loss 1.8713 Accuracy0.1787
Epoch4 Batch100 Loss 1.8053 Accuracy0.1884
Epoch4 Batch200 Loss 1.7894 Accuracy0.1905
Epoch4 Batch300 Loss 1.7705 Accuracy0.1921
Epoch4 Batch400 Loss 1.7558 Accuracy0.1939



(Training was interrupted manually at this point: a KeyboardInterrupt surfaced inside the tf.reshape call of MutilHeadAttention.split_heads, during the decoder's first self-attention; the full traceback is omitted here.)
def evaluate(inp_sentence):
    # wrap the encoded input sentence with the <start>/<end> ids
    input_id_sentence = [pt_tokenizer.vocab_size] \
    + pt_tokenizer.encode(inp_sentence) + [pt_tokenizer.vocab_size + 1]
    
    # encoder_input.shape: (1, input_sentence_length)
    encoder_input = tf.expand_dims(input_id_sentence, 0)
    
    # the decoder starts from the <start> token and decodes greedily
    decoder_input = tf.expand_dims([en_tokenizer.vocab_size], 0)
    
    for i in range(max_lenth):
        encoder_padding_mask, decoder_mask, encoder_decoder_padding_mask \
        = create_masks(encoder_input, decoder_input)
        
        predictions, attention_weights = transformer(
            encoder_input,
            decoder_input,
            False,
            encoder_padding_mask,
            decoder_mask,
            encoder_decoder_padding_mask)
        
        # keep only the logits of the last decoded position
        predictions = predictions[:, -1, :]
        
        predictions_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)
        
        # stop once the <end> token is produced
        if tf.equal(predictions_id, en_tokenizer.vocab_size + 1):
            return tf.squeeze(decoder_input, axis=0), attention_weights
        
        # append the predicted id and decode the next token
        decoder_input = tf.concat([decoder_input, [predictions_id]], axis=-1)
    return tf.squeeze(decoder_input, axis=0), attention_weights
def plot_encoder_decoder_attention(attention, input_sentence,
                                   result,layer_name):
    fig = plt.figure(figsize=(16,8))
    
    input_id_sentence = pt_tokenizer.encode(input_sentence)
    
    attention = tf.squeeze(attention[layer_name],axis=0)
    
    for head in range (attention.shape[0]):
        ax = fig.add_subplot(2,4,head + 1)
        
        ax.matshow(attention[head][:-1,:])
        
        fontdict = {'fontsize':10}
        
        ax.set_xticks(range(len(input_id_sentence) + 2))
        ax.set_yticks(range(len(result)))
        
        ax.set_ylim(len(result) - 1.5 ,-0.5)
        
        ax.set_xticklabels(
            ['<start>']+[pt_tokenizer.decode([i]) for i in input_id_sentence],
            fontdict = fontdict,rotation = 90)
        
        ax.set_yticklabels([en_tokenizer.decode([i]) for i in result if i <en_tokenizer.vocab_size],
                           fontdict = fontdict)
        ax.set_xlabel('Head{}'.format(head + 1))
    plt.tight_layout()
    plt.show()
def translate(input_sentence, layer_name=''):
    result, attention_weights = evaluate(input_sentence)
    
    predicted_sentence = en_tokenizer.decode(
        [i for i in result if i < en_tokenizer.vocab_size])
    
    print("Input: {}".format(input_sentence))
    print("predicted translation: {}".format(predicted_sentence))
    
    if layer_name:
        plot_encoder_decoder_attention(attention_weights, input_sentence, result, layer_name)
translate('está Muito frio aqui.')
Input: está Muito frio aqui.
predicted translation: it 's called the j here .
translate('esta é a minha vida.')
Input: esta é a minha vida.
predicted translation: this is my life .
translate('esta é a minha vida.', layer_name='decoder_layer4_att2')
Input: esta é a minha vida.
predicted translation: this is my life .

(Figure: per-head encoder-decoder attention weights for decoder_layer4_att2, output_99_1.png)
