The Transformer model consists of an encoder and a decoder. The encoder is a stack of six identical layers, each made up of a multi-head self-attention sublayer and a feed-forward network. The decoder is likewise a stack of six layers, but each one contains two attention sublayers plus a feed-forward network: a masked self-attention sublayer, and a cross-attention sublayer that attends over the encoder's final output. Below is a simple example of machine translation with a Transformer, implemented with TensorFlow.
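To make the layer structure above concrete, here is a minimal sketch of one encoder layer in Keras; it is illustrative only and not part of the original code. The class name TransformerEncoderLayer and the embed_dim / num_heads / ff_dim hyperparameters are assumptions for the example.
import keras
from keras import layers

class TransformerEncoderLayer(layers.Layer):
    # One encoder layer: multi-head self-attention + feed-forward network,
    # each followed by a residual connection and layer normalization
    def __init__(self, embed_dim, num_heads, ff_dim, **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)])
        self.norm1 = layers.LayerNormalization()
        self.norm2 = layers.LayerNormalization()

    def call(self, inputs):
        attn_out = self.attention(query=inputs, value=inputs, key=inputs)
        x = self.norm1(inputs + attn_out)   # residual + layer norm
        return self.norm2(x + self.ffn(x))  # residual + layer norm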
Training part
import re
import string

import numpy as np
import tensorflow as tf
import keras
from keras import layers
from keras import Model
from keras.layers import TextVectorization
# Load the first 30,000 sentence pairs. Note: despite the variable name,
# spa.txt is the Spanish-English dataset, so `france` holds Spanish text.
france = []
english = []
with open('./dataset/spa-eng/spa.txt', 'r', encoding='utf-8') as file:
    for idx, line in enumerate(file.readlines()):
        if idx >= 30000:
            break
        line = line.split('\t')
        english.append(line[0])
        france.append('[start] ' + line[1] + ' [end]')
# config
num_words = 15000
eng_text_length = 20
fra_text_length = 20
batch_size = 64
# English tokenizer
eng_tokenizer = TextVectorization(
    max_tokens=num_words, output_mode="int",
    output_sequence_length=eng_text_length)
eng_tokenizer.adapt(english)
# Strip punctuation (plus the Spanish "¿"), but keep "[" and "]" so the
# [start]/[end] markers survive standardization
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
# Target-language tokenizer; sequences are one token longer than
# fra_text_length so they can be shifted into decoder inputs and targets below
fra_tokenizer = TextVectorization(
    max_tokens=num_words, output_sequence_length=fra_text_length + 1,
    standardize=custom_standardization)
fra_tokenizer.adapt(france)
# set training data
# Teacher forcing: the decoder input drops the last token and the target
# drops the first, so the model learns to predict token t+1 from tokens <= t
encoder_input_data = eng_tokenizer(english)
decoder_input_data = fra_tokenizer(france)[:, :-1]
decoder_output_data = fra_tokenizer(france)[:, 1:]
# print(encoder_input_data[:10])
# print(decoder_output_data[:10])
en_word_to_idx = {v: k for k, v in enumerate(eng_tokenizer.get_vocabulary())}
en_idx_to_word = {k: v for k, v in enumerate(eng_tokenizer.get_vocabulary())}
fra_word_to_idx = {v: k for k, v in enumerate(fra_tokenizer.get_vocabulary())}
fra_idx_to_word = {k: v for k, v in enumerate(fra_tokenizer.get_vocabulary())}
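As a quick optional sanity check (an addition, not in the original code), the lookup tables can decode one tokenized sample back to words, which also makes the one-step shift between decoder input and target visible:
# Optional sanity check (illustrative): decode one tokenized sample back to
# words; padding index 0 maps to the empty string and is skipped
sample_in = decoder_input_data[0].numpy()
sample_out = decoder_output_data[0].numpy()
print(' '.join(fra_idx_to_word[i] for i in sample_in if i != 0))
print(' '.join(fra_idx_to_word[i] for i in sample_out if i != 0))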
# Angle rates for the sinusoidal positional encoding: pos / 10000^(2i/d_model)
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates
def positional_encoding(pos, d_model):
    '''
    :param pos: number of positions, i.e. a word's position in the sentence
                (i indexes the d_model axis)
    :param d_model: hidden-state dimension, equivalent to num_units
    :return: positional encoding, shape=[1, position_num, d_model]; the
             leading dimension of 1 broadcasts over batch_size
    '''
    def get_angles(position, i):
        # here i plays the role of 2i or 2i+1 in the formula;
        # returns shape=[position_num, d_model]
        return position / np.power(10000., 2 * (i // 2) / np.float32(d_model))

    angle_rads = get_angles(np.arange(pos)[:, np.newaxis],
                            np.arange(d_model)[np.newaxis, :])
    # PE(pos, 2i) = sin(angle), PE(pos, 2i+1) = cos(angle)
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)
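A quick shape check (an illustrative addition; the 256 here is an assumed d_model, not a value from the original code):
# Illustrative only: with an assumed d_model of 256, the encoding for
# fra_text_length positions should have shape (1, 20, 256)
pe = positional_encoding(fra_text_length, 256)
print(pe.shape)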