【TensorFlow】Code Implementation of a Seq2Seq Model & the Attention Mechanism

This article shows how to build a seq2seq model for machine translation in TensorFlow, covering the encoder-decoder structure, the introduction of an attention mechanism, and how to decode from a trained model. It focuses on improving translation quality with a bidirectional LSTM and attention, and demonstrates how to implement dynamic prediction during decoding with tf.while_loop.
import tensorflow as tf

SRC_TRAIN_DATA = '/path/to/data/train.en'  # source-language input file
TRG_TRAIN_DATA = '/path/to/data/train.zh'  # target-language input file
CHECKPOINT_PATH = '/path/to/seq2seq_ckpt'  # path for saving model checkpoints
HIDDEN_SIZE = 1024                         # size of the LSTM hidden layers
NUM_LAYERS = 2                             # number of stacked LSTM layers
SRC_VOCAB_SIZE = 10000                     # source vocabulary size
TRG_VOCAB_SIZE = 4000                      # target vocabulary size
BATCH_SIZE = 100                           # training batch size
NUM_EPOCH = 5                              # number of training epochs
KEEP_PROB = 0.8                            # dropout keep probability
MAX_GRAD_NORM = 5                          # upper bound on the gradient norm, used to control gradient explosion
SHARE_EMB_AND_SOFTMAX = True               # share parameters between the softmax layer and the embedding layer

class NMTModel(object):
	def __init__(self):
		# Define the LSTM structure used by the encoder and the decoder.
		self.enc_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
		self.dec_cell = tf.nn.rnn_cell.MultiRNNCell(
			[tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE) for _ in range(NUM_LAYERS)])
		
		# Define separate word embeddings for the source and target languages.
		self.src_embedding = tf.get_variable("src_emb", [SRC_VOCAB_SIZE, HIDDEN_SIZE])
		self.trg_embedding = tf.get_variable("trg_emb", [TRG_VOCAB_SIZE, HIDDEN_SIZE])

		# Variables for the softmax layer.
		# ! The embedding matrix has shape (vocab_size, hidden_size) while the softmax weight needs shape (hidden_size, vocab_size), hence the transpose. Sharing parameters between the embedding layer and the softmax layer not only greatly reduces the number of parameters but also improves the final model quality.
		if SHARE_EMB_AND_SOFTMAX:
			self.softmax_weight = tf.transpose(self.trg_embedding) 
		else:
			self.softmax_weight = tf.get_variable("weight", [HIDDEN_SIZE, TRG_VOCAB_SIZE])
		self.softmax_biases = tf.get_variable("biases", [TRG_VOCAB_SIZE])

	# The forward function defines the model's forward computation graph.
	def forward(self, src_input, src_size, trg_input, trg_label, trg_size):
		batch_size = tf.shape(src_input)[0]

		# Convert the input and output word ids into word embeddings.
		src_emb = tf.nn.embedding_lookup(self.src_embedding, src_input)
		trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)

		# Apply dropout to the word embeddings.
		src_emb = tf.nn.dropout(src_emb, KEEP_PROB)
		trg_emb = tf.nn.dropout(trg_emb, KEEP_PROB)

		# Build the encoder with dynamic_rnn. For each batch, dynamic_rnn reads two inputs: the content of the input data and the length of each sequence. For every example in the batch, once dynamic_rnn has read the given number of steps, it skips the remaining positions and simply copies the previous step's result forward to the later time steps.
		# The encoder reads the word embedding at every position of the source sentence and outputs the hidden state of the last step, enc_state.
		with tf.variable_scope("encoder"):
			enc_outputs, enc_state = tf.nn.dynamic_rnn(
				self.enc_cell, src_emb, src_size, dtype=tf.float32)
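The overview above also mentions improving the basic model with a bidirectional LSTM encoder and an attention mechanism. The snippet below is a minimal sketch of how that could look in the same TF 1.x style, using tf.nn.bidirectional_dynamic_rnn and the tf.contrib.seq2seq attention wrappers; the variable names (enc_cell_fw, attention_cell, dec_outputs) and the choice of Bahdanau attention are illustrative assumptions rather than the article's original code.

```python
# Sketch (assumed, not the original listing): replace the plain encoder above with a
# bidirectional LSTM and wrap the decoder cell with Bahdanau attention (TF 1.x API).
with tf.variable_scope("encoder"):
    enc_cell_fw = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
    enc_cell_bw = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
    # bidirectional_dynamic_rnn returns a tuple of (forward, backward) outputs;
    # concatenate them into a single [batch, time, 2 * HIDDEN_SIZE] memory tensor.
    enc_outputs, enc_state = tf.nn.bidirectional_dynamic_rnn(
        enc_cell_fw, enc_cell_bw, src_emb, src_size, dtype=tf.float32)
    enc_outputs = tf.concat([enc_outputs[0], enc_outputs[1]], -1)

with tf.variable_scope("decoder"):
    # Additive (Bahdanau) attention over the encoder outputs, masked by src_size.
    attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
        HIDDEN_SIZE, enc_outputs, memory_sequence_length=src_size)
    # Wrap the decoder cell so that every decoding step attends over the source.
    attention_cell = tf.contrib.seq2seq.AttentionWrapper(
        self.dec_cell, attention_mechanism, attention_layer_size=HIDDEN_SIZE)
    # With attention, the decoder no longer needs enc_state as its initial state.
    dec_outputs, _ = tf.nn.dynamic_rnn(
        attention_cell, trg_emb, trg_size, dtype=tf.float32)
```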
A Seq2Seq model is a sequence-to-sequence model, commonly used for tasks such as machine translation, speech recognition, summarization, and dialogue generation. The encoder encodes the input sequence into a vector, and the decoder decodes that vector into the output sequence. Below is a simple Seq2Seq implementation using Python and the TensorFlow library (Keras API):

```python
import tensorflow as tf
import numpy as np

# Maximum input/output sequence length and vocabulary size
MAX_LENGTH = 100
VOCAB_SIZE = 10000

# Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(enc_units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, batch_size):
        return tf.zeros((batch_size, self.enc_units))

# Attention layer (additive / Bahdanau attention)
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

# Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units):
        super(Decoder, self).__init__()
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(dec_units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        return x, state, attention_weights

# Loss function and optimizer
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    # Mask out padding positions (id 0) when averaging the loss.
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

# Full model
class Seq2Seq(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_size):
        super(Seq2Seq, self).__init__()
        self.batch_size = batch_size
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units)

    def call(self, inputs):
        inp, targ = inputs
        enc_hidden = self.encoder.initialize_hidden_state(self.batch_size)
        enc_output, enc_hidden = self.encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        # Teacher forcing: start from the first target token of each sequence.
        dec_input = tf.expand_dims(targ[:, 0], 1)
        predictions = []
        for t in range(1, targ.shape[1]):
            predictions_batch, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predictions.append(predictions_batch)
            dec_input = tf.expand_dims(targ[:, t], 1)
        return tf.stack(predictions, axis=1)

# Training
model = Seq2Seq(VOCAB_SIZE, 256, 1024, 1024, 64)

def train_step(inp, targ):
    loss = 0
    with tf.GradientTape() as tape:
        # The model shifts the target sequence internally for teacher forcing,
        # so the labels are targ[:, 1:].
        predictions = model([inp, targ])
        loss = loss_function(targ[:, 1:], predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss

# Inference / testing (assumes the tokenizers inp_lang / targ_lang, the trained
# encoder / decoder, preprocess_sentence, units and the max_length_* values are
# defined by the data-preprocessing code, which is not shown here)
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(
        [inputs], maxlen=max_length_inp, padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence, attention_plot
```
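The overview also mentions implementing dynamic prediction at decoding time with tf.while_loop. A minimal sketch of that idea, in the TF 1.x graph style of the NMTModel class above, is shown below: a greedy decoding loop that keeps feeding the most recently predicted word back into the decoder until it emits the end-of-sentence token. SOS_ID, EOS_ID, MAX_DEC_LEN and the method name inference are assumptions for illustration, not values from the article.

```python
# Sketch (assumed): greedy decoding with tf.while_loop, written as a method of NMTModel.
SOS_ID = 1          # assumed id of the sentence-start token
EOS_ID = 2          # assumed id of the sentence-end token
MAX_DEC_LEN = 100   # assumed upper bound on the decoded length

def inference(self, enc_state):
    # A dynamically sized TensorArray collects the generated word ids.
    init_array = tf.TensorArray(dtype=tf.int32, size=0,
                                dynamic_size=True, clear_after_read=False)
    init_array = init_array.write(0, SOS_ID)
    init_loop_var = (enc_state, init_array, 0)

    def continue_loop_condition(state, trg_ids, step):
        # Keep looping until <eos> is produced or the length limit is reached.
        return tf.reduce_all(tf.logical_and(
            tf.not_equal(trg_ids.read(step), EOS_ID),
            tf.less(step, MAX_DEC_LEN - 1)))

    def loop_body(state, trg_ids, step):
        # Embed the last generated word and run a single decoder step.
        trg_input = [trg_ids.read(step)]
        trg_emb = tf.nn.embedding_lookup(self.trg_embedding, trg_input)
        dec_outputs, next_state = self.dec_cell.call(state=state, inputs=trg_emb)
        # Project to vocabulary logits with the (shared) softmax parameters
        # and greedily pick the most likely next word.
        output = tf.reshape(dec_outputs, [-1, HIDDEN_SIZE])
        logits = tf.matmul(output, self.softmax_weight) + self.softmax_biases
        next_id = tf.argmax(logits, axis=1, output_type=tf.int32)
        trg_ids = trg_ids.write(step + 1, next_id[0])
        return next_state, trg_ids, step + 1

    state, trg_ids, step = tf.while_loop(
        continue_loop_condition, loop_body, init_loop_var)
    return trg_ids.stack()
```

Here enc_state would come from running the encoder on a single source sentence (batch size 1), and the returned ids would then be mapped back to words with the target vocabulary.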