Seq2Seq + Attention

  • The Seq2Seq idea
    Seq2Seq (sequence-to-sequence) maps an input sequence to an output sequence that may have a different length. It is also known as the encoder-decoder architecture: both encoder and decoder are RNNs. The encoder reads the input sequence and compresses it into a fixed-length vector (its final hidden state); that vector initializes the decoder's hidden state, and the decoder then generates the output sequence step by step.
  • Why add Attention
    When the input sequence is long, squeezing all of it into a single fixed-length vector loses information, and the RNN also suffers from vanishing gradients over long distances. Attention lets the decoder look at every encoder hidden state at each decoding step and weight them, instead of relying only on the final state, which gives noticeably better results on long sequences.

Seq2Seq + Attention explained with code (the running example is a machine translation model):

  • 1 Data preprocessing
    a. Load the data and wrap every sentence with a start token (< BOS >) and an end token (< EOS >); in theory the source side can do without them, but the target side must have them;
    b. Build the vocabularies, using < UNK > for unknown words and < PAD > as the padding token, and map every word to an index;
    c. Build the inverse dictionaries so that an index can be looked up back to its word;
    d. Convert the data (source and target sentences) into lists of indices (a short usage sketch follows the code below).
# Load the data
# Returns two lists of tokenized sentences: en (English) and cn (Chinese)
def load_data(FILE_NAME):
	en = []; cn =[]

	with open(os.path.join(NEW_FILE_DIR, FILE_NAME + ("_mini" if IS_MINI else "") + "_0.txt"), "rb") as f:
		for line in f.readlines():
			line = line.decode("utf8").strip()

			line = line.split("\t")

			en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
			cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])

	return en, cn

UNK_IDX = 0; PAD_IDX = 1
# Build the vocabulary
def build_dict(sentences, max_words = 50000):
	word_count = Counter()
	for sentence in sentences:
		for s in sentence:
			word_count[s] += 1

	ls = word_count.most_common(max_words)
	total_words = len(ls) + 2

	word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}
	word_dict["UNK"] = UNK_IDX
	word_dict["PAD"] = PAD_IDX

	return word_dict, total_words

# Build the corresponding inverse dictionary
def get_inv_dict(ori_dict):

	return {v: w for w, v in ori_dict.items()}

# Convert the English and Chinese sentences into index lists
def data2number(en_data_set, cn_data_set, en_dict, cn_dict, sort_by_len = True):

	# English sentences as indices
	out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_data_set]

	# Chinese sentences as indices
	out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_data_set]

	# Sort by the length of the English sentence
	def len_argsort(seq):
		return sorted(range(len(seq)), key = lambda x: len(seq[x]))

	if sort_by_len:
		sorted_index = len_argsort(out_en_sentences)

		out_en_sentences = [out_en_sentences[i] for i in sorted_index]
		out_cn_sentences = [out_cn_sentences[i] for i in sorted_index]

	return out_en_sentences, out_cn_sentences

# Turn a data file into index lists
def get_word_set(FILE_NAME):
	global inv_en_dict, inv_cn_dict, en_dict, cn_dict

	en_data, cn_data = load_data(FILE_NAME)

	en_set, cn_set = data2number(en_data, cn_data, en_dict, cn_dict)

	return en_set, cn_set
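To make the data flow concrete, here is a rough usage sketch of the helpers above. The file prefix "train", the dev-set name and all variable names are assumptions for illustration (and NEW_FILE_DIR, IS_MINI and nltk are assumed to be set up as in the original script), not part of the original project:
# Hypothetical driver code; names and file prefixes are illustrative only.
train_en, train_cn = load_data("train")            # tokenized sentences wrapped in BOS/EOS
en_dict, en_total_words = build_dict(train_en)     # word -> index, with UNK = 0 and PAD = 1
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = get_inv_dict(en_dict)                # index -> word, used to print translations later
inv_cn_dict = get_inv_dict(cn_dict)
train_en_ids, train_cn_ids = get_word_set("train") # sentences as index lists, sorted by English length
dev_en_ids, dev_cn_ids = get_word_set("dev")       # assumed dev split, prepared the same way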
  • 2 Data preprocessing (part two)
    a. Split the data into minibatches of size batch_size;
    b. Pad each minibatch and record the true lengths; every final training example is a batch of padded source sentences with their lengths, plus the corresponding padded target sentences with their lengths (a quick sanity check follows the code below).
# Split the sentences into minibatches
def get_minibatches(n, minibatch_size, shuffle = True):
	idx_list = np.arange(0, n, minibatch_size) # start index of every minibatch

	if shuffle:
		np.random.shuffle(idx_list)

	minibatches = []

	for idx in idx_list:
		minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))

	return minibatches

# Pad a batch of sequences and keep their true lengths (used later to build masks)
def gen_mask_data(seqs):
	lengths = [len(seq) for seq in seqs]
	n_samples = len(seqs)
	max_len = np.max(lengths)

	x = np.zeros((n_samples, max_len)).astype('int32')
	x_lengts = np.array(lengths).astype("int32")

	for idx, seq in enumerate(seqs):
		x[idx, :lengths[idx]] = seq

	return x, x_lengts

# Generate the training examples, one tuple per minibatch
def gen_examples(en_sentences, cn_sentences, batch_size):
	print(len(en_sentences), "gen_examples")
	minibatches = get_minibatches(len(en_sentences), batch_size)

	data_set = []
	for batch_index in minibatches:
		ex_en_sentences = [en_sentences[i] for i in batch_index]
		ex_cn_sentences = [cn_sentences[i] for i in batch_index]

		ex_en, ex_en_len = gen_mask_data(ex_en_sentences)
		ex_cn, ex_cn_len = gen_mask_data(ex_cn_sentences)

		data_set.append((ex_en, ex_en_len, ex_cn, ex_cn_len))

	return data_set
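A quick sanity check of the batching helpers on toy index lists (the sentences and the batch size below are made up):
# Toy example: three already-indexed sentence pairs, batch size 2.
toy_en = [[2, 5, 7, 3], [2, 8, 3], [2, 4, 4, 9, 3]]
toy_cn = [[2, 11, 3], [2, 6, 6, 3], [2, 9, 3]]
batches = gen_examples(toy_en, toy_cn, batch_size = 2)
ex_en, ex_en_len, ex_cn, ex_cn_len = batches[0]
print(ex_en)        # sentences of one minibatch, zero-padded to the longest one in it
print(ex_en_len)    # their true lengths, used later to build masks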
  • 3 The encoder
    • Structure
      a. A word embedding layer that maps every word index to a dense vector; its weight has shape (vocab_size, embed_size);
      b. A bidirectional GRU as the encoding network, whose input-to-hidden weights have shape (embed_size, enc_hidden_size);
      c. Dropout, to reduce overfitting;
      d. A linear layer of shape (2 * enc_hidden_size, dec_hidden_size) that maps the final (concatenated) hidden state to the decoder's initial state (a shape check follows the code below).
import torch
import torch.nn as nn
# Encoder class
class PlainEncoder(nn.Module):	
	def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout = 0.2):
		super(PlainEncoder, self).__init__()

		# Word embedding
		self.embed = nn.Embedding(vocab_size, embed_size)
		# Bidirectional GRU
		self.rnn = nn.GRU(embed_size, enc_hidden_size, batch_first = True,
			bidirectional = True)

		# Dropout
		self.dropout = nn.Dropout(dropout)

		# Map the concatenated bidirectional hidden state to the decoder's h_0
		self.fc = nn.Linear(enc_hidden_size * 2, dec_hidden_size)

	def forward(self, x, lengths):

		# Sort the batch by sentence length, longest first (required for packing)
		sorted_len, sorted_idx = lengths.sort(0, descending=True)

		# Reorder the data accordingly
		x_sorted = x[sorted_idx.long()]	

		# Embed and apply dropout
		embedded = self.dropout(self.embed(x_sorted))

		# Pack the padded batch
		packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, 
			sorted_len.long().cpu().data.numpy(), batch_first = True)

		# Run the GRU
		packed_out, hid = self.rnn(packed_embedded)

		# Unpack back to a padded tensor
		out, _ = nn.utils.rnn.pad_packed_sequence(packed_out,
			batch_first = True)

		_, original_idx = sorted_idx.sort(0, descending = False)
		out = out[original_idx.long()].contiguous()
		hid = hid[:, original_idx.long()].contiguous()

		hid = torch.cat([hid[-2], hid[-1]], dim = 1)
		hid = torch.tanh(self.fc(hid)).unsqueeze(0)

		return out, hid
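A minimal shape check for the encoder; the hyper-parameters below are toy values chosen only for illustration:
encoder = PlainEncoder(vocab_size = 1000, embed_size = 32, enc_hidden_size = 64, dec_hidden_size = 64)
x = torch.randint(0, 1000, (4, 7))       # a padded batch of 4 sentences, max length 7
lengths = torch.tensor([7, 5, 4, 2])     # their true lengths
out, hid = encoder(x, lengths)
print(out.shape)   # torch.Size([4, 7, 128]) -> (batch, seq_len, 2 * enc_hidden_size)
print(hid.shape)   # torch.Size([1, 4, 64])  -> (1, batch, dec_hidden_size), the decoder's h_0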
  • 4 The decoder
    • Structure
      a. A word embedding layer, with the same role as in the encoder;
      b. A single-layer GRU as the decoding network, whose input-to-hidden weights have shape (embed_size, dec_hidden_size) so that its hidden state matches the h_0 produced by the encoder;
      c. The attention module, whose output has last dimension dec_hidden_size;
      d. A linear layer that maps dec_hidden_size to vocab_size;
      e. Dropout, to reduce overfitting (a small example of the attention mask follows the code below).
import torch
import torch.nn as nn
import torch.nn.functional as F

from aModel.attention import Attention
class PlainDecoder(nn.Module):
	def __init__(self, vocab_size, embed_size, enc_hidden_size, dec_hidden_size, dropout = 0.2):
		super(PlainDecoder, self).__init__()
		# Word embedding
		self.embed = nn.Embedding(vocab_size, embed_size)

		# Attention module
		self.attention = Attention(enc_hidden_size, dec_hidden_size)

		# Decoder GRU; its hidden size must be dec_hidden_size to match the h_0 coming from the encoder
		self.rnn = nn.GRU(embed_size, dec_hidden_size, batch_first = True)

		# Project to the vocabulary
		self.out = nn.Linear(dec_hidden_size, vocab_size)

		# Dropout
		self.dropout = nn.Dropout(dropout)

	# Build the attention mask
	def create_mask(self, x_len, y_len):
		device = x_len.device
		max_x_len = x_len.max()
		max_y_len = y_len.max()

		x_mask = torch.arange(max_x_len, device = device)[None, :] < x_len[:, None]
		y_mask = torch.arange(max_y_len, device = device)[None, :] < y_len[:, None]

		mask = torch.logical_not(x_mask[:, :, None] * y_mask[:, None, :]).byte()
		return mask

	def forward(self, ctx, ctx_lengths, y, y_lengths, hid):
		sorted_len, sorted_idx = y_lengths.sort(0, descending=True)

		y_sorted = y[sorted_idx.long()]
		hid = hid[:, sorted_idx.long()]

		y_sorted = self.dropout(self.embed(y_sorted))

		packed_embeded = nn.utils.rnn.pack_padded_sequence(y_sorted,
			sorted_len.long().cpu().data.numpy(), batch_first = True)

		out, hidden = self.rnn(packed_embeded, hid)

		packed_out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first = True)

		_, origin_sort = sorted_idx.sort(0, descending = False)

		# print(packed_out.size())

		out = packed_out[origin_sort.long()].contiguous()
		hidden = hidden[:, origin_sort.long()].contiguous()

		# output = F.log_softmax(self.out(out), -1)

		# The rest is attention-related and is explained in the Attention section
		mask = self.create_mask(y_lengths, ctx_lengths)

		output, atten = self.attention(out, ctx, mask)
		# output: batch_size, output_len, dec_hidden_size
		# atten: batch_size, output_len, context_len

		output = F.log_softmax(self.out(output), -1)
		# output: batch_size, output_len, vocal_size

		return output, hidden, atten
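To see what create_mask produces, here is a small sketch with made-up lengths. A value of 1 marks a (decoder step, encoder position) pair where at least one side is padding; those are the scores the attention later fills with -1e6:
decoder = PlainDecoder(vocab_size = 1000, embed_size = 32, enc_hidden_size = 64, dec_hidden_size = 64)
y_len = torch.tensor([3, 2])   # target-side lengths
x_len = torch.tensor([4, 2])   # source-side lengths
mask = decoder.create_mask(y_len, x_len)
print(mask.shape)   # torch.Size([2, 3, 4]) -> (batch, output_len, context_len)
print(mask[1])      # positions beyond length 2 on either side are 1, i.e. masked out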
  • 5 Attention
    The attention module reproduces the model from Luong et al.'s paper. The code comes first, followed by the correspondence with the formulas and a short shape trace.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
	def __init__(self, enc_hidden_size, dec_hidden_size):
		super(Attention, self).__init__()
		self.enc_hidden_size = enc_hidden_size
		self.dec_hidden_size = dec_hidden_size
		
		# Linear layer used to compute the attention scores
		self.linear_in = nn.Linear(enc_hidden_size * 2, dec_hidden_size, bias = False)
		
		# Linear layer that combines c_t and s_t into the final attentional output
		self.linear_out = nn.Linear(enc_hidden_size * 2 + dec_hidden_size, dec_hidden_size)
	
	# output is the decoder's output, context is the encoder's output
	def forward(self, output, context, mask):
		# output: batch_size, output_len, dec_hidden_size
		# context: batch_size, context_len, 2 * enc_hidden_size

		batch_size = output.size(0)
		output_len = output.size(1)
		input_len = context.size(1)

		context_in = self.linear_in(context.view(batch_size * input_len, -1)).view(
			batch_size, input_len, -1)
		# batch_size, context_len, dec_hidden_size

		atten = torch.bmm(output, context_in.transpose(1, 2))
		# batch_size, output_len, context_len
		# context_in.transpose(1, 2): batch_size, dec_hidden_size, context_len

		atten = atten.masked_fill(mask.bool(), -1e6)  # give padded positions a very negative score

		atten = F.softmax(atten, dim = 2)
		# batch_size, output_len, context_len

		context = torch.bmm(atten, context)
		# batch_size, output_len, 2 * enc_hidden_size

		output = torch.cat((context, output), dim = 2)
		# batch_size, output_len, 2 * enc_hidden_size + dec_hidden_size

		output = output.view(batch_size * output_len, -1)
		output = torch.tanh(self.linear_out(output))
		# batch_size * output_len, dec_hidden_size

		output = output.view(batch_size, output_len, -1)

		return output, atten

Correspondence with the formulas
$h_t$ is context and $s_t$ is output.
$e_{tj} = \mathrm{score}(s_t, h_j)$ corresponds to atten before the softmax.
$\alpha_{tj} = \mathrm{softmax}(e_{tj})$ corresponds to atten after the softmax.
$c_t = \sum_j \alpha_{tj} h_j$ corresponds to context after the weighted sum (the second bmm).
$\hat{y}_t = \tanh(W[c_t; s_t])$ corresponds to the final output, and $[c_t; s_t]$ to the result of torch.cat.
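A shape trace through the Attention module under toy sizes (the numbers are assumptions, not values from the post):
atten_layer = Attention(enc_hidden_size = 64, dec_hidden_size = 64)
output = torch.randn(2, 5, 64)       # decoder states s_t: (batch, output_len, dec_hidden_size)
context = torch.randn(2, 7, 128)     # encoder states h_t: (batch, context_len, 2 * enc_hidden_size)
mask = torch.zeros(2, 5, 7).byte()   # nothing is masked in this toy case
out, attn = atten_layer(output, context, mask)
print(out.shape)          # torch.Size([2, 5, 64])  -> \hat{y}_t, before the vocabulary projection
print(attn.shape)         # torch.Size([2, 5, 7])   -> one weight per (decoder step, encoder position)
print(attn.sum(dim = 2))  # every row of weights sums to 1 after the softmax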

  • 6 Seq2Seq
    The Seq2Seq class simply ties encoder and decoder together; translate performs greedy decoding. A usage sketch follows the code.
# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
import torch.nn.functional as F

class PlainSeq2Seq(nn.Module):
	def __init__(self, encoder, decoder):
		super(PlainSeq2Seq, self).__init__()
		self.encoder = encoder
		self.decoder = decoder

	def forward(self, x, x_lengths, y, y_lengths):
		encoder_out, hidden = self.encoder(x, x_lengths)

		out, hidden, atten = self.decoder(
			ctx = encoder_out,
			ctx_lengths = x_lengths,
			y = y,
			y_lengths = y_lengths,
			hid = hidden)
		return  out, atten

	def translate(self, x, x_lengths, y, max_length = 10):
		encoder_out, hid = self.encoder(x, x_lengths)

		preds = []
		batch_size = x.shape[0]

		attns = []

		for i in range(max_length):
			output, hid, atten = self.decoder(
				ctx = encoder_out,
				ctx_lengths = x_lengths,
				y = y,
				y_lengths = torch.ones(batch_size).long().to(y.device),
				hid = hid)

			y = output.max(2)[1].view(batch_size, 1)
			preds.append(y)
			attns.append(atten)

		return torch.cat(preds, 1), torch.cat(attns, 1)
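Putting the three modules together, here is a minimal sketch of greedy decoding with translate; the toy sizes and the assumption that "BOS" has index 2 in the target dictionary are illustrative only:
encoder = PlainEncoder(vocab_size = 5000, embed_size = 32, enc_hidden_size = 64, dec_hidden_size = 64)
decoder = PlainDecoder(vocab_size = 4000, embed_size = 32, enc_hidden_size = 64, dec_hidden_size = 64)
model = PlainSeq2Seq(encoder, decoder)

x = torch.randint(0, 5000, (3, 8))     # a padded batch of 3 source sentences
x_lengths = torch.tensor([8, 6, 5])
bos = torch.full((3, 1), 2).long()     # assumed index of "BOS" in the target dictionary
model.eval()
with torch.no_grad():
	preds, attns = model.translate(x, x_lengths, bos, max_length = 10)
print(preds.shape)   # torch.Size([3, 10])    -> 10 greedily chosen target indices per sentence
print(attns.shape)   # torch.Size([3, 10, 8]) -> attention over the 8 source positions at every step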
  • 7 The loss function (a masked cross-entropy over non-padding positions; a toy example follows the code below)
import torch
import torch.nn as nn

class LanguageModelCriterion(nn.Module):
	
	def __init__(self):
		super(LanguageModelCriterion, self).__init__()

	def forward(self, input, target, mask):
		# input: batch_size, output_len, vocab_size
		# target: batch_size, output_len

		input = input.contiguous().view(-1, input.size(2))
		target = target.contiguous().view(-1, 1)
		mask = mask.contiguous().view(-1, 1)

		output = -input.gather(1, target) * mask
		output = torch.sum(output) / torch.sum(mask)

		return output
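A toy example of what the criterion computes: the masked mean negative log-likelihood of the gold tokens, so padded positions contribute nothing. The tensors below are made up:
loss_fn = LanguageModelCriterion()
log_probs = torch.log_softmax(torch.randn(2, 4, 10), dim = -1)   # (batch, output_len, vocab_size); the decoder already outputs log-probabilities
target = torch.randint(0, 10, (2, 4))                            # gold token indices
mask = torch.tensor([[1., 1., 1., 0.], [1., 1., 0., 0.]])        # 0 marks padded positions
print(loss_fn(log_probs, target, mask).item())                   # average of -log p(gold token) over the 5 unmasked positions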
  • 8 Training the model
def train(model, en_data, cn_data, eval_en_data, eval_cn_data, 
	loss_fn, optimizer, num_epochs = 600):

	data = gen_examples(en_data, cn_data, batch_size)
	eval_data = gen_examples(eval_en_data, eval_cn_data, batch_size)

	eval_loss_list = []

	no_adv_step = 0; no_adv_stop_step = 20
	for epoch in range(num_epochs):
		model.train()

		total_num_words = total_loss = 0

		for it, (en_data, en_data_len, cn_data, cn_data_len) in enumerate(data):
			en_data = torch.from_numpy(en_data).to(device).long()
			en_data_len = torch.from_numpy(en_data_len).to(device).long()
			
			cn_input = torch.from_numpy(cn_data[:, :-1]).to(device).long()
			cn_output = torch.from_numpy(cn_data[:, 1:]).to(device).long()

			cn_data_len = torch.from_numpy(cn_data_len - 1).to(device).long()

			cn_data_len[cn_data_len <= 0] = 1

			output, hidden = model(en_data, en_data_len, cn_input, cn_data_len)
			# print(cn_data_len.size(), "1111", cn_data_len.max())
			output_mask = torch.arange(cn_data_len.max().item(), device = device)[None, :] < cn_data_len[:, None]
			output_mask = output_mask.float()

			loss = loss_fn(output, cn_output, output_mask)

			# Accumulate the loss statistics
			num_words = torch.sum(cn_data_len).item()
			total_num_words += num_words
			total_loss += num_words * loss.item()

			# Update the model
			optimizer.zero_grad()
			loss.backward()
			nn.utils.clip_grad_norm_(model.parameters(), 5)
			optimizer.step()

			if it % 100 == 0:
				print("\nEpoch: ", epoch, "iteration: ", 
					it, "loss: ", loss.item())

		print("\n\nEpoch: ", epoch, "Training loss:", total_loss / total_num_words)

		if epoch % 1 == 0:
			eval_loss = evalute(model, eval_data, loss_fn)

			print("\n\neval_loss: ", eval_loss)
			if len(eval_loss_list) == 0 or min(eval_loss_list) > eval_loss:
				torch.save(model.state_dict(), "./aModel/best-model")
				no_adv_step = 0
			else:
				no_adv_step += 1
				if no_adv_step >= no_adv_stop_step:
					break

			eval_loss_list.append(eval_loss)

	print("\n\nthe best dev answer is: ", min(eval_loss_list))

For completeness, the evalute helper used above (a sketch of the driver code that ties everything together follows it):

def evalute(model, data, loss_fn):
	model.eval()

	total_num_words = total_loss = 0
	with torch.no_grad():
		for it, (en_data, en_data_len, cn_data, cn_data_len) in enumerate(data):
			en_data = torch.from_numpy(en_data).to(device).long()
			en_data_len = torch.from_numpy(en_data_len).to(device).long()
			
			cn_input = torch.from_numpy(cn_data[:, :-1]).to(device).long()
			cn_output = torch.from_numpy(cn_data[:, 1:]).to(device).long()

			cn_data_len = torch.from_numpy(cn_data_len - 1).to(device).long()

			cn_data_len[cn_data_len <= 0] = 1

			output, hidden = model(en_data, en_data_len, cn_input, cn_data_len)
			output_mask = torch.arange(cn_data_len.max().item(), device = device)[None, :] < cn_data_len[:, None]
			output_mask = output_mask.float()

			loss = loss_fn(output, cn_output, output_mask)
			num_words = torch.sum(cn_data_len).item()

			total_num_words += num_words
			total_loss += loss.item() * num_words

	return total_loss / total_num_words
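Finally, train and evalute rely on a few module-level names (device, batch_size) and on the model and optimizer being built elsewhere. A hedged sketch of that driver code, with hyper-parameters chosen purely for illustration and the data variables taken from the preprocessing sketch earlier:
# Hypothetical driver code; every number and name below is an assumption for illustration.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64

encoder = PlainEncoder(vocab_size = en_total_words, embed_size = 100, enc_hidden_size = 100, dec_hidden_size = 100)
decoder = PlainDecoder(vocab_size = cn_total_words, embed_size = 100, enc_hidden_size = 100, dec_hidden_size = 100)
model = PlainSeq2Seq(encoder, decoder).to(device)

loss_fn = LanguageModelCriterion().to(device)
optimizer = torch.optim.Adam(model.parameters())

train(model, train_en_ids, train_cn_ids, dev_en_ids, dev_cn_ids, loss_fn, optimizer, num_epochs = 600)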