论文理论
-
文章:Effective Approaches to Attention-based Neural Machine Translation
链接:https://pan.baidu.com/s/1qZieu90lKPMwwj2BxoO2MA
提取码:6pv0
模型理解
本文提出的attention主要是基于翻译任务,模型框架使用encoder(embedding)-decoder(embedding)+attention,提出了两种attention:global attention 和local attention,本文仅深入研究global attention。模型理解如下:
代码复现
- 数据获取:https://pan.baidu.com/s/11_rvu-yc4JbLFPHTVw7oyw 提取码:593v
- 简介:内含英文-国语,英文-粤语的数据集(txt)
- 数据处理:考虑对中文(jieba+keras.preprocessing.text -Tokenizer)英文分词(keras.preprocessing.text -Tokenizer),再建立词典。即输入数据的时间步以每个句子的词语个数计算
'''
ATTENTION!!!!!!
1、用tencent中文预训练词向量,考虑先对数据jieba分词
2、tencent中文预训练词向量维度是200,即embed_dim=200固定
'''
#global attention
#import tensorflow as tf
from keras.models import Model
from keras.layers import Input, LSTM, Dense,Concatenate,Softmax
from keras.optimizers import Adam
import numpy as np
from keras.models import load_model
from keras import backend as K
import keras
from keras.models import Sequential
from keras.models import Model
from keras.layers import Masking, Embedding, Bidirectional, LSTM, Dense, Input, TimeDistributed, Activation
from keras.preprocessing import sequence
# Default hyper-parameters
batch_size = 64  # Batch size for training.
epochs = 400  # Number of epochs to train for.
            # NOTE(review): the model-building functions below hard-code epochs=200
            # in model.fit and do not use this constant — confirm which is intended.
latent_dim = 258  # Latent dimensionality of the encoding space.
                  # NOTE(review): 258 is an unusual size — possibly a typo for 256; confirm.
#num_samples = 3000 # Number of samples to train on.
LEARNING_RATE = 0.002  # NOTE(review): defined but the models compile with string
                       # optimizers ('rmsprop'/'adam'), so this is never applied.
EMBED_DIM = 200  # Tencent pre-trained word vectors are 200-d, so this is fixed.
ATTENTION_DIM = 200  # Hidden size of the additive-attention scoring MLP.
######################################## Data processing ########################################
# Read the parallel corpus: one "english<TAB>chinese" pair per line.
with open('cmn.txt', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.split('\n')
# Upper bound on the number of lines kept; the file itself has ~3128 samples.
data = data[0:10000]
#### jieba word segmentation ####################
import jieba
import pandas as pd
import re
'''
Split each line into the English source and the Chinese target,
and run jieba word segmentation on the Chinese side.
'''
input_texts = [line.split('\t')[0] for line in data]
target_texts = []
record = []  # 1-based indices of malformed lines (lines without exactly one tab)
jieba_cut = lambda x: " ".join(jieba.cut(x))
for i, line in enumerate(data, start=1):
    tmp = line.split('\t')
    if len(tmp) == 2:
        sentence = tmp[1]
        # Strip backslashes, quotes and full-width Chinese punctuation.
        # BUGFIX: the original pattern used multi-character alternatives such
        # as "”" (ASCII quote + char + ASCII quote), which only matched that
        # 3-character sequence and never a lone punctuation mark, so the
        # punctuation was not actually removed.  A character class removes
        # each intended character.
        sentence = re.sub(r'[\\"“”‘’!。?,:]', "", sentence)
        sentence = jieba_cut(sentence)
        target_texts.append(sentence)
    else:
        print(line)
        record.append(i)
### The last line of the split file is empty ('\t'-less), so drop it.
# BUGFIX: list.remove(input_texts[-1]) deletes the FIRST element equal to the
# last one, which is wrong whenever an earlier entry is also empty;
# pop() deletes exactly the last element.
input_texts.pop()
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
## Chinese (target) side
tokenizer = Tokenizer(num_words=50000)  # num_words: None or int, max vocabulary size; rarer words are dropped
tokenizer.fit_on_texts(target_texts)
# Convert texts to integer sequences, e.g. [[1, 2, 3, 4], [1, 2, 3, 5]]
target_texts1 = tokenizer.texts_to_sequences(target_texts)  # handles full stops, but not '\t' / '\n'
# Build the Chinese vocabulary
dic = tokenizer.word_index  # word -> index, indices start at 1
dic['<PAD>'] = 0  ## '<PAD>' ends up last in list(dic.keys())
dic['[start]'] = len(dic)  # sequence-start token; index == current vocab size, so it stays unique
dic['[end]'] = len(dic)    # sequence-end token (len grew by one above, so this is also unique)
target_token_index = dic
## English (source) side
tokenizer1 = Tokenizer(num_words=50000)  # num_words: max vocabulary size; rarer words are dropped
tokenizer1.fit_on_texts(input_texts)
# Convert texts to integer sequences
input_texts1 = tokenizer1.texts_to_sequences(input_texts)
# Build the English vocabulary
input_token_index = tokenizer1.word_index
#
# (dead code below: an earlier character-level English vocabulary experiment)
#input_characters = set(''.join(input_texts))
# input_characters=set()
# for i in range(len(input_texts)):#
# #temp=re.sub(r'\.|\?|\!|\:|\"|\'','',input_texts[i])
# word=input_texts[i].split(' ')
# for char in word:
# # print(char)
# temp=re.split(r'(\.|\?|\!|\:|\"|\')',char)#keep the delimiters (capturing group)
# # print(temp)
# for c in temp:
# input_characters.add(c)
# input_characters.remove('')
'''
input (English): line
target (Chinese): '\t' + line + '\n'
e.g.
a=['1','2']
a.insert(0,'0')
a--['0','1','2']
'''
# Wrap every target sequence with [start] ... [end] for teacher forcing.
for i in range(len(target_texts1)):
    target_texts1[i].insert(0, target_token_index['[start]'])
    target_texts1[i].append(target_token_index['[end]'])
'''
--- applying a lambda over a list:
data1 = map(lambda x: " ".join(jieba.cut(x)),data)
data1=list(data1)
data2=data1.apply(lambda x: " ".join(jieba.cut(x)))
'''
print('英文数据:\n', input_texts[:1])
print('\n中文数据:\n', target_texts1[:1])
# (dead code below: an earlier character-level Chinese vocabulary experiment)
# target_characters = set(''.join(target_texts))#split into characters
# id2ch = list(target_characters)
# target_token_index = {c:i+1 for i,c in enumerate(id2ch)}
# Vocabulary sizes and the longest source/target sequence lengths
num_encoder_tokens = len(input_token_index)
num_decoder_tokens = len(target_token_index)
max_encoder_seq_length = max([len(txt) for txt in input_texts1])
max_decoder_seq_length = max([len(txt) for txt in target_texts1])
# Dense arrays fed to model.fit: 2-D id matrices for the inputs,
# a 3-D one-hot tensor for the targets.
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
# target_texts1 holds id lists, input_texts holds raw strings.
# Teacher forcing: for a target of length 4, decoder_input_data is
# '[start] 你 好 [end]' and decoder_target_data is '你 好 [end]' shifted
# one step earlier: [start]->你, 你->好, 好->[end], [end]->0 (padding).
target_char_list = list(target_token_index.keys())
for i, (input_text, target_text) in enumerate(zip(input_texts1, target_texts1)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t] = input_texts1[i][t]
    for t, char in enumerate(target_text):
        # note: 'target_texts1[i]' is the same list as 'target_text'
        decoder_input_data[i, t] = target_texts1[i][t]
        if t > 0:
            # decoder_target_data is ahead by one timestep
            # and does not include the start token.
            decoder_target_data[i, t-1, target_texts1[i][t]] = 1.
################### Train / test split #################
# Hold out 30% of the rows for testing.
# BUGFIX: the original used np.random.randint(n, size=0.3*n), which samples
# WITH replacement — the "test" set contained duplicate rows and its true
# size was smaller than intended.  A random permutation yields 30% distinct
# indices; np.delete with the same indices gives the complementary train set.
import random
testn = np.random.permutation(len(input_texts))[:int(len(input_texts)/10*3)]
encoder_input_data_test = np.array(encoder_input_data)[testn, :]
decoder_input_data_test = np.array(decoder_input_data)[testn, :]
decoder_target_data_test = np.array(decoder_target_data)[testn, :, :]
encoder_input_data_train = np.delete(np.array(encoder_input_data), testn, axis=0)
decoder_input_data_train = np.delete(np.array(decoder_input_data), testn, axis=0)
decoder_target_data_train = np.delete(np.array(decoder_target_data), testn, axis=0)
#############################################################################################
- 建立模型
- 这里使用普通的encoder(embedding)-decoder(embedding)作为对比;
- 提供global attention的3种形式(score的计算方式不同:dot,general,concat);
- 同时每种attention提供两种书写方式(类似于上一篇文章继承keras的层自定义attention层/使用keras的层合成attention层)
embedding
################建模#################################################################
import matplotlib.pyplot as plt
def acc_plotting(hist, str_save):
    """Plot the training/validation loss and accuracy curves and save them.

    hist: Keras History object returned by model.fit (hist.history has keys
          such as 'loss', 'val_loss', 'acc'/'accuracy', ...).
    str_save: path of the image file to write.
    """
    a = hist.history
    loss_ = a['loss']
    val_loss_ = a['val_loss']
    # Keras renamed the metric key from 'acc' to 'accuracy' (2.3+);
    # accept either spelling so the helper works across versions.
    acc_ = a['acc'] if 'acc' in a else a['accuracy']
    val_acc_ = a['val_acc'] if 'val_acc' in a else a['val_accuracy']
    x = np.arange(1, len(loss_) + 1)  # epoch axis, 1-based
    plt.figure()
    plt.rcParams['font.family'] = ['simhei']  # CJK-capable font for the Chinese titles
    plt.subplot(1, 2, 1)
    plt.title("模型损失函数变化曲线")
    plt.grid()
    plt.plot(x, loss_, label='训练集loss', linewidth=3, color='r')
    plt.plot(x, val_loss_, label='验证集loss', linewidth=3, color='blue')
    plt.xlabel('迭代次数')
    plt.ylabel('函数值')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.title("模型准确率变化曲线")
    plt.grid()
    plt.plot(x, acc_, label='训练集acc', linewidth=3, color='r')
    plt.plot(x, val_acc_, label='验证集acc', linewidth=3, color='blue')
    plt.xlabel('迭代次数')
    plt.ylabel('函数值')
    plt.legend()
    # BUGFIX: save BEFORE show() — show() hands the figure to the GUI and
    # (in non-interactive mode) clears it, so the original saved a blank image.
    plt.savefig(str_save)
    plt.show()
from keras.engine.topology import Layer
from keras import initializers
##########embedding使用预训练字典######################
#tencent词向量中是词语(e.g.如果)+' '+向量的组合,个人觉得需要分词
#tencent词向量是200维!即embed_dim=200
#含有在tencent词向量中的才会被使用,约5000(include)/6000(total)
def loadEmbedding(embeddingFile, word2id):
    """Load pre-trained Tencent word vectors for the words in ``word2id``.

    File format: first line "<vocab_size> <vector_size>", then one
    "word v1 v2 ... v_n" line per word.

    embeddingFile: path to the embedding text file.
    word2id: dict mapping word -> row index; its length sets the matrix height.
    Returns: ndarray of shape (len(word2id), vector_size).
    """
    with open(embeddingFile, "r", encoding='ISO-8859-1') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())  # vector_size is 200 for Tencent
        # Random init: words absent from the file simply keep their random
        # vector, so no special OOV handling is needed.
        initW = np.random.uniform(-0.25, 0.25, (len(word2id), vector_size))
        count = 0
        for i in range(vocab_size):
            line = f.readline()
            # rstrip() first: a trailing space/newline would otherwise leave
            # an empty field and make float('') raise below.
            lists = line.rstrip().split(' ')
            word = lists[0]
            # The file was read as latin-1; re-decode the raw bytes as UTF-8
            # to recover the real (Chinese) word. Fall back to the raw token.
            try:
                word = word.encode('ISO-8859-1').decode('utf8')
            except Exception:
                pass
            if word in word2id:
                count += 1
                vector = np.array(list(map(float, lists[1:])))
                # Row index comes from word2id, so embedding rows line up
                # with the tokenizer's integer ids.
                initW[word2id[word]] = vector
    print('existed in tencent wordvector:', count)
    # Fixed vectors for the special tokens.
    initW[word2id['<PAD>']] = np.zeros(vector_size)         # padding -> all zeros
    initW[word2id['[start]']] = np.ones(vector_size) * 2    # sequence-start token
    initW[word2id['[end]']] = np.ones(vector_size) * (-2)   # sequence-end token
    return initW
#target_token_index['<PAD>'] = 0
# Local path to the Tencent pre-trained embedding file.
file = r'E:\Tencent_AILab_ChineseEmbedding.txt'
embedding_matrix = loadEmbedding(file, target_token_index)  # vocabulary already contains '<PAD>'
print('Total %s word vectors.' % len(embedding_matrix))
# Embedding layer for the Chinese (decoder) side, frozen to the pre-trained
# Tencent vectors.  mask_zero=False because the hand-written attention
# layers below do not support masking.
embedding_layer = Embedding(num_decoder_tokens,
                            200,
                            weights=[embedding_matrix],
                            input_length=max_decoder_seq_length,
                            mask_zero=False, trainable=False)
attention写法1
#*****************************attention写法1***********************************#
'''
concatAttention
'''
class BahdanauAttention(Layer):
    """Additive ("concat") attention a la Bahdanau.

    Called on [queries, values]:
      queries: (batch, T_q, dim_q) decoder hidden states
      values:  (batch, T_v, dim_v) encoder hidden states
    Returns context vectors of shape (batch, T_q, dim_v).
    """

    def __init__(self, attention_dim):
        self.attention_dim = attention_dim  # hidden size of the scoring MLP
        super(BahdanauAttention, self).__init__()

    def build(self, input_shape):
        # input_shape is [queries_shape, values_shape].
        # Weights go through add_weight() so Keras tracks them itself:
        # the original assigned self.trainable_weights directly, which newer
        # Keras versions reject (it is a read-only property).  This also
        # matches how generalAttention registers its weight.
        self.W1 = self.add_weight(name='W1',
                                  shape=(input_shape[-1][-1], self.attention_dim),
                                  initializer='uniform',
                                  trainable=True)
        # BUGFIX: W2 multiplies the query, so its input size must be the
        # queries' last dim (input_shape[0]); the original used
        # input_shape[1] (the values), which only worked because both LSTMs
        # happen to share latent_dim.
        self.W2 = self.add_weight(name='W2',
                                  shape=(input_shape[0][-1], self.attention_dim),
                                  initializer='uniform',
                                  trainable=True)
        self.V = self.add_weight(name='V',
                                 shape=(self.attention_dim, 1),
                                 initializer='uniform',
                                 trainable=True)
        super(BahdanauAttention, self).build(input_shape)

    def call(self, x):
        queries = x[0]
        values = x[-1]
        self.queries_dim = K.int_shape(queries)
        self.values_dim = K.int_shape(values)
        c_list = []
        # One attention pass per decoder timestep.
        for i in range(self.queries_dim[1]):
            query = K.reshape(queries[:, i, :], (-1, 1, self.queries_dim[-1]))
            # score = V . tanh(values*W1 + query*W2), shape (batch, T_v, 1)
            score0 = K.tanh(K.dot(values, self.W1) + K.dot(query, self.W2))
            score = K.dot(score0, self.V)
            attention_weights = K.softmax(score, axis=1)  # normalise over T_v
            context_vector0 = attention_weights * values
            context_vector = K.expand_dims(K.sum(context_vector0, axis=1), axis=1)
            c_list.append(context_vector)
        # (batch, T_q, dim_v): one context vector per decoder step
        return K.concatenate(c_list, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], input_shape[-1][-1])
'''
dotAttention
'''
class dotAttention(Layer):
    """Luong "dot" attention: score(h_t, h_s) = h_t . h_s.

    Parameter-free; requires queries and values to share the last dimension.
    Called on [queries, values]; returns (batch, T_q, dim_v).
    """

    def __init__(self, attention_dim):
        # attention_dim is unused by the dot score; it is stored only so the
        # constructor signature stays consistent with the other attention
        # layers.  (The original also created an unused initializer here.)
        self.attention_dim = attention_dim
        super(dotAttention, self).__init__()

    def build(self, input_shape):
        # No trainable weights: the dot score is parameter-free.
        super(dotAttention, self).build(input_shape)

    def call(self, x):
        queries = x[0]
        values = x[-1]
        self.queries_dim = K.int_shape(queries)
        self.values_dim = K.int_shape(values)
        c_list = []
        # One attention pass per decoder timestep.
        for i in range(self.queries_dim[1]):
            query = K.reshape(queries[:, i, :], (-1, 1, self.queries_dim[-1]))
            # (batch, T_v): dot product of the query with every encoder step
            score0 = K.sum(query * values, axis=-1)
            score = K.expand_dims(score0, axis=-1)
            attention_weights = K.softmax(score, axis=1)  # normalise over T_v
            context_vector0 = attention_weights * values
            context_vector = K.expand_dims(K.sum(context_vector0, axis=1), axis=1)
            c_list.append(context_vector)
        # (batch, T_q, dim_v): one context vector per decoder step
        return K.concatenate(c_list, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], input_shape[-1][-1])
'''
generalAttention
'''
class generalAttention(Layer):
    """Luong "general" attention: score(h_t, h_s) = h_t^T (W1 h_s).

    Works even when queries and values have different last dimensions,
    because W1 maps dim_v -> dim_q.
    Called on [queries, values]; returns (batch, T_q, dim_v).
    """

    def __init__(self, attention_dim):
        # attention_dim is unused by the general score (W1's output size is
        # fixed to the queries' dim); stored only so the constructor matches
        # the other attention layers.  (The original also created an unused
        # initializer here.)
        self.attention_dim = attention_dim
        super(generalAttention, self).__init__()

    def build(self, input_shape):
        # W1: (dim_v, dim_q), so that (values . W1) can be dotted with queries.
        self.W1 = self.add_weight(name='W1',
                                  shape=(input_shape[-1][-1], input_shape[0][-1]),
                                  initializer='uniform',
                                  trainable=True)
        super(generalAttention, self).build(input_shape)

    def call(self, x):
        queries = x[0]
        values = x[-1]
        self.queries_dim = K.int_shape(queries)
        self.values_dim = K.int_shape(values)
        # Loop-invariant projection of the encoder states.
        FC_values = K.dot(values, self.W1)
        c_list = []
        # One attention pass per decoder timestep.
        for i in range(self.queries_dim[1]):
            query = K.reshape(queries[:, i, :], (-1, 1, self.queries_dim[-1]))
            score0 = K.sum(query * FC_values, axis=-1)  # (batch, T_v)
            score = K.expand_dims(score0, axis=-1)
            attention_weights = K.softmax(score, axis=1)  # normalise over T_v
            context_vector0 = attention_weights * values
            context_vector = K.expand_dims(K.sum(context_vector0, axis=1), axis=1)
            c_list.append(context_vector)
        # (batch, T_q, dim_v): one context vector per decoder step
        return K.concatenate(c_list, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], input_shape[-1][-1])
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import EarlyStopping
# Stop training when val_loss has not improved for 20 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=2, mode='min')
# Reduce the learning rate (default factor) when val_loss plateaus for 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, mode='min')
# Embedding(input_dim, output_dim): with mask_zero=True, input_dim must equal vocabulary+1
K.clear_session()  # reset the TF graph before building the models below
def NMT_attention(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train, str1):
    """Build and train the seq2seq translator with class-based concat attention.

    str1: path the trained model is saved to.
    Returns the Keras History from model.fit.
    """
    # ----- encoder: embed English ids, keep outputs and final states -----
    encoder_inputs = Input(shape=(max_encoder_seq_length, ))  # (n_samples, timesteps) ids
    # mask1 = Masking(mask_value=0)(encoder_inputs)
    enc_embedded = Embedding(num_encoder_tokens + 1, EMBED_DIM, mask_zero=False)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(latent_dim, return_sequences=True,
                                             return_state=True)(enc_embedded)
    encoder_states = [state_h, state_c]  # used to initialise the decoder
    # ----- decoder: frozen Tencent embeddings, encoder-initialised LSTM -----
    decoder_inputs = Input(shape=(max_decoder_seq_length, ))
    dec_embedded = embedding_layer(decoder_inputs)
    decoder_sequence = LSTM(latent_dim, return_sequences=True,
                            return_state=False)(dec_embedded, initial_state=encoder_states)
    # ----- attention over the encoder outputs -----
    attention_layer = BahdanauAttention(attention_dim=ATTENTION_DIM)
    # dotAttention requires the encoder and decoder LSTMs to share latent_dim:
    # attention_layer = dotAttention(attention_dim=ATTENTION_DIM)
    # generalAttention does NOT require matching latent_dim:
    # attention_layer = generalAttention(attention_dim=ATTENTION_DIM)
    contexts = attention_layer([decoder_sequence, encoder_outputs])
    # Concatenate needs all dims equal except the concat axis.
    merged = Concatenate(axis=-1)([contexts, decoder_sequence])
    post_lstm = LSTM(latent_dim, activation='tanh', return_sequences=True,
                     return_state=False)(merged)
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(post_lstm)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.summary()
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    hist = model.fit([encoder_input_data_train, decoder_input_data_train], decoder_target_data_train,
                     batch_size=batch_size, verbose=1, epochs=200, validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping])
    model.save(str1)
    return hist
# Train with the class-based concat attention and plot the learning curves.
hist1 = NMT_attention(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                      'translation_concat_attention.h5')
str_save = "NMT_concat_attention模型.png"
acc_plotting(hist1, str_save)
attention写法2
#*****************************attention写法2***********************************#
from keras.layers import Dense, Lambda, dot,Flatten, Activation,RepeatVector, concatenate,Reshape, Softmax
from keras import backend as K
def attention_general(encoder_hidden_states, decoder_hidden_states, time2):
    """Luong "general" attention built from stock Keras layers.

    score(h_t, h_s) = h_t^T (W_a h_s), with W_a shared across timesteps.

    @param encoder_hidden_states: 3D tensor (batch, T_enc, enc_dim).
    @param decoder_hidden_states: 3D tensor (batch, T_dec, dec_dim).
    @param time2: number of decoder timesteps (T_dec).
    @return: 3D tensor (batch, time2, 128) of attention vectors.

    Works even when enc_dim != dec_dim.
    """
    decoder_hidden_size = int(decoder_hidden_states.shape[2])
    # Shared layers, created ONCE.  BUGFIX: the original instantiated a new
    # Dense inside the loop, giving every decoder timestep its own untied
    # score/output weights and inflating the parameter count time2-fold.
    score_layer = Dense(decoder_hidden_size, use_bias=False)      # W_a
    output_layer = Dense(128, use_bias=False, activation='tanh')  # W_c
    # (batch, T_enc, dec_dim) — loop-invariant, so computed once.
    score_first_part = score_layer(encoder_hidden_states)
    a_list = []
    for i in range(time2):
        # i=i binds the loop variable now (avoids the late-binding lambda pitfall).
        h_t = Lambda(lambda x, i=i: x[:, i, :],
                     output_shape=(decoder_hidden_size,))(decoder_hidden_states)
        # (batch, T_enc, dec_dim) dot (batch, dec_dim) => (batch, T_enc)
        score = dot([score_first_part, h_t], [2, 1])
        attention_weights = Activation('softmax')(score)
        # (batch, T_enc, enc_dim) dot (batch, T_enc) => (batch, enc_dim)
        context_vector = dot([encoder_hidden_states, attention_weights], [1, 1])
        pre_activation = concatenate([context_vector, h_t])
        attention_vector0 = output_layer(pre_activation)
        a_list.append(RepeatVector(1)(attention_vector0))
    attention_vectors = concatenate(a_list, axis=1)
    return attention_vectors
def attention_dot(encoder_hidden_states, decoder_hidden_states, time2):
    """Luong "dot" attention built from stock Keras layers.

    score(h_t, h_s) = h_t . h_s — requires enc_dim == dec_dim.

    @param encoder_hidden_states: 3D tensor (batch, T_enc, enc_dim).
    @param decoder_hidden_states: 3D tensor (batch, T_dec, dec_dim).
    @param time2: number of decoder timesteps (T_dec).
    @return: 3D tensor (batch, time2, 128) of attention vectors.
    """
    decoder_hidden_size = int(decoder_hidden_states.shape[2])
    # The dot score itself has no weights; only the output projection does,
    # and it must be shared across timesteps.  BUGFIX: the original created a
    # fresh Dense(128) per loop iteration (untied weights per timestep).
    output_layer = Dense(128, use_bias=False, activation='tanh')
    score_first_part = encoder_hidden_states
    a_list = []
    for i in range(time2):
        # i=i binds the loop variable now (avoids the late-binding lambda pitfall).
        h_t = Lambda(lambda x, i=i: x[:, i, :],
                     output_shape=(decoder_hidden_size,))(decoder_hidden_states)
        # (batch, T_enc, hidden) dot (batch, hidden) => (batch, T_enc)
        score = dot([score_first_part, h_t], [2, 1])
        attention_weights = Activation('softmax')(score)
        # (batch, T_enc, enc_dim) dot (batch, T_enc) => (batch, enc_dim)
        context_vector = dot([encoder_hidden_states, attention_weights], [1, 1])
        pre_activation = concatenate([context_vector, h_t])
        a_list.append(RepeatVector(1)(output_layer(pre_activation)))
    attention_vectors = concatenate(a_list, axis=1)
    return attention_vectors
def attention_concat(encoder_hidden_states, decoder_hidden_states, time2, attention_dim):
    """Additive ("concat"/Bahdanau) attention built from stock Keras layers.

    score(h_t, h_s) = v^T tanh(W1 h_s + W2 h_t); works with
    enc_dim != dec_dim and any attention_dim.

    @param encoder_hidden_states: 3D tensor (batch, T_enc, enc_dim).
    @param decoder_hidden_states: 3D tensor (batch, T_dec, dec_dim).
    @param time2: number of decoder timesteps (T_dec).
    @param attention_dim: hidden size of the scoring MLP.
    @return: 3D tensor (batch, time2, 128) of attention vectors.
    """
    decoder_hidden_size = int(decoder_hidden_states.shape[2])
    # Shared weight layers, created ONCE.  BUGFIX: the original instantiated
    # all four Dense layers inside the loop, giving each decoder timestep its
    # own untied weights and inflating the parameter count time2-fold.
    enc_proj = Dense(attention_dim, use_bias=False)               # W1
    dec_proj = Dense(attention_dim, use_bias=False)               # W2
    score_proj = Dense(1, use_bias=False)                         # v
    output_layer = Dense(128, use_bias=False, activation='tanh')  # W_c
    # (batch, T_enc, attention_dim) — loop-invariant, computed once.
    FC_encoder = enc_proj(encoder_hidden_states)
    a_list = []
    for i in range(time2):
        # i=i binds the loop variable now (avoids the late-binding lambda pitfall).
        h_t = Lambda(lambda x, i=i: x[:, i, :],
                     output_shape=(decoder_hidden_size,))(decoder_hidden_states)
        FC_ht = dec_proj(h_t)  # (batch, attention_dim); broadcasts over T_enc below
        score0 = K.tanh(FC_encoder + FC_ht)
        score1 = score_proj(score0)   # (batch, T_enc, 1)
        score = Flatten()(score1)     # (batch, T_enc)
        attention_weights = Activation('softmax')(score)
        # (batch, T_enc, enc_dim) dot (batch, T_enc) => (batch, enc_dim)
        context_vector = dot([encoder_hidden_states, attention_weights], [1, 1])
        pre_activation = concatenate([context_vector, h_t])
        a_list.append(RepeatVector(1)(output_layer(pre_activation)))
    attention_vectors = concatenate(a_list, axis=1)
    return attention_vectors
def NMT_attention(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train, str1):
    # NOTE(review): this is a byte-for-byte duplicate of the NMT_attention
    # defined in "attention style 1" above and redefines (shadows) it.  It
    # still uses the class-based BahdanauAttention rather than the
    # attention_* helper functions defined just before it — presumably a
    # copy-paste slip; confirm which variant was intended for "style 2".
    encoder_inputs = Input(shape=(max_encoder_seq_length, ))  # training input shape: (n_samples, timesteps)
    # mask1 = Masking(mask_value=0)(encoder_inputs)
    Embedding1 = Embedding(num_encoder_tokens+1, EMBED_DIM, mask_zero=False)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(latent_dim, return_sequences=True, return_state=True)(Embedding1)
    # We keep both the full encoder outputs (for attention) and the states.
    encoder_states = [state_h, state_c]
    decoder_inputs = Input(shape=(max_decoder_seq_length, ))
    #mask2 = Masking(mask_value=0)(decoder_inputs)
    #Embedding2 = Embedding(num_decoder_tokens+1, EMBED_DIM,mask_zero=False)(decoder_inputs)
    Embedding2 = embedding_layer(decoder_inputs)
    #decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=False)
    lstm_outputs = LSTM(latent_dim, return_sequences=True, return_state=False)(Embedding2, initial_state=encoder_states)
    attention_layer = BahdanauAttention(attention_dim=ATTENTION_DIM)
    # dotAttention requires the encoder and decoder LSTMs to share latent_dim:
    #attention_layer=dotAttention(attention_dim=ATTENTION_DIM)
    # generalAttention does NOT require matching latent_dim:
    #attention_layer=generalAttention(attention_dim=ATTENTION_DIM)
    context_vector = attention_layer([lstm_outputs, encoder_outputs])
    # Concatenate needs all dims equal except the concat axis.
    concat_output = Concatenate(axis=-1)([context_vector, lstm_outputs])
    #dense_outputs = Dense(latent_dim, activation='tanh')(concat_output)#decoder_outputs
    dense_outputs = LSTM(latent_dim, activation='tanh', return_sequences=True, return_state=False)(concat_output)
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(dense_outputs)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.summary()
    #rmsprop,adam
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    # NOTE(review): epochs is hard-coded to 200 while the module constant
    # `epochs` is 400 — confirm which was intended.
    hist = model.fit([encoder_input_data_train, decoder_input_data_train], decoder_target_data_train,
                     batch_size=batch_size, verbose=1, epochs=200, validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping])
    # Save model
    model.save(str1)
    return hist
# Train again (this call resolves to the duplicate NMT_attention just above)
# and plot the learning curves.
hist1 = NMT_attention(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                      'translation_concat_attention.h5')
str_save = "NMT_concat_attention模型.png"
acc_plotting(hist1, str_save)
没有attention
def encode_decode(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train):
    """Plain seq2seq baseline (no attention), used for comparison.

    Returns the Keras History from model.fit; the model is saved to
    'model_translation_no_attention.h5'.
    """
    # ----- encoder: embed the English ids, keep only the final LSTM states -----
    encoder_inputs = Input(shape=(max_encoder_seq_length, ))  # (n_samples, timesteps) ids
    # mask1 = Masking(mask_value=0)(encoder_inputs)
    source_embedded = Embedding(num_encoder_tokens + 1, EMBED_DIM, mask_zero=False)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(latent_dim, return_sequences=True,
                                             return_state=True)(source_embedded)
    encoder_states = [state_h, state_c]  # encoder_outputs are discarded here
    # ----- decoder: frozen Tencent embeddings, initialised from encoder states -----
    decoder_inputs = Input(shape=(max_decoder_seq_length, ))
    target_embedded = embedding_layer(decoder_inputs)
    decoder_sequence = LSTM(latent_dim, return_sequences=True,
                            return_state=False)(target_embedded, initial_state=encoder_states)
    # softmax over the Chinese vocabulary at every timestep (axis=-1 by default)
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(decoder_sequence)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.summary()
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    hist = model.fit([encoder_input_data_train, decoder_input_data_train], decoder_target_data_train,
                     batch_size=batch_size, verbose=1, epochs=200, validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping])
    model.save('model_translation_no_attention.h5')
    return hist
# Train the no-attention baseline and plot its learning curves.
hist2 = encode_decode(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train)
str_save = "NMT模型no_attention.png"
acc_plotting(hist2, str_save)
from attention_layer_NMT import attention_general
def NMT_attention2(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train, str1):
    """Seq2seq translator using the layer-composition attention_general helper.

    str1: path the trained model is saved to.
    Returns the Keras History from model.fit.

    NOTE(review): the module-level `from attention_layer_NMT import
    attention_general` above shadows the attention_general defined in this
    file — confirm the imported implementation matches.
    """
    # ----- encoder -----
    encoder_inputs = Input(shape=(max_encoder_seq_length, ))  # (n_samples, timesteps) ids
    # mask1 = Masking(mask_value=0)(encoder_inputs)
    src_embedded = Embedding(num_encoder_tokens + 1, EMBED_DIM, mask_zero=False)(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(latent_dim, return_sequences=True,
                                             return_state=True)(src_embedded)
    encoder_states = [state_h, state_c]
    # ----- decoder with general attention -----
    decoder_inputs = Input(shape=(max_decoder_seq_length, ))
    tgt_embedded = embedding_layer(decoder_inputs)
    decoder_sequence = LSTM(latent_dim, return_sequences=True,
                            return_state=False)(tgt_embedded, initial_state=encoder_states)
    attention_vectors = attention_general(encoder_outputs, decoder_sequence, max_decoder_seq_length)
    decoder_outputs = Dense(num_decoder_tokens, activation='softmax')(attention_vectors)
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.summary()
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    hist = model.fit([encoder_input_data_train, decoder_input_data_train], decoder_target_data_train,
                     batch_size=batch_size, verbose=1, epochs=200, validation_split=0.1,
                     callbacks=[reduce_lr, early_stopping])
    model.save(str1)
    return hist
# Train with the layer-composition general attention and plot the curves.
hist1 = NMT_attention2(encoder_input_data_train, decoder_input_data_train, decoder_target_data_train,
                       'translation_general_attention1.h5')
str_save = "NMT_general_attention模型1.png"
acc_plotting(hist1, str_save)
实验结果
实验结果非常一般。希望各位如果知道原因,可以进来讨论一下~