Enhanced LSTM for Natural Language Inference
https://arxiv.org/pdf/1609.06038.pdf
Text matching / text similarity models: ESIM (CSDN)
https://blog.csdn.net/u012526436/article/details/90380840
How to do text matching well and fast: ESIM (Zhihu)
https://zhuanlan.zhihu.com/p/337567073
1. Overall approach
ESIM matches two sentences in four stages: input encoding with a shared BiLSTM, local inference modeling with soft attention between the two sentences, inference composition with a second BiLSTM plus pooling, and a final dense + softmax prediction layer.
2. Learning summary
3. Code walkthrough
3.1 Key points
import tensorflow as tf
from esim import args


class Graph:

    def __init__(self):
        # The inputs are n*d matrices of ids: the raw characters are fed in directly.
        self.p = tf.placeholder(dtype=tf.int32, shape=(None, args.seq_length), name='p')
        self.h = tf.placeholder(dtype=tf.int32, shape=(None, args.seq_length), name='h')
        self.y = tf.placeholder(dtype=tf.int32, shape=None, name='y')
        self.keep_prob = tf.placeholder(dtype=tf.float32, name='drop_rate')
        # Embedding table of a user-chosen size: an embedding layer is trained here
        # instead of loading pre-trained word vectors.
        self.embedding = tf.get_variable(dtype=tf.float32, shape=(args.vocab_size, args.char_embedding_size),
                                         name='embedding')
        self.forward()

    def dropout(self, x):
        return tf.nn.dropout(x, keep_prob=self.keep_prob)

    def bilstm(self, x, hidden_size):
        fw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        bw_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        return tf.nn.bidirectional_dynamic_rnn(fw_cell, bw_cell, x, dtype=tf.float32)
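    # Note: tf.nn.bidirectional_dynamic_rnn returns ((output_fw, output_bw), (state_fw, state_bw));
    # only the per-timestep outputs are unpacked below, the final states are discarded.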
    def forward(self):
        # 1. Input Encoding: embed the raw inputs.
        # Look up the two inputs p and h in the embedding table.
        p_embedding = tf.nn.embedding_lookup(self.embedding, self.p)
        h_embedding = tf.nn.embedding_lookup(self.embedding, self.h)
        # The same BiLSTM encodes both sentences (same variable scope with AUTO_REUSE),
        # not two separate ones.
        with tf.variable_scope("lstm_p", reuse=tf.AUTO_REUSE):
            (p_f, p_b), _ = self.bilstm(p_embedding, args.embedding_hidden_size)
        with tf.variable_scope("lstm_p", reuse=tf.AUTO_REUSE):
            (h_f, h_b), _ = self.bilstm(h_embedding, args.embedding_hidden_size)
        p = tf.concat([p_f, p_b], axis=2)
        h = tf.concat([h_f, h_b], axis=2)
        p = self.dropout(p)
        h = self.dropout(h)
        # 2. Local Inference Modeling
        # p and h are batch*length*dim; a batched matrix product gives the attention scores
        # between every premise position and every hypothesis position at once.
        # The goal is to mine the interaction between the two sentences: each sentence is
        # re-expressed as an attention-weighted ("aligned") version of the other, and the
        # author then computes difference features between the encoding and its aligned
        # counterpart, which the paper argues improves the model. Two element-wise operations
        # are used, subtraction and multiplication, and the final enhanced representation
        # concatenates the encoding, the aligned encoding, their difference and their product,
        # i.e. [a, a', a - a', a * a'] (and the analogous vector for b).
        # The attention weights are computed as in the equations below.
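        # Attention equations from the paper, in this code's notation (p_i and h_j are the
        # BiLSTM encodings of the premise and hypothesis):
        #   e_ij = p_i^T h_j
        #   a_i  = sum_j [exp(e_ij) / sum_k exp(e_ik)] * h_j   (hypothesis aligned to premise word i)
        #   b_j  = sum_i [exp(e_ij) / sum_k exp(e_kj)] * p_i   (premise aligned to hypothesis word j)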
        e = tf.matmul(p, tf.transpose(h, perm=[0, 2, 1]))  # (batch, p_len, h_len)
        a_attention = tf.nn.softmax(e)
        # The two attention weights normalize over different axes of the same score matrix:
        # after the dot product, one softmax runs over the columns (hypothesis positions) and
        # the other over the rows (premise positions). Transposing e first puts the premise
        # axis last, so b_attention has shape (batch, h_len, p_len) and can be multiplied
        # with p directly.
        b_attention = tf.nn.softmax(tf.transpose(e, perm=[0, 2, 1]))
        a = tf.matmul(a_attention, h)  # hypothesis aligned to each premise position
        b = tf.matmul(b_attention, p)  # premise aligned to each hypothesis position
        # Concatenate the aligned vector, the original encoding, their element-wise difference
        # and their element-wise product to form the enhanced local inference features.
        m_a = tf.concat((a, p, a - p, tf.multiply(a, p)), axis=2)
        m_b = tf.concat((b, h, b - h, tf.multiply(b, h)), axis=2)
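        # In the paper's notation the enhancement is m_a = [a_bar; a_tilde; a_bar - a_tilde; a_bar * a_tilde];
        # this code builds the same four kinds of features (original, aligned, difference, product),
        # just concatenated in a different order.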
        # 3. Inference Composition
        # The enhanced features are fed through a second BiLSTM whose role differs from the
        # first one: here it composes the local inference information contained in m_a and m_b.
        # Afterwards the important context is summarized by average pooling and max pooling,
        # giving [v_a_avg, v_a_max, v_b_avg, v_b_max].
        with tf.variable_scope("lstm_a", reuse=tf.AUTO_REUSE):
            (a_f, a_b), _ = self.bilstm(m_a, args.context_hidden_size)
        with tf.variable_scope("lstm_b", reuse=tf.AUTO_REUSE):
            (b_f, b_b), _ = self.bilstm(m_b, args.context_hidden_size)
        a = tf.concat((a_f, a_b), axis=2)
        b = tf.concat((b_f, b_b), axis=2)
        a = self.dropout(a)
        b = self.dropout(b)
        # Average-pooling and max-pooling features.
        a_avg = tf.reduce_mean(a, axis=2)
        b_avg = tf.reduce_mean(b, axis=2)
        a_max = tf.reduce_max(a, axis=2)
        b_max = tf.reduce_max(b, axis=2)
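        # Note: the paper pools over the time axis (axis=1) to obtain fixed-size sentence
        # vectors; this implementation pools over the hidden dimension (axis=2), which still
        # yields fixed-size vectors because seq_length is fixed.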
        # 4. Prediction
        # Concatenate the pooled features and feed them through a dense layer to the
        # softmax classifier.
        v = tf.concat((a_avg, a_max, b_avg, b_max), axis=1)
        v = tf.layers.dense(v, 512, activation=tf.nn.tanh)
        v = self.dropout(v)
        logits = tf.layers.dense(v, args.class_size, activation=tf.nn.tanh)
        self.prob = tf.nn.softmax(logits)
        self.prediction = tf.argmax(logits, axis=1)
        self.train(logits)
    def train(self, logits):
        y = tf.one_hot(self.y, args.class_size)
        # Cross-entropy loss.
        loss = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits)
        self.loss = tf.reduce_mean(loss)
        self.train_op = tf.train.AdamOptimizer(args.learning_rate).minimize(self.loss)
        correct_prediction = tf.equal(tf.cast(self.prediction, tf.int32), self.y)
        self.acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
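A minimal usage sketch for the Graph class above (a sketch only: it assumes the args module exposes the vocab_size, seq_length, class_size and learning_rate fields referenced in the code, and it feeds random dummy ids instead of the project's real data pipeline):

import numpy as np
import tensorflow as tf
from esim import args
# The import path of Graph is an assumption; import it from wherever the class above is defined.
# from esim.graph import Graph

model = Graph()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One training step on a dummy batch of 32 sentence pairs.
    p_batch = np.random.randint(0, args.vocab_size, size=(32, args.seq_length))
    h_batch = np.random.randint(0, args.vocab_size, size=(32, args.seq_length))
    y_batch = np.random.randint(0, args.class_size, size=(32,))
    _, loss, acc = sess.run([model.train_op, model.loss, model.acc],
                            feed_dict={model.p: p_batch,
                                       model.h: h_batch,
                                       model.y: y_batch,
                                       model.keep_prob: 0.8})
    print('loss:', loss, 'acc:', acc)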