Relation Extraction (Part 2): RNN-based Models
Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification
Background
Prior approaches rely on lexical resources such as WordNet and on NLP tools such as dependency parsing and named entity recognition (NER); moreover, the decisive information may appear anywhere in the sentence.
Model
- Input layer
- Embedding layer
- BiLSTM layer
- Attention layer: pools word-level features into a sentence-level feature vector.
- Output layer
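As a reading aid, here is a minimal sketch of this five-layer stack in tf.keras (assuming TF 2.x); the hyperparameter values are hypothetical, and the attention layer follows the word-attention formula used in the code below rather than any particular official implementation:

import tensorflow as tf

# Hypothetical hyperparameters, chosen only for illustration.
VOCAB_SIZE, EMB_DIM, HIDDEN, SEQ_LEN, NUM_CLASSES = 10000, 100, 100, 90, 19

class WordAttention(tf.keras.layers.Layer):
    """Word-level attention: alpha = softmax(w^T tanh(H)), output = tanh(sum_t alpha_t h_t)."""
    def build(self, input_shape):
        self.w = self.add_weight("w", shape=(input_shape[-1],),
                                 initializer="glorot_normal")
    def call(self, h):                                 # h: (batch, seq, hidden)
        m = tf.tanh(h)
        scores = tf.tensordot(m, self.w, axes=1)       # (batch, seq)
        alphas = tf.nn.softmax(scores, axis=-1)
        r = tf.reduce_sum(h * alphas[..., None], axis=1)  # (batch, hidden)
        return tf.tanh(r)

inputs = tf.keras.Input(shape=(SEQ_LEN,), dtype="int32")           # input layer
x = tf.keras.layers.Embedding(VOCAB_SIZE, EMB_DIM)(inputs)         # embedding layer
x = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(HIDDEN, return_sequences=True),
    merge_mode="sum")(x)                                           # BiLSTM layer
x = WordAttention()(x)                                             # attention layer
outputs = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)  # output layer
model = tf.keras.Model(inputs, outputs)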
Code
import tensorflow as tf  # TensorFlow 1.x graph-mode API

# Constructor of the model class (only __init__ is shown here).
def __init__(self, sequence_length, num_classes, vocab_size, embedding_size,
             hidden_size, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_text = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_text')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    initializer = tf.keras.initializers.glorot_normal

    # Word Embedding Layer
    with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
        self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name="W_text")
        self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_text)

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

    # Bidirectional LSTM (forward and backward outputs are element-wise summed)
    with tf.variable_scope("bi-lstm"):
        self.rnn_outputs = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(hidden_size, return_sequences=True),
            merge_mode='sum')(self.embedded_chars)

    # Attention: pool word-level features into a sentence-level vector
    with tf.variable_scope('attention'):
        self.attn, self.alphas = attention(self.rnn_outputs)

    # Dropout
    with tf.variable_scope('dropout'):
        self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop, num_classes, kernel_initializer=initializer())
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    # Calculate mean cross-entropy loss plus L2 regularization
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
The attention function used above is defined as follows:
def attention(inputs):
    # Trainable parameters
    hidden_size = inputs.shape[2].value
    u_omega = tf.get_variable("u_omega", [hidden_size],
                              initializer=tf.keras.initializers.glorot_normal())

    with tf.name_scope('v'):
        v = tf.tanh(inputs)

    # For each timestep, its size-D vector from `v` is reduced with the `u_omega` vector
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (B, T) shape
    alphas = tf.nn.softmax(vu, name='alphas')         # (B, T) shape

    # Output of the (Bi-)RNN is reduced with the attention weights; the result has (B, D) shape
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)

    # Final output with tanh
    output = tf.tanh(output)
    return output, alphas
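To exercise the graph end to end, here is a hedged smoke test; the class name BiLSTMAttention and all hyperparameter values are hypothetical, since the snippet above only shows the constructor:

import numpy as np
import tensorflow as tf

with tf.Graph().as_default(), tf.Session() as sess:
    # Hypothetical wrapper class holding the __init__ shown above.
    model = BiLSTMAttention(sequence_length=90, num_classes=19,
                            vocab_size=10000, embedding_size=100,
                            hidden_size=100, l2_reg_lambda=1e-5)
    sess.run(tf.global_variables_initializer())
    feed = {model.input_text: np.random.randint(0, 10000, size=(8, 90)),
            model.input_y: np.eye(19)[np.random.randint(0, 19, size=8)].astype(np.float32),
            model.emb_dropout_keep_prob: 1.0,  # disable dropout for the test
            model.rnn_dropout_keep_prob: 1.0,
            model.dropout_keep_prob: 1.0}
    loss, acc = sess.run([model.loss, model.accuracy], feed_dict=feed)
    print(loss, acc)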
Semantic Relation Classification via Bidirectional LSTM Networks with Entity-aware Attention using Latent Entity Typing
Background
Model
- Word Representation
- Self Attention: Q = K = V; each head captures different information in a different subspace, and the head outputs are concatenated at the end (a NumPy sanity check of the formula follows this list).

$$\begin{aligned}
&\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\Big(\frac{QK^T}{\sqrt{d_w}}\Big)V\\
&\mathrm{MultiHead}(Q,K,V)=W^M[\mathrm{head}_1;\dots;\mathrm{head}_r]\\
&\mathrm{head}_i=\mathrm{Attention}(W_i^QQ,\,W_i^KK,\,W_i^VV)
\end{aligned}$$

- BiLSTM: a bidirectional LSTM over the self-attended representations.
- Entity-aware Attention: $p_i^{e_1}$ and $p_i^{e_2}$ are position embeddings, $h_{e_1}$ and $h_{e_2}$ are the BiLSTM outputs at the two entity positions, $h_i$ is the BiLSTM output for the current word, and $c_i$ is the $i$-th latent type vector.

$$\begin{aligned}
u_i &= \tanh\big(W^H[h_i;p_i^{e_1};p_i^{e_2}]+W^E[h_{e_1};t_1;h_{e_2};t_2]\big)\\
\alpha_i &= \frac{\exp(v^Tu_i)}{\sum_{j=1}^{n}\exp(v^Tu_j)}\\
z &= \sum_{i=1}^{n}\alpha_ih_i\\
a_i^j &= \frac{\exp\big((h_{e_j})^Tc_i\big)}{\sum_{k=1}^{K}\exp\big((h_{e_j})^Tc_k\big)}\\
t_{j\in\{1,2\}} &= \sum_{i=1}^{K}a_i^jc_i
\end{aligned}$$

- Loss function: the negative log-likelihood over the training set $D$ plus L2 regularization,

$$L=-\sum_{i=1}^{|D|}\log p\big(y^{(i)}\mid S^{(i)},\theta\big)+\lambda\|\theta\|^2_2$$
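Before the implementation, here is the NumPy sanity check of the scaled dot-product step referenced in the Self Attention item above (toy sizes, a single head; with $r$ heads this same computation runs in $r$ projected subspaces and the outputs are concatenated). It is purely illustrative, not the paper's code:

import numpy as np

np.random.seed(0)
T, d_w = 5, 8                        # sequence length and model width (toy values)
Q = K = V = np.random.randn(T, d_w)  # self-attention: Q = K = V

scores = Q @ K.T / np.sqrt(d_w)      # (T, T) scaled dot products
weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)  # row-wise softmax
out = weights @ V                    # (T, d_w) attended values

assert out.shape == (T, d_w)
assert np.allclose(weights.sum(axis=-1), 1.0)  # each row is a probability distribution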
Code
import tensorflow as tf
import tensorflow_hub as hub  # needed only when use_elmo=True

initializer = tf.keras.initializers.glorot_normal

# Constructor of the model class (only __init__ is shown; self._length, defined
# elsewhere in the class, returns the true unpadded length of each sequence).
def __init__(self, sequence_length, num_classes,
             vocab_size, embedding_size, pos_vocab_size, pos_embedding_size,
             hidden_size, num_heads, attention_size,
             use_elmo=False, l2_reg_lambda=0.0):
    # Placeholders for input, output and dropout
    self.input_x = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_x')
    self.input_y = tf.placeholder(tf.float32, shape=[None, num_classes], name='input_y')
    self.input_text = tf.placeholder(tf.string, shape=[None, ], name='input_text')
    self.input_e1 = tf.placeholder(tf.int32, shape=[None, ], name='input_e1')
    self.input_e2 = tf.placeholder(tf.int32, shape=[None, ], name='input_e2')
    self.input_p1 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p1')
    self.input_p2 = tf.placeholder(tf.int32, shape=[None, sequence_length], name='input_p2')
    self.emb_dropout_keep_prob = tf.placeholder(tf.float32, name='emb_dropout_keep_prob')
    self.rnn_dropout_keep_prob = tf.placeholder(tf.float32, name='rnn_dropout_keep_prob')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

    if use_elmo:
        # Contextual Embedding Layer (ELMo from TF Hub)
        with tf.variable_scope("elmo-embeddings"):
            elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
            self.embedded_chars = elmo_model(self.input_text, signature="default", as_dict=True)["elmo"]
    else:
        # Word Embedding Layer
        with tf.device('/cpu:0'), tf.variable_scope("word-embeddings"):
            self.W_text = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -0.25, 0.25), name="W_text")
            self.embedded_chars = tf.nn.embedding_lookup(self.W_text, self.input_x)

    # Position Embedding Layer (relative distances to the two entities)
    with tf.device('/cpu:0'), tf.variable_scope("position-embeddings"):
        self.W_pos = tf.get_variable("W_pos", [pos_vocab_size, pos_embedding_size], initializer=initializer())
        self.p1 = tf.nn.embedding_lookup(self.W_pos, self.input_p1)[:, :tf.shape(self.embedded_chars)[1]]
        self.p2 = tf.nn.embedding_lookup(self.W_pos, self.input_p2)[:, :tf.shape(self.embedded_chars)[1]]

    # Dropout for Word Embedding
    with tf.variable_scope('dropout-embeddings'):
        self.embedded_chars = tf.nn.dropout(self.embedded_chars, self.emb_dropout_keep_prob)

    # Self Attention
    with tf.variable_scope("self-attention"):
        self.self_attn, self.self_alphas = multihead_attention(self.embedded_chars, self.embedded_chars,
                                                               num_units=embedding_size, num_heads=num_heads)

    # Bidirectional LSTM
    with tf.variable_scope("bi-lstm"):
        _fw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(_fw_cell, self.rnn_dropout_keep_prob)
        _bw_cell = tf.nn.rnn_cell.LSTMCell(hidden_size, initializer=initializer())
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(_bw_cell, self.rnn_dropout_keep_prob)
        self.rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw=fw_cell,
                                                              cell_bw=bw_cell,
                                                              inputs=self.self_attn,
                                                              sequence_length=self._length(self.input_x),
                                                              dtype=tf.float32)
        # Concatenate forward and backward outputs: (batch, seq_len, 2*hidden)
        self.rnn_outputs = tf.concat(self.rnn_outputs, axis=-1)

    # Entity-aware Attention
    with tf.variable_scope('attention'):
        self.attn, self.alphas, self.e1_alphas, self.e2_alphas = attention(self.rnn_outputs,
                                                                           self.input_e1, self.input_e2,
                                                                           self.p1, self.p2,
                                                                           attention_size=attention_size)

    # Dropout
    with tf.variable_scope('dropout'):
        self.h_drop = tf.nn.dropout(self.attn, self.dropout_keep_prob)

    # Fully connected layer
    with tf.variable_scope('output'):
        self.logits = tf.layers.dense(self.h_drop, num_classes, kernel_initializer=initializer())
        self.predictions = tf.argmax(self.logits, 1, name="predictions")

    # Calculate mean cross-entropy loss plus L2 regularization
    with tf.variable_scope("loss"):
        losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
        self.l2 = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        self.loss = tf.reduce_mean(losses) + l2_reg_lambda * self.l2

    # Accuracy
    with tf.variable_scope("accuracy"):
        correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name="accuracy")
The multi-head attention code is as follows:
def multihead_attention(queries, keys, num_units, num_heads,
                        dropout_rate=0, scope="multihead_attention", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Linear projections
        Q = tf.layers.dense(queries, num_units, kernel_initializer=initializer())  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, kernel_initializer=initializer())     # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, kernel_initializer=initializer())     # (N, T_k, C)

        # Split into heads and stack them along the batch axis
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs /= K_.get_shape().as_list()[-1] ** 0.5

        # Key Masking: suppress attention to all-zero (padding) key positions
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)
        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation
        alphas = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking: zero out rows that correspond to padding queries
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        alphas *= query_masks  # broadcasting, (h*N, T_q, T_k)

        # Dropout
        alphas = tf.layers.dropout(alphas, rate=dropout_rate, training=tf.convert_to_tensor(True))

        # Weighted sum
        outputs = tf.matmul(alphas, V_)  # (h*N, T_q, C/h)

        # Restore shape by re-concatenating the heads along the channel axis
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Linear projection
        outputs = tf.layers.dense(outputs, num_units, activation=tf.nn.relu, kernel_initializer=initializer())

        # Residual connection
        outputs += queries

        # Layer normalization
        outputs = layer_norm(outputs)  # (N, T_q, C)
    return outputs, alphas
def layer_norm(inputs, epsilon=1e-8, scope="layer_norm", reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]
        # Normalize over the last (feature) axis
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))
        normalized = (inputs - mean) / ((variance + epsilon) ** .5)
        outputs = gamma * normalized + beta
    return outputs
The entity-aware attention code is as follows:
def attention(inputs, e1, e2, p1, p2, attention_size):
    # inputs = (batch, seq_len, hidden)
    # e1, e2 = (batch,)  -- indices of the two entity positions
    # p1, p2 = (batch, seq_len, dist_emb_size)
    # attention_size = scalar(int)

    def extract_entity(x, e):
        # Gather the hidden state at each entity position
        e_idx = tf.concat([tf.expand_dims(tf.range(tf.shape(e)[0]), axis=-1),
                           tf.expand_dims(e, axis=-1)], axis=-1)
        return tf.gather_nd(x, e_idx)  # (batch, hidden)

    seq_len = tf.shape(inputs)[1]        # fixed at run-time
    hidden_size = inputs.shape[2].value  # fixed at compile-time
    latent_size = hidden_size

    # Latent Relation Variable based on Entities
    e1_h = extract_entity(inputs, e1)  # (batch, hidden)
    e2_h = extract_entity(inputs, e2)  # (batch, hidden)
    e1_type, e2_type, e1_alphas, e2_alphas = latent_type_attention(e1_h, e2_h,
                                                                   num_type=3,
                                                                   latent_size=latent_size)  # (batch, latent)
    e1_h = tf.concat([e1_h, e1_type], axis=-1)  # (batch, hidden+latent)
    e2_h = tf.concat([e2_h, e2_type], axis=-1)  # (batch, hidden+latent)

    # u_i = tanh(W^H [h_i; p1_i; p2_i] + W^E [e1_h; e2_h])
    e_h = tf.layers.dense(tf.concat([e1_h, e2_h], -1), attention_size, use_bias=False, kernel_initializer=initializer())
    e_h = tf.reshape(tf.tile(e_h, [1, seq_len]), [-1, seq_len, attention_size])
    v = tf.layers.dense(tf.concat([inputs, p1, p2], axis=-1), attention_size, use_bias=False, kernel_initializer=initializer())
    v = tf.tanh(tf.add(v, e_h))

    # alpha = softmax(u_omega^T v)
    u_omega = tf.get_variable("u_omega", [attention_size], initializer=initializer())
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # (batch, seq_len)
    alphas = tf.nn.softmax(vu, name='alphas')         # (batch, seq_len)

    # Weighted sum over the sequence
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)  # (batch, hidden)
    return output, alphas, e1_alphas, e2_alphas
def latent_type_attention(e1, e2, num_type, latent_size):
    # Latent Entity Type Vectors (shared by both entities)
    latent_type = tf.get_variable("latent_type", shape=[num_type, latent_size], initializer=initializer())

    e1_sim = tf.matmul(e1, tf.transpose(latent_type))            # (batch, num_type)
    e1_alphas = tf.nn.softmax(e1_sim, name='e1_alphas')          # (batch, num_type)
    e1_type = tf.matmul(e1_alphas, latent_type, name='e1_type')  # (batch, latent_size)

    e2_sim = tf.matmul(e2, tf.transpose(latent_type))            # (batch, num_type)
    e2_alphas = tf.nn.softmax(e2_sim, name='e2_alphas')          # (batch, num_type)
    e2_type = tf.matmul(e2_alphas, latent_type, name='e2_type')  # (batch, latent_size)

    return e1_type, e2_type, e1_alphas, e2_alphas
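Finally, as a quick shape check, here is a NumPy rendition of latent_type_attention for one entity; the dimensions are toy values, with K = 3 latent type vectors to match num_type=3 above:

import numpy as np

np.random.seed(0)
batch, hidden, num_type = 4, 6, 3
e1_h = np.random.randn(batch, hidden)            # entity-1 hidden states h_{e_1}
latent_type = np.random.randn(num_type, hidden)  # latent type vectors c_1..c_K

sim = e1_h @ latent_type.T                                 # (batch, num_type) similarities
alphas = np.exp(sim) / np.exp(sim).sum(-1, keepdims=True)  # softmax -> weights a_i
e1_type = alphas @ latent_type                             # t_1 = sum_i a_i * c_i

assert e1_type.shape == (batch, hidden)
assert np.allclose(alphas.sum(-1), 1.0)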