Relation Extraction (Part 1): CNN-based Models
This post walks through the relation extraction papers listed in NLP-Progress ( https://github.com/sebastianruder/NLP-progress/blob/master/english/relationship_extraction.md ).
Relation Classification via Convolutional Deep Neural Network
Background
The state-of-the-art methods used for relation classification are primarily based on statistical machine learning, and their performance strongly depends on the quality of the extracted features. The extracted features are often derived from the output of pre-existing natural language processing (NLP) systems, which leads to the propagation of the errors in the existing tools and hinders the performance of these systems.
In short: current SOTA models largely rely on features extracted by NLP tools, and those feature extraction steps themselves introduce errors that propagate into the relation classifier.
Model
- Word embedding: initialized from pre-trained word embeddings (HLBL, SENNA, Turian's, Huang's, word2vec, GloVe, etc.).
- Lexical Level Features: traditional lexical-level features mainly include the entities themselves, the types of the two entities, and the word sequence between the two entities, all of which depend heavily on NLP tools. This paper instead uses the embeddings of the two entities, the embeddings of their context words, and WordNet hypernyms.
- Sentence Level Features: need to capture long-distance features and semantic compositionality.
- Word Features: include the surrounding context words of each token.
- Position Features: the concatenation of each word's distances to the two entities.
- Convolution: the word representations only capture local features; the convolution layer captures global features, and max-pooling picks the most important feature in each dimension of the feature vectors.
- Sentence Level Feature Vector: the sentence-level representation produced by the convolution and max-pooling layers.
- Output: concatenate the lexical features and the sentence-level features, then pass them through a linear (dense) layer and a softmax (a shape sketch of this pipeline follows below).
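To make the tensor shapes of this pipeline concrete, here is a minimal numpy sketch of the sentence-level path (word + position features, one 3-gram filter bank, max-pooling over time, concatenation with the lexical features); all sizes and the random weights are illustrative assumptions, not the paper's hyperparameters.
import numpy as np

# illustrative sizes (assumptions): sentence length 10, word_dim 50, pos_dim 5
max_len, word_dim, pos_dim, num_filters, num_relations = 10, 50, 5, 100, 19
sentence = np.random.randn(max_len, word_dim)      # word embeddings (WF)
pos1 = np.random.randn(max_len, pos_dim)           # embedding of the distance to entity 1 (PF)
pos2 = np.random.randn(max_len, pos_dim)           # embedding of the distance to entity 2 (PF)
lexical = np.random.randn(6 * word_dim)            # the two entities plus their context words

# word features + position features: (max_len, word_dim + 2 * pos_dim)
wf_pf = np.concatenate([sentence, pos1, pos2], axis=1)

# convolution over 3-token windows, then max-pooling over time
W = np.random.randn(3 * wf_pf.shape[1], num_filters)
windows = np.stack([wf_pf[i:i + 3].reshape(-1) for i in range(max_len - 2)])
sentence_feature = np.tanh(windows @ W).max(axis=0)          # (num_filters,)

# output: concatenate lexical and sentence-level features, linear layer + softmax
feature = np.concatenate([lexical, sentence_feature])        # (6*word_dim + num_filters,)
logits = feature @ np.random.randn(feature.shape[0], num_relations)
probs = np.exp(logits) / np.exp(logits).sum()
print(probs.shape)                                           # (19,)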
Code
Data preparation
def _position_feature(raw_example):
    def distance(n):
        # scale the distance into the range [0, 122]
        if n < -60:
            return 0
        elif -60 <= n <= 60:
            return n + 61
        return 122

    e1_idx = raw_example.entity1.first
    e2_idx = raw_example.entity2.first

    position1 = []
    position2 = []
    length = len(raw_example.sentence)
    for i in range(length):
        position1.append(distance(i - e1_idx))
        position2.append(distance(i - e2_idx))

    return position1, position2

def build_sequence_example(raw_example):
    ex = tf.train.SequenceExample()

    e1_idx = raw_example.entity1.first
    e2_idx = raw_example.entity2.first
    context1 = _entity_context(e1_idx, raw_example.sentence)
    context2 = _entity_context(e2_idx, raw_example.sentence)

    # lexical features: the two entities and their context words
    lexical = context1 + context2
    # lexical = _lexical_feature(raw_example)
    ex.context.feature['lexical'].int64_list.value.extend(lexical)

    rid = raw_example.label
    ex.context.feature['rid'].int64_list.value.append(rid)

    for word_id in raw_example.sentence:
        word = ex.feature_lists.feature_list['sentence'].feature.add()
        word.int64_list.value.append(word_id)

    # position features: distances to the two entities
    position1, position2 = _position_feature(raw_example)
    for pos_val in position1:
        pos = ex.feature_lists.feature_list['position1'].feature.add()
        pos.int64_list.value.append(pos_val)
    for pos_val in position2:
        pos = ex.feature_lists.feature_list['position2'].feature.add()
        pos.int64_list.value.append(pos_val)

    return ex
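For reference, a hedged sketch of how such a SequenceExample could be parsed back when reading the TFRecord with the TF 1.x API; the feature names mirror the writer above, and the fixed lexical length of 6 is an assumption based on the later reshape to 6 * word_dim.
import tensorflow as tf

def parse_sequence_example(serialized):
    # context features: the 6 lexical word ids and the relation id
    context_features = {
        'lexical': tf.FixedLenFeature([6], tf.int64),
        'rid': tf.FixedLenFeature([], tf.int64),
    }
    # sequence features: one id per token for the sentence and both position sequences
    sequence_features = {
        'sentence': tf.FixedLenSequenceFeature([], tf.int64),
        'position1': tf.FixedLenSequenceFeature([], tf.int64),
        'position2': tf.FixedLenSequenceFeature([], tf.int64),
    }
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features=context_features,
        sequence_features=sequence_features)
    return (context['lexical'], context['rid'],
            sequence['sentence'], sequence['position1'], sequence['position2'])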
CNN model code
# input data
lexical, rid, sentence, pos1, pos2 = data

# embedding initialization
w_trainable = True if FLAGS.word_dim == 50 else False
word_embed = tf.get_variable('word_embed',
                             initializer=word_embed,
                             dtype=tf.float32,
                             trainable=w_trainable)
pos1_embed = tf.get_variable('pos1_embed', shape=[pos_num, pos_dim])
pos2_embed = tf.get_variable('pos2_embed', shape=[pos_num, pos_dim])

# embedding lookup
lexical = tf.nn.embedding_lookup(word_embed, lexical)     # batch_size, 6, word_dim
lexical = tf.reshape(lexical, [-1, 6 * word_dim])
self.labels = tf.one_hot(rid, num_relations)              # batch_size, num_relations

sentence = tf.nn.embedding_lookup(word_embed, sentence)   # batch_size, max_len, word_dim
pos1 = tf.nn.embedding_lookup(pos1_embed, pos1)           # batch_size, max_len, pos_dim
pos2 = tf.nn.embedding_lookup(pos2_embed, pos2)           # batch_size, max_len, pos_dim

# cnn model
# concatenate the word and position features
sent_pos = tf.concat([sentence, pos1, pos2], axis=2)
if is_train:
    sent_pos = tf.nn.dropout(sent_pos, keep_prob)

# the convolution extracts the sentence-level feature
feature = cnn_forward('cnn', sent_pos, lexical, num_filters)
feature_size = feature.shape.as_list()[1]
self.feature = feature
if is_train:
    feature = tf.nn.dropout(feature, keep_prob)

# map the features to the 19 relation classes (output layer)
logits, loss_l2 = linear_layer('linear_cnn', feature,
                               feature_size, num_relations,
                               is_regularize=True)
prediction = tf.nn.softmax(logits)
prediction = tf.argmax(prediction, axis=1)
accuracy = tf.equal(prediction, tf.argmax(self.labels, axis=1))
accuracy = tf.reduce_mean(tf.cast(accuracy, tf.float32))
loss_ce = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=logits))

self.logits = logits
self.prediction = prediction
self.accuracy = accuracy
self.loss = loss_ce + 0.01 * loss_l2

if not is_train:
    return

# global_step = tf.train.get_or_create_global_step()
global_step = tf.Variable(0, trainable=False, name='step', dtype=tf.int32)
optimizer = tf.train.AdamOptimizer(lrn_rate)

update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):   # for batch_norm
    self.train_op = optimizer.minimize(self.loss, global_step)
self.global_step = global_step
The cnn_forward function is defined as follows:
def cnn_forward(name, sent_pos, lexical, num_filters):
    with tf.variable_scope(name):
        input_data = tf.expand_dims(sent_pos, axis=-1)
        input_dim = input_data.shape.as_list()[2]

        # convolution layer
        pool_outputs = []
        for filter_size in [3, 4, 5]:
            with tf.variable_scope('conv-%s' % filter_size):
                conv_weight = tf.get_variable('W1',
                                              [filter_size, input_dim, 1, num_filters],
                                              initializer=tf.truncated_normal_initializer(stddev=0.1))
                conv_bias = tf.get_variable('b1', [num_filters],
                                            initializer=tf.constant_initializer(0.1))
                # input_data shape: [batch_size, seq_len, input_dim, in_channel]
                # with padding SAME the output height is ceil(in_height / stride_height)
                # with padding VALID it is ceil((in_height - filter_height + 1) / stride_height)
                # output shape: [batch_size, seq_len, 1, num_filters]
                conv = tf.nn.conv2d(input_data,
                                    conv_weight,
                                    strides=[1, 1, input_dim, 1],
                                    padding="SAME")
                conv = tf.nn.relu(conv + conv_bias)   # batch_size, max_len, 1, num_filters

                max_len = FLAGS.max_len
                # output shape: [batch_size, 1, 1, num_filters]
                pool = tf.nn.max_pool(conv,
                                      ksize=[1, max_len, 1, 1],
                                      strides=[1, max_len, 1, 1],
                                      padding="SAME")   # batch_size, 1, 1, num_filters
                pool_outputs.append(pool)
        pools = tf.reshape(tf.concat(pool_outputs, 3), [-1, 3 * num_filters])

        # feature
        feature = pools
        # concatenate the lexical feature
        if lexical is not None:
            feature = tf.concat([lexical, feature], axis=1)
        return feature
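As a quick, hypothetical shape check, cnn_forward can be called with placeholder inputs; the sizes below are assumptions and must agree with FLAGS.max_len and FLAGS.word_dim.
# hypothetical shape check (assumes FLAGS.max_len == 97 and FLAGS.word_dim == 50)
word_dim, pos_dim, max_len, num_filters = 50, 5, 97, 100
sent_pos = tf.placeholder(tf.float32, [None, max_len, word_dim + 2 * pos_dim])
lexical = tf.placeholder(tf.float32, [None, 6 * word_dim])
feature = cnn_forward('cnn_shape_check', sent_pos, lexical, num_filters)
# 6 * 50 = 300 lexical dims concatenated with 3 filter sizes * 100 filters = 300
print(feature.shape.as_list())   # [None, 600]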
Classifying Relations by Ranking with Convolutional Neural Networks
Background
Prior approaches rely on hand-crafted features (WordNet, dependency parsers, NER, etc.), and these feature extraction steps themselves introduce errors.
Model
- Word embedding + position embedding. The position embedding is the concatenation of the embeddings of the current word's distances to the two entity words; the word embedding and the position embedding are then concatenated.
- Sentence Representation: first capture local features with convolution, then produce the sentence vector with max-pooling.
- Softmax layer.
- Pairwise ranking loss function, with L2 regularization.
$L = \log\left(1 + \exp\left(\gamma\left(m^{+} - s_\theta(x)_{y^{+}}\right)\right)\right) + \log\left(1 + \exp\left(\gamma\left(m^{-} + s_\theta(x)_{y^{-}}\right)\right)\right)$
The training objective of this loss is to make the score of the correct class greater than $m^+$ and the score of the incorrect class less than $-m^-$.
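As a concrete illustration, a toy numpy computation of this loss for a single example; the class scores and the hyperparameters gamma, m^+, m^- below are made-up values, not taken from the paper or the repo.
import numpy as np

scores = np.array([1.2, -0.3, 0.8, -1.1])   # s_theta(x) for 4 relation classes (toy values)
y_pos = 0                                   # gold class index
# most informative negative class: the highest-scoring class other than the gold one
neg_candidates = np.delete(np.arange(len(scores)), y_pos)
y_neg = neg_candidates[np.argmax(scores[neg_candidates])]

gamma, m_plus, m_minus = 2.0, 2.5, 0.5      # assumed hyperparameters
loss = (np.log(1 + np.exp(gamma * (m_plus - scores[y_pos])))
        + np.log(1 + np.exp(gamma * (m_minus + scores[y_neg]))))
print(y_neg, loss)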
Implementation
Code: https://github.com/pratapbhanu/CRCNN
# sentence embedding
sent_input = tf.nn.embedding_lookup(params=self.sent_embedding, ids=self.sent)
# embedding of the distance to the first entity
ent1_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding,
                                         ids=self.ent1_dist)
# embedding of the distance to the second entity
ent2_dist_input = tf.nn.embedding_lookup(params=self.dist_embedding,
                                         ids=self.ent2_dist)

# concatenate the three embeddings
conv_input = tf.concat([sent_input, ent1_dist_input, ent2_dist_input],
                       axis=-1)
conv_input = tf.expand_dims(conv_input, -1, name='input')
input_dim = params.get('embeddings.dim') + 2 * params.get('embeddings.dist.dim')

## convolutional & pooling layers
with tf.variable_scope('conv') as scope:
    pool_tensors = []
    # configured kernel sizes are 2, 3, 4, 5
    for w_size in params.get('window'):
        # conv + max-pooling (Text-CNN style)
        fw = variable_on_device(name='fw_' + str(w_size),
                                shape=[w_size, input_dim, 1, params.get('nfeature_map')],
                                initializer=tf.random_uniform_initializer(
                                    -params["embeddings.init_scale"],
                                    params["embeddings.init_scale"]),
                                device=params.get('device'))
        # shape: [batch_size, seq_len - w_size + 1, 1, nfeature_map]
        conv = tf.nn.conv2d(input=conv_input, filter=fw,
                            strides=[1, 1, 1, 1], padding='VALID')
        biases = variable_on_device(name='biases_' + str(w_size),
                                    shape=[params.get('nfeature_map')],
                                    initializer=tf.constant_initializer(0.0),
                                    device=params.get('device'))
        bias = tf.nn.bias_add(conv, biases)
        relu = tf.nn.relu(bias, name=scope.name)
        conv_len = relu.get_shape()[1]
        # shape: [batch_size, 1, 1, nfeature_map]
        pool = tf.nn.max_pool(relu, ksize=[1, conv_len, 1, 1],
                              strides=[1, 1, 1, 1], padding='VALID')
        pool = tf.squeeze(pool, squeeze_dims=[1, 2])
        # shape: [batch_size, nfeature_map]
        pool_tensors.append(pool)

## pooling & concatenation
num_filters = len(params.get('window'))
pool_size = num_filters * params.get('nfeature_map')
pool_layer = tf.concat(pool_tensors, -1, name='pool')
pool_flat = tf.reshape(pool_layer, [-1, pool_size])

## dropout layer
pool_dropout = tf.nn.dropout(pool_flat, keep_prob=self.dropout_keep_proba)

## dense projection layer
input_ = pool_dropout
input_size = pool_size
with tf.variable_scope('fc') as scope:
    W = variable_on_device(name='W', shape=[input_size, params.get('nclass')],
                           initializer=tf.random_uniform_initializer(
                               -params["embeddings.init_scale"],
                               params["embeddings.init_scale"]),
                           device=params.get('device'))
    biases = variable_on_device(name='biases', shape=[params.get('nclass')],
                                initializer=tf.constant_initializer(0.01),
                                device=params.get('device'))
    ## dense layer operation
    self.logits = tf.nn.bias_add(tf.matmul(input_, W), biases)

## softmax
self.pred_probas = tf.nn.softmax(self.logits, name='class_proba')
self.preds = tf.argmax(self.pred_probas, axis=-1, name='class_prediction')

# loss computed from the graph's outputs
self._loss = self._loss(params)
self.l2loss = self._l2loss(params)
self.loss = self._loss + self.l2loss
The _loss method adds the ranking loss:
@staticmethod
def ranking_loss(params, labels, logits, batch_size):
    lm = tf.constant(params.get('lm'))              # the scaling factor gamma ("lambda" in the code)
    m_plus = tf.constant(params.get('margin_plus'))
    m_minus = tf.constant(params.get('margin_minus'))

    L = tf.constant(0.0)
    i = tf.constant(0)
    cond = lambda i, L: tf.less(i, batch_size)

    def loop_body(i, L):
        # gold (positive) class of this example
        cplus = labels[i]
        # most informative negative class: use the 2nd argmax if the gold class is the argmax
        _, cminus_indices = tf.nn.top_k(logits[i, :], k=2)
        cminus = tf.cond(tf.equal(cplus, cminus_indices[0]),
                         lambda: cminus_indices[1], lambda: cminus_indices[0])
        splus = logits[i, cplus]    # score of the gold class
        sminus = logits[i, cminus]  # score of the negative class
        # see the loss formula above
        l = tf.log((1.0 + tf.exp((lm * (m_plus - splus))))) + \
            tf.log((1.0 + tf.exp((lm * (m_minus + sminus)))))
        return [tf.add(i, 1), tf.add(L, l)]

    _, L = tf.while_loop(cond, loop_body, loop_vars=[i, L])
    nbatch = tf.to_float(batch_size)
    L = L / nbatch
    return L
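The tf.while_loop above walks over the batch one example at a time. The same loss can also be computed without the loop; a hedged vectorized sketch (an alternative, not the repo's code) could look like this:
def ranking_loss_vectorized(params, labels, logits, batch_size):
    # vectorized alternative to the while_loop version above (assumption, not from the repo)
    lm = params.get('lm')
    m_plus = params.get('margin_plus')
    m_minus = params.get('margin_minus')
    labels = tf.cast(labels, tf.int32)
    idx = tf.range(batch_size, dtype=tf.int32)
    # score of the gold class for every example in the batch
    splus = tf.gather_nd(logits, tf.stack([idx, labels], axis=1))
    # most informative negative class: mask out the gold class, then take the max score
    mask = tf.one_hot(labels, tf.shape(logits)[1], on_value=-1e9, off_value=0.0)
    sminus = tf.reduce_max(logits + mask, axis=1)
    loss = tf.log(1.0 + tf.exp(lm * (m_plus - splus))) + \
           tf.log(1.0 + tf.exp(lm * (m_minus + sminus)))
    return tf.reduce_mean(loss)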
_l2loss adds the L2 regularization:
def _l2loss(self, params):
    vars_ = [v for v in tf.trainable_variables() if 'biases' not in v.name
             and 'W_d' not in v.name and 'W_s' not in v.name]
    l2loss = tf.multiply(tf.add_n([tf.nn.l2_loss(v) for v in vars_]),
                         params.get('l2'), name='l2loss')
    return l2loss
Relation Classification via Multi-Level Attention CNNs
Background
An effective solution needs to be able to account for useful semantic and syntactic features not only for the meanings of the target entities at the lexical level, but also for their immediate context and for the overall sentence structure.
these models often fail to identify critical cues, and many of them still require an external dependency parser
Capturing lexical, semantic, and syntactic cues is important; many existing models fail to do so or rely on an external dependency parser.
Model
A multi-level attention model captures entity-specific attention and relation-specific pooling attention, trained with a pair-wise margin-based objective function.
- Input Representation: word embeddings for lexical-semantic features plus two position embeddings; the three embeddings are concatenated.
- Input Attention Mechanism: captures the relevance of words with respect to the target entities.
1) Contextual Relevance Matrices: the relevance between each word and the entities;
2) Input Attention Composition: combines the relevance scores.
- Convolutional Max-Pooling with Secondary Attention: extracts more abstract, higher-level features.
1) Convolution Layer: conv + tanh, learns to recognize short phrases such as trigrams.
2) Attention-Based Pooling: determines which of the convolved context windows from the sentence are relevant to the relation and which are not.
$G = R^{*T} U W^L$, where $U$ is a weight matrix, $R^{*T}$ is the convolved context output of the previous step, and $W^L$ is the relation embedding matrix.
A softmax is then applied:
$A_{i,j}^P = \frac{\exp(G_{i,j})}{\sum_{i'=1}^{n}\exp(G_{i',j})}$
Max pooling: $A_{i,j}^P$ is multiplied with the convolved output $R^*$ to highlight the important individual phrase-level components, and max-pooling then keeps the most salient parts:
$W_{i}^O = \max_j \left(R^* A^P\right)_{i,j}$
- Loss: a pair-wise margin-based objective function
$L = \left[\delta_\theta(S,y) + \left(1 - \delta_\theta(S,\hat{y}^-)\right)\right] + \beta\|\theta\|^2 = \left[1 + \left\|\frac{W^O}{|W^O|} - W_y^L\right\| - \left\|\frac{W^O}{|W^O|} - W_{\hat{y}^-}^L\right\|\right] + \beta\|\theta\|^2$
where the distance function is:
$\delta_\theta(S,y) = \left\|\frac{W^O}{|W^O|} - W_y^L\right\|$
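To make the attention-based pooling and the distance function concrete, here is a small numpy walk-through of $G = R^{*T}UW^L$, the column-wise softmax, the max-pooling $W_i^O = \max_j(R^*A^P)_{i,j}$, and the distance $\delta_\theta(S,y)$; all dimensions and values are illustrative assumptions.
import numpy as np

def softmax_cols(G):
    e = np.exp(G - G.max(axis=0, keepdims=True))
    return e / e.sum(axis=0, keepdims=True)

n, dc, nr = 7, 8, 4                  # sentence length, filters, relation classes (toy sizes)
R = np.random.randn(n, dc)           # convolved windows, one row per window (R^{*T} in the paper's notation)
U = np.random.randn(dc, nr)          # weight matrix U
W_L = np.random.randn(nr, dc)        # relation embeddings W^L, one row per relation

G = R @ U @ W_L                      # G = R^{*T} U W^L, shape (n, dc)
A_P = softmax_cols(G)                # A^P_{i,j} = exp(G_{i,j}) / sum_{i'} exp(G_{i',j})

# w^O_i = max_j (R^* A^P)_{i,j}: re-weight the windows, then max-pool
w_O = (R.T @ A_P).max(axis=1)        # (dc,)

# delta_theta(S, y) = || w^O / |w^O| - W^L_y ||, one distance per candidate relation
w_O_norm = w_O / np.linalg.norm(w_O)
delta = np.linalg.norm(w_O_norm - W_L, axis=1)
print(delta.argmin())                # prediction = the relation with the closest embedding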
Code
def __init__(self, config, embeddings, is_training=True):
    bz = config.batch_size
    n = config.max_len
    dw = config.embedding_size
    dp = config.pos_embed_size
    d = dw + 2 * dp
    np = config.pos_embed_num
    nr = config.classnum      # number of relations
    dc = config.num_filters
    keep_prob = config.keep_prob
    self.config = config

    with tf.name_scope('input'):
        in_x = tf.placeholder(dtype=tf.int32, shape=[bz, n], name='in_x')  # sentences
        in_e1 = tf.placeholder(dtype=tf.int32, shape=[bz], name='in_e1')
        in_e2 = tf.placeholder(dtype=tf.int32, shape=[bz], name='in_e2')
        in_dist1 = tf.placeholder(dtype=tf.int32, shape=[bz, n], name='in_dist1')
        in_dist2 = tf.placeholder(dtype=tf.int32, shape=[bz, n], name='in_dist2')
        in_y = tf.placeholder(dtype=tf.int32, shape=[bz], name='in_y')     # relations
        self.inputs = (in_x, in_e1, in_e2, in_dist1, in_dist2, in_y)

    with tf.name_scope('embeddings'):
        initializer = tf.truncated_normal_initializer(stddev=0.1)
        embed = tf.get_variable(initializer=embeddings, dtype=tf.float32, name='word_embed')
        pos1_embed = tf.get_variable(shape=[np, dp], name='position1_embed')
        pos2_embed = tf.get_variable(shape=[np, dp], name='position2_embed')
        rel_embed = tf.get_variable(initializer=initializer, shape=[nr, dc], name='relation_embed')

        # embedding lookup
        e1 = tf.nn.embedding_lookup(embed, in_e1, name='e1')                # bz, dw
        e2 = tf.nn.embedding_lookup(embed, in_e2, name='e2')                # bz, dw
        x = tf.nn.embedding_lookup(embed, in_x, name='x')                   # bz, n, dw
        dist1 = tf.nn.embedding_lookup(pos1_embed, in_dist1, name='dist1')  # bz, n, dp
        dist2 = tf.nn.embedding_lookup(pos2_embed, in_dist2, name='dist2')  # bz, n, dp
        y = tf.nn.embedding_lookup(rel_embed, in_y, name='y')               # bz, dc

        # concatenate the word embeddings and position embeddings
        x_concat = tf.reshape(tf.concat([x, dist1, dist2], -1),             # bz, n, d
                              [bz, n, d, 1])
        if is_training and keep_prob < 1:
            x_concat = tf.nn.dropout(x_concat, keep_prob)

    self.l2_loss = tf.nn.l2_loss(rel_embed)

    with tf.name_scope('forword'):
        alpha = self._input_attention(x, e1, e2, initializer=initializer)
        R = self._convolution(x_concat, initializer=initializer, alpha=alpha)
        wo = self._attentive_pooling(R, rel_embed, initializer=initializer)
        if is_training and keep_prob < 1:
            wo = tf.nn.dropout(wo, keep_prob)

    self._loss_and_train(wo, rel_embed, in_y, y, is_training)
Compute the primary (input) attention, i.e., the relevance between each word and the two entities, and combine the two distributions.
def _input_attention(self, x, e1, e2, initializer=None):
    bz = self.config.batch_size
    n = self.config.max_len

    with tf.name_scope('input_attention'):
        # inner product between each word embedding and the two entity embeddings
        A1 = tf.matmul(x, tf.expand_dims(e1, -1))   # bz, n, 1
        A2 = tf.matmul(x, tf.expand_dims(e2, -1))
        A1 = tf.reshape(A1, [bz, n])
        A2 = tf.reshape(A2, [bz, n])
        alpha1 = tf.nn.softmax(A1)                  # bz, n
        alpha2 = tf.nn.softmax(A2)                  # bz, n
        # combine the two attention distributions
        alpha = (alpha1 + alpha2) / 2
        return alpha
The convolution is implemented as follows:
def _convolution(self, x_concat, initializer=None, alpha=None):
    bz = self.config.batch_size
    n = self.config.max_len
    k = self.config.slide_window
    dw = self.config.embedding_size
    dp = self.config.pos_embed_size
    d = dw + 2 * dp
    dc = self.config.num_filters

    with tf.name_scope('convolution'):
        # x: (batch_size, max_len, embedding_size, 1)
        # w: (filter_size, embedding_size, 1, num_filters)
        w = tf.get_variable(initializer=initializer, shape=[k, d, 1, dc], name='weight')
        b = tf.get_variable(initializer=initializer, shape=[dc], name='bias')
        conv = tf.nn.conv2d(x_concat, w, strides=[1, 1, d, 1], padding="SAME")  # bz, n, 1, dc
        R = tf.nn.tanh(tf.nn.bias_add(conv, b), name="R")                       # bz, n, 1, dc
        R = tf.reshape(R, [bz, n, dc])
        # apply the input (primary) attention weights to each convolved window
        R = tf.multiply(R, tf.reshape(alpha, [bz, n, 1]))                       # bz, n, dc
        self.l2_loss += tf.nn.l2_loss(w)
        self.l2_loss += tf.nn.l2_loss(b)
        return R
The attention pooling is implemented as follows:
def _attentive_pooling(self, R, rel_embed, initializer=None):
    bz = self.config.batch_size
    n = self.config.max_len
    k = self.config.slide_window
    dw = self.config.embedding_size
    dp = self.config.pos_embed_size
    d = dw + 2 * dp
    dc = self.config.num_filters
    nr = self.config.classnum

    with tf.name_scope('attention_pooling'):
        # U: [dc, nr]
        U = tf.get_variable(initializer=initializer, shape=[dc, nr], name='U')
        # G = R^{*T} U W^L
        G = tf.matmul(tf.reshape(R, [bz * n, dc]), U)   # (bz*n, dc), (dc, nr) => (bz*n, nr)
        G = tf.matmul(G, rel_embed)                     # (bz*n, nr), (nr, dc) => (bz*n, dc)
        G = tf.reshape(G, [bz, n, dc])
        # attention pooling tensor: softmax over the sentence dimension
        AP = tf.nn.softmax(G, dim=1)
        # predict
        wo = tf.matmul(
            tf.transpose(R, perm=[0, 2, 1]),  # batch transpose: (bz, n, dc) => (bz, dc, n)
            AP
        )                                     # (bz, dc, dc)
        # wo = tf.reduce_max(wo, axis=-1)     # (bz, dc)
        wo = tf.nn.max_pool(tf.expand_dims(wo, -1),
                            ksize=[1, 1, dc, 1],
                            strides=[1, 1, dc, 1],
                            padding="SAME")   # (bz, dc, 1, 1)
        wo = tf.reshape(wo, [bz, dc])
        self.l2_loss += tf.nn.l2_loss(U)
        return wo
The loss function and training op are as follows:
def _loss_and_train(self, wo, rel_embed, in_y, y, is_training):
    nr = self.config.classnum

    with tf.name_scope('predict'):
        wo_norm = tf.nn.l2_normalize(wo, 1)
        wo_norm_tile = tf.tile(tf.expand_dims(wo_norm, axis=1), [1, nr, 1])
        # distance between the sentence representation and every relation embedding
        all_distance = tf.norm(wo_norm_tile - tf.nn.l2_normalize(rel_embed, dim=1), axis=2)
        predict = tf.argmin(all_distance, axis=1)
        predict = tf.cast(predict, dtype=tf.int32)
        acc = tf.reduce_sum(tf.cast(tf.equal(predict, in_y), dtype=tf.int32))
        self.predict = predict
        self.acc = acc

    if not is_training:
        return

    with tf.name_scope('loss'):
        mask = tf.one_hot(in_y, nr, on_value=1000., off_value=0.)   # bz, nr
        # the mask excludes the positive label when picking the negative class
        neg_y = tf.argmin(tf.add(all_distance, mask), axis=1)       # bz,
        neg_y = tf.nn.embedding_lookup(rel_embed, neg_y)            # bz, dc
        neg_distance = tf.norm(wo_norm - tf.nn.l2_normalize(neg_y, dim=1), axis=1)
        pos_distance = tf.norm(wo_norm - tf.nn.l2_normalize(y, dim=1), axis=1)
        # pair-wise margin-based objective function
        loss = tf.reduce_mean(pos_distance + (self.config.margin - neg_distance))
        self.loss = loss + 0.003 * self.config.l2_reg_lambda * self.l2_loss

    with tf.name_scope('optimizer'):
        optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
        global_step = tf.Variable(0, trainable=False, name='global_step')
        self.train_op = optimizer.minimize(self.loss)
        # self.reg_op = reg_op
        self.reg_op = tf.no_op()
        self.global_step = global_step
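Finally, a hedged sketch of how this graph could be driven in a TF 1.x session; model is assumed to be an instance of the class above, and next_batch() is a hypothetical helper that yields already-padded numpy arrays matching the placeholder shapes.
import tensorflow as tf

num_steps = 10000   # assumed number of training steps
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(num_steps):
        x, e1, e2, dist1, dist2, y = next_batch()            # hypothetical batching helper
        feed = dict(zip(model.inputs, (x, e1, e2, dist1, dist2, y)))
        _, loss, acc = sess.run([model.train_op, model.loss, model.acc], feed_dict=feed)
        if step % 100 == 0:
            print(step, loss, acc)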