Source Code Walkthrough of paddlenlp.layers.LinearChainCrf
Motivation
This was my first information-extraction competition. After getting the official baseline running, I tried stacking extra network layers on top of its pretrained model and settled on a Bi-GRU + CRF architecture. Debugging that code meant stepping into quite a few pitfalls, so I decided to start from the source and work through it step by step. This post analyzes the paddlenlp.layers.LinearChainCrf class.
Understanding the parameters of def __init__(self, num_labels, crf_lr=0.1, with_start_stop_tag=True)
def __init__(self, num_labels, crf_lr=0.1, with_start_stop_tag=True):
    super(LinearChainCrf, self).__init__()
    if with_start_stop_tag:
        self.num_tags = num_labels + 2  # Additional [START] and [STOP]
        self.start_idx = int(self.num_tags - 1)
        self.stop_idx = int(self.num_tags - 2)
    else:
        self.num_tags = num_labels
    self.transitions = self.create_parameter(
        # ParamAttr creates a parameter-attribute object through which the
        # user can set the parameter's name, initializer, learning rate,
        # regularization, trainability, gradient clipping, model averaging, etc.
        attr=paddle.ParamAttr(learning_rate=crf_lr),
        shape=[self.num_tags, self.num_tags],
        dtype='float32')
    self.with_start_stop_tag = with_start_stop_tag

    self._initial_alpha = None
    self._start_tensor = None
    self._stop_tensor = None
    self._batch_index = None
    self._seq_index = None
    self._batch_seq_index = None
num_labels (int): number of label types.
crf_lr (float): learning rate of the CRF layer.
with_start_stop_tag (bool): if True, the start and stop tags (playing the role of [CLS]/[SEP]) are taken into account by the CRF layer and the transition matrix is a [num_labels+2, num_labels+2] Tensor; otherwise the transition matrix has shape [num_labels, num_labels].
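A minimal sketch of how with_start_stop_tag changes the parameter shapes (assuming paddlenlp is installed; the label count 4 is an arbitrary example):

from paddlenlp.layers import LinearChainCrf

crf = LinearChainCrf(num_labels=4, crf_lr=0.1, with_start_stop_tag=True)
print(crf.num_tags)                 # 6 = 4 labels + [START] + [STOP]
print(crf.start_idx, crf.stop_idx)  # 5 4
print(crf.transitions.shape)        # [6, 6]

crf_plain = LinearChainCrf(num_labels=4, with_start_stop_tag=False)
print(crf_plain.transitions.shape)  # [4, 4]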
def _initialize_alpha(self, batch_size)
def _initialize_alpha(self, batch_size):
    # alpha accumulates the path scores used to reach each possible next tag
    if self._initial_alpha is None or batch_size > self._initial_alpha.shape[0]:
        # paddle.full creates a Tensor of the given shape and dtype whose
        # elements are all fill_value; every ordinary tag starts from a
        # very small value.
        initial_alpha = paddle.full(
            (batch_size, self.num_tags - 1),
            dtype='float32',
            fill_value=-10000.)
        # alpha_start fill_value = 0. > -10000., meaning that at the first
        # step only the [START] tag carries a competitive score.
        alpha_start = paddle.full(
            (batch_size, 1), dtype='float32', fill_value=0.)
        self._initial_alpha = paddle.concat(
            [initial_alpha, alpha_start], axis=1)
    return self._initial_alpha[:batch_size, :]

batch_size (int): number of samples per batch.
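To make that layout concrete, here is a standalone re-creation of the logic above (num_tags = 4 and batch_size = 2 are made-up values; the private method itself is not called). The [START] tag sits at index num_tags - 1, the only position scored 0, so every path is effectively forced to begin at [START]:

import paddle

batch_size, num_tags = 2, 4
initial_alpha = paddle.full((batch_size, num_tags - 1),
                            dtype='float32', fill_value=-10000.)
alpha_start = paddle.full((batch_size, 1), dtype='float32', fill_value=0.)
print(paddle.concat([initial_alpha, alpha_start], axis=1))
# [[-10000., -10000., -10000., 0.],
#  [-10000., -10000., -10000., 0.]]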
Loss
$P(y|x) = \dfrac{e^{s(x,y)}}{\sum_{\tilde y \in Y_x} e^{s(x,\tilde y)}}$

The training objective is to maximize $\log P(y|x) = s(x,y) - \log \sum_{\tilde y \in Y_x} e^{s(x,\tilde y)}$,

where $s(x,y) = \mathrm{score}(x,y) = \sum_{i=1}^{l} \mathrm{Emit}(x_i, y_i) + \sum_{i=1}^{l} \mathrm{Trans}(y_{i-1}, y_i)$ and $l$ is the sequence length. The shorthand used later is $p(y_i) = \mathrm{Emit}(x_i, y_i)$ and $T(y_{i-1}, y_i) = \mathrm{Trans}(y_{i-1}, y_i)$.
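A tiny worked instance of $s(x,y)$ (all numbers are made up; transitions into [START]/[STOP] are omitted for brevity):

import paddle

emit = paddle.to_tensor([[1.0, 0.5, 0.2],    # Emit(x_1, .)
                         [0.3, 2.0, 0.1]])   # Emit(x_2, .)
trans = paddle.to_tensor([[0.1, 0.4, 0.0],
                          [0.2, 0.1, 0.3],
                          [0.5, 0.0, 0.2]])  # Trans(prev, next)
y = [0, 1]  # candidate tag sequence of length l = 2

# s(x, y) = Emit(x_1, y_1) + Emit(x_2, y_2) + Trans(y_1, y_2)
score = emit[0][y[0]] + emit[1][y[1]] + trans[y[0]][y[1]]
print(float(score))  # 1.0 + 2.0 + 0.4 = 3.4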
def forward(self, inputs, lengths)
This function computes the normalization term $\log \sum_{\tilde y \in Y_x} e^{s(x,\tilde y)}$ over all candidate tag sequences via the forward algorithm, where $s(x,y) = \sum_{i=1}^{l} \mathrm{Emit}(x_i, y_i) + \sum_{i=1}^{l} \mathrm{Trans}(y_{i-1}, y_i)$ as defined above.
def forward(self, inputs, lengths):
    # inputs: [batch_size, seq_len, n_labels]
    # lengths: [batch_size]
    batch_size, seq_len, n_labels = inputs.shape
    # unsqueeze(-1) appends a dimension of size 1; expand can only
    # broadcast dimensions whose size is 1.
    # inputs_t_exp: [seq_len, batch_size, n_labels, n_labels]
    inputs_t_exp = inputs.transpose([1, 0, 2]).unsqueeze(-1).expand(
        [seq_len, batch_size, n_labels, n_labels])
    # trans_exp: [batch_size, n_labels, n_labels]
    trans_exp = self.transitions.unsqueeze(0).expand(
        [batch_size, n_labels, n_labels])
    all_alpha = []
    if self.with_start_stop_tag:
        # alpha: [batch_size, n_labels]
        alpha = self._initialize_alpha(batch_size)
    for i, input_exp in enumerate(inputs_t_exp):
        # input_exp: [batch_size, n_labels, n_labels]
        # alpha_exp: [batch_size, n_labels, n_labels]
        if i == 0 and not self.with_start_stop_tag:
            mat = input_exp
        else:
            alpha_exp = alpha.unsqueeze(1).expand(
                [batch_size, n_labels, n_labels])
            # F(n) = logsumexp(F(n-1) + p(y_n) + T(y_{n-1}, y_n))
            mat = input_exp + trans_exp + alpha_exp
        alpha = paddle.logsumexp(mat, 2)
        all_alpha.append(alpha)
    # Get the valid alpha
    all_alpha = paddle.stack(all_alpha).transpose([1, 0, 2])
    batch_index = self._get_batch_index(batch_size)
    last_index = lengths - 1
    idxs = paddle.stack([batch_index, last_index], axis=1)
    alpha = paddle.gather_nd(all_alpha, idxs)
    if self.with_start_stop_tag:
        # The last one step
        alpha += self.transitions[self.stop_idx].unsqueeze(0)
    norm_score = paddle.logsumexp(alpha, 1)
    return norm_score
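For shape intuition, a hedged usage sketch (random numbers only). Note that n_labels inside forward() equals num_tags, so with with_start_stop_tag=True the emission input must already carry num_labels + 2 score columns to match the transition matrix:

import paddle
from paddlenlp.layers import LinearChainCrf

crf = LinearChainCrf(num_labels=4, with_start_stop_tag=True)  # num_tags = 6
inputs = paddle.rand([2, 5, 6])     # [batch_size, seq_len, num_tags]
lengths = paddle.to_tensor([5, 3])  # true length of each sequence
norm_score = crf(inputs, lengths)
print(norm_score.shape)             # [2]: one normalizer per sequence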
A note on unsqueeze() and expand():

inputs_t_exp = inputs.transpose([1, 0, 2]).unsqueeze(-1).expand(
    [seq_len, batch_size, n_labels, n_labels])

inputs starts with shape [batch_size, seq_len, n_labels]; transpose([1, 0, 2]) swaps the first two dimensions to give [seq_len, batch_size, n_labels]; unsqueeze(-1) appends a dimension of size 1, giving [seq_len, batch_size, n_labels, 1]; and expand() broadcasts that size-1 dimension, yielding [seq_len, batch_size, n_labels, n_labels].
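The same chain of calls can be traced step by step (the shapes 2, 3, 4 below are made-up toy values):

import paddle

inputs = paddle.rand([2, 3, 4])   # [batch_size=2, seq_len=3, n_labels=4]
x = inputs.transpose([1, 0, 2])   # [3, 2, 4]
x = x.unsqueeze(-1)               # [3, 2, 4, 1]
x = x.expand([3, 2, 4, 4])        # the size-1 axis is broadcast to 4
print(x.shape)                    # [3, 2, 4, 4]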
logsumexp(x, axis) computes $\log \sum e^{x}$ along the given axis, for example:
x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]])
out1 = paddle.logsumexp(x) # [3.4691226]
out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602]
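To tie the pieces together, here is a sketch that cross-checks the forward algorithm on a toy instance (made-up numbers, no [START]/[STOP] tags): the dynamic program and a brute-force enumeration over all 3^2 tag paths must produce the same normalizer.

import itertools
import paddle

emit = paddle.to_tensor([[1.0, 0.5, 0.2],
                         [0.3, 2.0, 0.1]])   # [seq_len=2, n_labels=3]
trans = paddle.to_tensor([[0.1, 0.4, 0.0],
                          [0.2, 0.1, 0.3],
                          [0.5, 0.0, 0.2]])  # trans[prev][next]

# Forward algorithm: F(1) = Emit(x_1, .), then
# F(2) = logsumexp over the previous tag of F(1) + T + Emit(x_2, .).
alpha = emit[0]                                          # [3]
mat = alpha.unsqueeze(1) + trans + emit[1].unsqueeze(0)  # [3, 3]
alpha = paddle.logsumexp(mat, axis=0)                    # [3]
norm_dp = paddle.logsumexp(alpha, axis=0)

# Brute force: logsumexp of s(x, y) over all 3**2 tag paths.
scores = [float(emit[0][y1] + trans[y1][y2] + emit[1][y2])
          for y1, y2 in itertools.product(range(3), repeat=2)]
norm_bf = paddle.logsumexp(paddle.to_tensor(scores), axis=0)

print(norm_dp, norm_bf)  # the two normalizers agree up to float error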
To be continued~