参照了大佬对应网站的写法
CRF对应的定义
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow_addons as tfa
class CRF(tf.keras.layers.Layer):
    """Conditional Random Field layer for ``tf.keras``.

    ``CRF`` is meant to be the last layer of a sequence-tagging network. The
    size of the last input axis is used as the number of tags, so a linear
    layer (``Dense`` without activation) is recommended directly before it.

    Args:
        sparse_target (bool): stored on the layer and written to the config.
            None of the methods of this class read it.
        **kwargs: forwarded to ``tf.keras.layers.Layer``. The keys
            ``output_dim`` and ``transitions`` (as emitted by ``get_config``)
            are accepted and discarded so that
            ``CRF.from_config(layer.get_config())`` works.

    Input shape:
        3D tensor with shape ``(batch_size, sentence length, num_classes)``.

    Output shape:
        When ``training`` is truthy the input is returned unchanged.
        Otherwise the Viterbi-decoded tag ids returned by
        ``tfa.text.crf_decode`` are returned, cast to the input dtype.

    Masking:
        This layer supports keras masking for input data with a variable
        number of timesteps. To introduce masks to your data, use an
        embedding layer with the ``mask_zero`` parameter set to ``True`` or
        add a Masking layer before this layer.
    """

    def __init__(self, sparse_target=True, **kwargs):
        # get_config() emits these build-time values. The base Layer
        # constructor rejects unknown keyword arguments, so drop them here;
        # output_dim and the transition weight are (re)created in build().
        kwargs.pop("output_dim", None)
        kwargs.pop("transitions", None)
        super(CRF, self).__init__(**kwargs)
        self.sparse_target = sparse_target
        self.transitions = None
        self.sequence_lengths = None
        self.mask = None
        self.output_dim = None

    def get_config(self):
        """Return the layer config, including the current transition matrix.

        ``transitions`` is ``None`` while the layer has not been built.
        """
        transitions = None
        if self.transitions is not None:
            transitions = K.eval(self.transitions)
        config = {
            "sparse_target": self.sparse_target,
            "output_dim": self.output_dim,
            "transitions": transitions,
        }
        base_config = super(CRF, self).get_config()
        return {**base_config, **config}

    def build(self, input_shape):
        """Create the square tag-to-tag transition weight."""
        # The number of tags is the size of the last input axis.
        self.output_dim = input_shape[-1]
        self.transitions = self.add_weight(
            name="transitions",
            shape=[self.output_dim, self.output_dim],
            initializer="glorot_uniform",
            trainable=True,
        )

    def call(self, inputs, mask=None, training=None):
        """Record sequence lengths, then pass through or Viterbi-decode.

        Args:
            inputs: tensor indexed as ``inputs[:, :, 0]``, i.e. rank 3.
            mask: optional keras mask; when given, each sequence length is
                the number of truthy mask entries along the last mask axis.
            training: when truthy, ``inputs`` is returned unchanged.

        Returns:
            ``inputs`` itself in training mode, otherwise the decoded tag
            sequence cast to ``inputs.dtype``.
        """
        if mask is not None:
            self.sequence_lengths = K.sum(K.cast(mask, 'int32'), axis=-1)
        else:
            # Without a mask every sequence counts its full time axis: fill
            # one feature slice with ones and sum it per sample.
            self.sequence_lengths = K.sum(
                K.ones_like(inputs[:, :, 0], dtype='int32'), axis=-1
            )
        # Always overwrite so a mask from an earlier call cannot leak into
        # accuracy() after a later call made without a mask.
        self.mask = mask
        if training:
            return inputs
        viterbi_sequence, _ = tfa.text.crf_decode(
            inputs, self.transitions, self.sequence_lengths
        )
        # TensorFlow requires both branches to return the same dtype.
        return K.cast(viterbi_sequence, inputs.dtype)

    def loss(self, y_true, y_pred):
        """Mean negative CRF log-likelihood of ``y_true`` under ``y_pred``.

        Uses the sequence lengths recorded by the most recent ``call``.
        """
        # A rank-3 target is reduced to tag ids with argmax.
        if len(K.int_shape(y_true)) == 3:
            y_true = K.argmax(y_true, axis=-1)
        # A rank-2 prediction holds tag ids; expand it to one-hot scores.
        if len(y_pred.shape) == 2:
            y_pred = K.one_hot(K.cast(y_pred, 'int32'), self.output_dim)
        log_likelihood, _ = tfa.text.crf_log_likelihood(
            y_pred,
            y_true,
            self.sequence_lengths,
            transition_params=self.transitions,
        )
        return tf.reduce_mean(-log_likelihood)

    def compute_output_shape(self, input_shape):
        """Return the first two input dimensions followed by ``output_dim``."""
        return tuple(input_shape[:2]) + (self.output_dim,)

    def compute_mask(self, inputs, mask=None):
        """Propagate the incoming mask unchanged."""
        return mask

    def accuracy(self, y_true, y_pred):
        """Tag accuracy, using CRF decoding when ``y_pred`` is rank 3.

        With a recorded mask only masked-in positions are counted; otherwise
        the number of matches is divided by the sum of the recorded sequence
        lengths.
        """
        mask = self.mask
        if len(K.int_shape(y_true)) == 3:
            y_true = K.argmax(y_true, axis=-1)
        if len(y_pred.shape) == 3:
            y_pred, _ = tfa.text.crf_decode(
                y_pred, self.transitions, self.sequence_lengths
            )
        y_true = K.cast(y_true, y_pred.dtype)
        # Do the arithmetic in the backend float type so numerator and
        # denominator always share a dtype, whether y_pred holds decoded
        # int32 tags or already-cast float tags.
        matches = K.cast(K.equal(y_true, y_pred), K.floatx())
        if mask is None:
            total = K.cast(K.sum(self.sequence_lengths), K.floatx())
            return K.sum(matches) / total
        weights = K.cast(mask, K.floatx())
        return K.sum(matches * weights) / K.sum(weights)
测试样例如下(不使用混合精度):
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.models import Model
import tensorflow as tf
from tf2crf import CRF
# Seed TensorFlow's random generator with a fixed value before the example runs.
tf.random.set_seed(200)
def test():
    """Fit a small Embedding -> BiGRU -> Dense -> CRF tagger on toy data.

    This variant runs without mixed precision and without input masking
    (``mask_zero=False``). The trained model is saved to ``'model'``.
    """
    token_ids = Input(shape=(None,), dtype='int32')
    embedded = Embedding(100, 40, trainable=True, mask_zero=False)(token_ids)
    encoded = Bidirectional(GRU(64, return_sequences=True))(embedded)
    # Linear projection to 9 scores per timestep, fed to the CRF layer.
    emissions = Dense(9, activation=None)(encoded)

    crf = CRF(dtype='float32')
    model = Model(token_ids, crf(emissions))
    model.compile(loss=crf.loss, optimizer='adam', metrics=[crf.accuracy])

    # Ten identical samples, each nine tokens long with nine matching tags.
    features = [[5, 2, 3] * 3 for _ in range(10)]
    labels = [[1, 2, 3] * 3 for _ in range(10)]
    model.fit(x=features, y=labels, epochs=10, batch_size=4)
    model.save('model')


if __name__ == '__main__':
    test()
使用混合精度之后
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, Dense
from tensorflow.keras.models import Model
from tf2crf import CRF
from tensorflow.keras.mixed_precision import experimental as mixed_precision
# Create a 'mixed_float16' policy and install it with set_policy before the
# model below is built.
# NOTE(review): this relies on the `experimental` mixed-precision module;
# newer TensorFlow releases are expected to provide
# `tf.keras.mixed_precision.set_global_policy` instead — confirm against the
# installed version.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_policy(policy)
def test():
    """Fit the toy Embedding -> BiGRU -> Dense -> CRF tagger with masking.

    Same pipeline as the plain example, but the embedding uses
    ``mask_zero=True`` and training runs for 2 epochs with batch size 2.
    The CRF layer is created with ``dtype='float32'``. The trained model is
    saved to ``'model'``.
    """
    token_ids = Input(shape=(None,), dtype='int32')
    embedded = Embedding(100, 40, trainable=True, mask_zero=True)(token_ids)
    encoded = Bidirectional(GRU(64, return_sequences=True))(embedded)
    # Linear projection to 9 scores per timestep, fed to the CRF layer.
    emissions = Dense(9, activation=None)(encoded)

    crf = CRF(dtype='float32')
    model = Model(token_ids, crf(emissions))
    model.compile(loss=crf.loss, optimizer='adam', metrics=[crf.accuracy])

    # Ten identical samples, each nine tokens long with nine matching tags.
    features = [[5, 2, 3] * 3 for _ in range(10)]
    labels = [[1, 2, 3] * 3 for _ in range(10)]
    model.fit(x=features, y=labels, epochs=2, batch_size=2)
    model.save('model')


if __name__ == '__main__':
    test()
下面解读一下这段对应的crf代码
首先我认为非常精彩的部分在于sequence_lengths的计算过程
if mask is not None:
self.sequence_lengths = K.sum(K.cast(mask, 'int32'), axis=-1)
self.mask = mask
else:
self.sequence_lengths = K.sum(K.ones_like(inputs[:, :, 0], dtype='int32'), axis=-1)
比如输入的inputs形状为(None,None,768),那么inputs[:,:,0]的形状为(None,None);K.ones_like(inputs[:,:,0])会生成一个与inputs[:,:,0]形状一致的全1的tensor,接着沿最后一个维度(也就是inputs的第二个维度)把数值相加,得到一个形状为(batch_size,)的长度tensor。需要注意的是,在没有mask的else分支中,每个句子的长度都等于补齐之后的序列长度,比如batch_size = 5、序列长度为9时,得到的是[9,9,9,9,9];只有在提供了mask的分支中,才会按照mask统计出各个句子的真实长度,比如batch_size = 5,这里存放的可以是[3,6,7,8,9],代表第一个句子长度为3,第二个句子长度为6,第三个句子长度为7,第四个句子长度为8,第五个句子长度为9。
这里的巧妙之处就在于使用一个K.ones_like去将原先的形状用1填满,然后使用K.sum将最后一维的1全部相加,这样即使batch_size和序列长度在建图时还是未知的(None),也能在运行时动态地得到每个序列的长度