Slot-Gated Modeling for Joint Slot Filling and Intent Prediction代码解析
预处理
def createVocabulary(input_path, output_path, no_pad=False):
    """Write a frequency-sorted vocabulary file built from a tokenised text file.

    Every line of ``input_path`` is split on whitespace. Tokens made only of
    digits are collapsed into the single token ``'0'``. A literal ``'_UNK'``
    token is ignored, because ``'_UNK'`` is always written as a reserved entry.
    The output holds one token per line: the reserved entries first
    (``'_PAD'`` and ``'_UNK'``, or only ``'_UNK'`` when ``no_pad`` is true),
    then the counted tokens by descending frequency (ties keep first-seen
    order).

    Args:
        input_path: Path of the UTF-8 text file to scan.
        output_path: Path of the vocabulary file to create or overwrite.
        no_pad: When true, the ``'_PAD'`` entry is not written.

    Raises:
        TypeError: If either path is not a ``str``.
    """
    if not isinstance(input_path, str):
        raise TypeError('input_path should be string')
    if not isinstance(output_path, str):
        raise TypeError('output_path should be string')

    counts = {}
    with open(input_path, 'r', encoding='utf8') as fd, \
            open(output_path, 'w+', encoding='utf8') as out:
        for line in fd:
            # str.split() already drops the trailing newline characters.
            for w in line.split():
                if w == '_UNK':
                    # Skip only the reserved token itself; the former
                    # ``break`` also dropped every word after it on the line.
                    continue
                if w.isdigit():
                    w = '0'
                counts[w] = counts.get(w, 0) + 1

        reserved = ['_UNK'] if no_pad else ['_PAD', '_UNK']
        # sorted() is stable, so equally frequent tokens keep first-seen order.
        ordered = sorted(counts, key=counts.get, reverse=True)
        out.writelines(token + '\n' for token in reserved + ordered)
先创建一个vocab字典,用来统计输入文件(这里以seq.out为例)中每句话每个词对应的slot标签的出现次数。
再根据slot出现次数降序排列,并在最前面加入['_PAD', '_UNK'],写入到输出文件slot_vocab中
# Load the three vocabularies (input tokens, slot labels, intent labels) from
# the files of the same name under arg.vocab_path.
in_vocab, slot_vocab, intent_vocab = [
    loadVocabulary(os.path.join(arg.vocab_path, vocab_file))
    for vocab_file in ('in_vocab', 'slot_vocab', 'intent_vocab')
]
def loadVocabulary(path):
    """Load a vocabulary file that stores one token per line.

    Args:
        path: Path of the UTF-8 vocabulary file.

    Returns:
        A dict with two entries: ``'rev'`` is the list of tokens in file
        order, and ``'vocab'`` maps each token to its line index. If a token
        occurs more than once, the last index wins.

    Raises:
        TypeError: If ``path`` is not a ``str``.
    """
    if not isinstance(path, str):
        raise TypeError('path should be a string')

    with open(path, encoding='utf8') as fd:
        rev = [line.rstrip('\r\n') for line in fd]
    vocab = {token: index for index, token in enumerate(rev)}
    return {'vocab': vocab, 'rev': rev}
返回一个字典,该字典有两个键值对,第一个键值对的值还是个字典,第二个键值对的值是个列表
以intent_vocab的结果为例
def _make_processor(split_dir):
    """Build the DataProcessor for the dataset split stored under split_dir."""
    return DataProcessor(
        os.path.join(split_dir, arg.input_file),
        os.path.join(split_dir, arg.slot_file),
        os.path.join(split_dir, arg.intent_file),
        in_vocab, slot_vocab, intent_vocab,
        arg.maxlen
    )


# One processor per split; all three share the vocabularies and arg.maxlen.
train_processor = _make_processor(full_train_path)
valid_processor = _make_processor(full_valid_path)
test_processor = _make_processor(full_test_path)
class DataProcessor(object):
    """Read line-aligned input / slot / intent files into padded id arrays.

    Line *k* of the three files is expected to describe the same utterance:
    its tokens, one slot label per token, and its intent label.
    """

    def __init__(self, in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab,max_len):
        """Open the three data files and remember the vocabularies.

        Args:
            in_path: Path of the file with one tokenised utterance per line.
            slot_path: Path of the file with the slot labels of each utterance.
            intent_path: Path of the file with the intent label of each utterance.
            in_vocab: Vocabulary dict (as returned by loadVocabulary) for tokens.
            slot_vocab: Vocabulary dict for slot labels.
            intent_vocab: Vocabulary dict for intent labels.
            max_len: Target length handed to padSentence for every sequence.
        """
        # Open the files one by one; if a later open fails, close the handles
        # already acquired so they are not leaked.
        handles = []
        try:
            for path in (in_path, slot_path, intent_path):
                handles.append(open(path, 'r', encoding='utf8'))
        except Exception:
            for fd in handles:
                fd.close()
            raise
        self.__fd_in, self.__fd_slot, self.__fd_intent = handles
        self.__in_vocab = in_vocab
        self.__slot_vocab = slot_vocab
        self.__intent_vocab = intent_vocab
        self.max_len = max_len
        self.end = 0  # set to 1 once the input file has been read to its end

    def close(self):
        """Close the three underlying files."""
        self.__fd_in.close()
        self.__fd_slot.close()
        self.__fd_intent.close()

    def get_data(self):
        """Read every remaining sample, convert it to ids and pad it.

        The files are closed before returning (also when an error occurs),
        so this method can only be called once per instance.

        Returns:
            A tuple ``(in_data, slot_data, intents)`` of numpy arrays: the
            padded token ids, the padded slot-label ids, and the first
            intent id of every sample.

        Raises:
            ValueError: If a sample has a different number of tokens and
                slot labels, or has no intent label. (Previously the
                mismatch was printed and the process exited with status 0.)
        """
        batch_in = []    # token ids of each utterance
        batch_slot = []  # slot-label ids of each utterance
        intents = []     # intent id of each utterance
        try:
            # Iterate to the real end of file instead of a fixed 100000-line
            # cap, which silently dropped the rest of larger datasets.
            for line_no, raw_in in enumerate(self.__fd_in, start=1):
                text = raw_in.rstrip()
                slot_text = self.__fd_slot.readline().rstrip()
                intent_text = self.__fd_intent.readline().rstrip()

                inp = sentenceToIds(text, self.__in_vocab)
                slot = sentenceToIds(slot_text, self.__slot_vocab)
                intent = sentenceToIds(intent_text, self.__intent_vocab)

                if len(inp) != len(slot):
                    raise ValueError(
                        'line %d: %d tokens but %d slot labels: %r / %r'
                        % (line_no, len(inp), len(slot), text, slot_text))
                if not intent:
                    raise ValueError('line %d: missing intent label' % line_no)

                batch_in.append(inp)
                batch_slot.append(slot)
                intents.append(intent[0])
            self.end = 1
        finally:
            self.close()

        # padSentence is defined elsewhere; it presumably pads or truncates
        # each id list to self.max_len -- TODO confirm.
        in_data = np.asarray(
            [padSentence(list(ids), self.max_len, self.__in_vocab) for ids in batch_in])
        slot_data = np.asarray(
            [padSentence(list(ids), self.max_len, self.__slot_vocab) for ids in batch_slot])
        return in_data, slot_data, np.asarray(intents)
主函数
# Read the whole training split: padded token ids, padded slot ids, intent ids.
train_X, train_slot_y, train_intent_y = train_processor.get_data()
跳入到get_data方法中
def get_data(self):
    """Read every remaining sample, convert it to ids and pad it.

    The files are closed before returning (also when an error occurs), so
    this method can only be called once per instance.

    Returns:
        A tuple ``(in_data, slot_data, intents)`` of numpy arrays: the padded
        token ids, the padded slot-label ids, and the first intent id of
        every sample.

    Raises:
        ValueError: If a sample has a different number of tokens and slot
            labels, or has no intent label. (Previously the mismatch was
            printed and the process exited with status 0.)
    """
    batch_in = []    # token ids of each utterance
    batch_slot = []  # slot-label ids of each utterance
    intents = []     # intent id of each utterance
    try:
        # Iterate to the real end of file instead of a fixed 100000-line cap,
        # which silently dropped the rest of larger datasets.
        for line_no, raw_in in enumerate(self.__fd_in, start=1):
            text = raw_in.rstrip()
            slot_text = self.__fd_slot.readline().rstrip()
            intent_text = self.__fd_intent.readline().rstrip()

            inp = sentenceToIds(text, self.__in_vocab)
            slot = sentenceToIds(slot_text, self.__slot_vocab)
            intent = sentenceToIds(intent_text, self.__intent_vocab)

            if len(inp) != len(slot):
                raise ValueError(
                    'line %d: %d tokens but %d slot labels: %r / %r'
                    % (line_no, len(inp), len(slot), text, slot_text))
            if not intent:
                raise ValueError('line %d: missing intent label' % line_no)

            batch_in.append(inp)
            batch_slot.append(slot)
            intents.append(intent[0])
        self.end = 1
    finally:
        self.close()

    # padSentence is defined elsewhere; it presumably pads or truncates each
    # id list to self.max_len -- TODO confirm.
    in_data = np.asarray(
        [padSentence(list(ids), self.max_len, self.__in_vocab) for ids in batch_in])
    slot_data = np.asarray(
        [padSentence(list(ids), self.max_len, self.__slot_vocab) for ids in batch_slot])
    return in_data, slot_data, np.asarray(intents)
再跳入到
def sentenceToIds(data, vocab):
    """Convert a sentence into the list of its vocabulary ids.

    Args:
        data: Either a whitespace-separated string or a list of words.
        vocab: Vocabulary dict holding the word-to-id mapping under the key
            ``'vocab'`` (the structure returned by loadVocabulary).

    Returns:
        One id per word. Words made only of digits are looked up as ``'0'``;
        words missing from the vocabulary get the id of ``'_UNK'``.

    Raises:
        TypeError: If ``vocab`` is not a dict, or ``data`` is neither a
            string nor a list.
    """
    if not isinstance(vocab, dict):
        raise TypeError('vocab should be a dict that contains vocab and rev')
    word_to_id = vocab['vocab']

    if isinstance(data, str):
        words = data.split()
    elif isinstance(data, list):
        words = data
    else:
        raise TypeError('data should be a string or a list contains words')

    return [
        word_to_id.get('0' if str.isdigit(w) else w, word_to_id['_UNK'])
        for w in words
    ]
方法中,该方法是把一句话中的每个单词转换成对应词汇表中的数字(id)。其中,如果单词是纯数字形式的字符串,则统一替换为字符'0',再到词汇表中查找词'0'对应的id(注意这是词表中的词'0',而不是id为0的'_PAD')。
以下是词汇表
vocab.get(w, vocab['_UNK']) 方法,该方法会查找键w并返回该键对应的值;如果没有找到键w,则返回vocab['_UNK'],即'_UNK'对应的id(在以'_PAD'开头的词表中,该值为1)。
回到get_data方法中,第一个for循环把输入文件中的每一句话、每一句话对应的槽位和意图分别放入到batch_in、batch_slot、intents列表中,并统计出最长句子的长度max_len为35,超出了设置的20,因此第二个for循环就是把长度不足20的向后填充,超出20的向前截断。snips数据集中,共13084个样本。
返回到main方法中, train_X, train_slot_y, train_intent_y分别为2个秩为2的(13084,20)的ndarray数组和一个秩为1的数组。
def _count_distinct(labels):
    """Return the number of distinct values in a label array."""
    return len(set(labels.flatten()))


# Class counts are the number of distinct training labels plus 2; they must
# be computed before the labels are replaced by their one-hot form.
model_param['intent_nums'] = _count_distinct(train_intent_y) + 2
model_param['slot_label_nums'] = _count_distinct(train_slot_y) + 2

train_slot_y = keras.utils.to_categorical(
    train_slot_y, num_classes=model_param['slot_label_nums'])
train_intent_y = keras.utils.to_categorical(
    train_intent_y, num_classes=model_param['intent_nums'])

# The validation split is encoded with the class counts of the training split.
valid_X, valid_slot_y, valid_intent_y = valid_processor.get_data()
valid_slot_y = keras.utils.to_categorical(
    valid_slot_y, num_classes=model_param['slot_label_nums'])
valid_intent_y = keras.utils.to_categorical(
    valid_intent_y, num_classes=model_param['intent_nums'])
这儿把训练集和验证集中的slot标签和intent标签都转换成独热编码,其中训练集的train_slot_y和train_intent_y形状分别变为(13084,20,75)和(13084,9)。
建立模型
主函数中为
# Build the Keras model from the collected hyper-parameters.
model = SlotGatedSLU(model_param).build()
进入build函数
class SlotGatedSLU(object):
    """Slot-Gated SLU model (joint intent detection and slot filling) in Keras.

    Reference: https://www.aclweb.org/anthology/N18-2118/
    """

    def __init__(self, params):
        """Store the hyper-parameter dict that the builder methods read."""
        super(SlotGatedSLU, self).__init__()
        self._params = params

    def build(self):
        """Assemble the network and return it as a ``keras.models.Model``.

        The model takes one integer sequence of length ``params['maxlen']``
        and has two outputs, in this order: ``intent_out`` and ``slot_out``.
        """
        cfg = self._params

        # Encoder: embedding -> spatial dropout -> BiLSTM.
        tokens = keras.layers.Input(
            shape=(cfg['maxlen'],),
            name='seq_input',
        )
        embedded = self._make_embedding_layer(embed_type='char')(tokens)
        embedded = keras.layers.SpatialDropout1D(0.1, name='embed_drop')(embedded)

        encoder = keras.layers.Bidirectional(
            keras.layers.LSTM(
                cfg['lstm_units'],
                dropout=cfg['lstm_dropout_rate'],
                return_sequences=True,
                return_state=True,
            ),
            name='bilstm_encoder',
        )
        # With return_state=True the wrapper yields the per-step outputs plus
        # (h, c) of the forward and of the backward LSTM.
        encoded_seq, _fw_h, fw_cell, _bw_h, bw_cell = encoder(embedded)

        # Intent branch: final cell states concatenated with the attention
        # summary of the encoder outputs.
        final_cells = keras.layers.concatenate(
            [fw_cell, bw_cell],
            name='final_state',
        )
        intent_ctx = self._apply_intent_attn(encoded_seq)
        intent_feats = keras.layers.concatenate(
            [final_cells, intent_ctx],
            name='intent_feats',
        )
        intent_hidden = keras.layers.Dense(
            cfg['intent_dense_size'],
            activation="relu",
            name="intent_dense",
        )(intent_feats)
        intent_out = keras.layers.Dense(
            cfg['intent_nums'],
            activation="softmax",
            name="intent_out",
        )(intent_hidden)

        # Slot branch: the slot context is either the self-attention output
        # or the raw encoder outputs, gated by the intent context.
        if cfg['full_attention']:
            slot_ctx = self._apply_slot_attn(encoded_seq)
        else:
            slot_ctx = encoded_seq
        slot_feats = self._apply_slot_gate(encoded_seq, slot_ctx, intent_ctx)

        slot_feats = keras.layers.TimeDistributed(
            keras.layers.Dropout(0.2),
            name='slot_feats_drop',
        )(slot_feats)
        slot_hidden = keras.layers.TimeDistributed(
            keras.layers.Dense(cfg['slot_dense_size'], activation='relu'),
            name='slot_dense',
        )(slot_feats)
        slot_out = keras.layers.TimeDistributed(
            keras.layers.Dense(cfg['slot_label_nums'], activation='softmax'),
            name='slot_out',
        )(slot_hidden)

        return keras.models.Model(
            inputs=tokens,
            outputs=[intent_out, slot_out],
        )

    def _make_embedding_layer(self, name='embedding', embed_type='char', **kwargs):
        """Create the trainable embedding layer.

        ``embed_type == "char"`` selects the ``char_*`` parameters, anything
        else the ``word_*`` ones. When the configured embedding matrix is not
        None it is passed to the layer as its initial weights.
        """
        use_char = embed_type == "char"

        if use_char:
            pretrained = self._params['char_embedding_matrix']
        else:
            pretrained = self._params['word_embedding_matrix']

        if use_char:
            vocab_size = self._params['char_max_features']
            embed_dim = self._params['char_embed_size']
        else:
            vocab_size = self._params['word_max_features']
            embed_dim = self._params['word_embed_size']

        return keras.layers.Embedding(
            input_dim=vocab_size,
            output_dim=embed_dim,
            trainable=True,
            name=name,
            weights=None if pretrained is None else [pretrained],
            **kwargs)

    def _apply_intent_attn(self, inputs):
        """Summarise the encoder outputs with the IntentAttention layer."""
        attention_layer = IntentAttention(self._params['maxlen'], name='intent_attn')
        return attention_layer(inputs)

    def _apply_slot_attn(self, inputs):
        """Run a same-width ReLU layer on every time step, then SlotAttention."""
        feature_dim = K.int_shape(inputs)[2]
        position_wise_ffn = keras.layers.TimeDistributed(
            keras.layers.Dense(feature_dim, activation='relu'),
            name='slot_ffn',
        )
        projected = position_wise_ffn(inputs)
        return SlotAttention()(projected)

    def _apply_slot_gate(self, hi, slot_c, intent_c):
        """Concatenate the encoder states with the SlotGate output."""
        gated = SlotGate(name='slot_gate')([slot_c, intent_c])
        return keras.layers.concatenate(
            [hi, gated],
            name='slot_feats',
        )
首先是输入层
# Quoted from SlotGatedSLU.build: the model input, one sequence of maxlen entries.
seq_input = keras.layers.Input(
name='seq_input',
shape=(self._params['maxlen'],)
)
debug结果为
可以看到batch大小未知(显示为None),每个样本的长度为20
然后进入_make_embedding_layer函数,_make_embedding_layer内部还包含这一个函数init_embedding,因为设置的是embedding_matrix为空,故该函数用于创建每个token的embedding。
init_embedding函数通过keras.layers.Embedding返回
return keras.layers.Embedding(
input_dim = input_dim,
output_dim = output_dim,
trainable = True,
name = name,
weights = weights,
**kwargs)
简单分析一下这个语句,input_dim指的是总词汇量,此处总词汇量设置为11250,实际上有11241.output_dim指的是每个token,即每个单词embedding后的维度,这儿设置为200,trainable说明embedding参数是可训练参数。name是该层的名字,设置为’embedding’,weights为None。实际上这儿总的可训练参数变为了11250*200=2250000.
返回到build函数中
# Quoted from SlotGatedSLU.build: create the embedding layer and apply it to seq_input.
x = self._make_embedding_layer(embed_type='char')(seq_input)
该语句调用了层对象的__call__方法(Keras的层对象是可调用对象),把seq_input作为输入序列。seq_input形状为(?,20),其中?(即None)代表batch大小,由后面的model.fit给出;20代表每一句话经过padding后的长度,相当于在keras.layers.Embedding中设置了input_length=20.
此时x维度变为
再经过
# Quoted from SlotGatedSLU.build: SpatialDropout1D with rate 0.1 on the embedded sequence.
x = keras.layers.SpatialDropout1D(
0.1,
name='embed_drop')(x)
此处是对embedding层的输出做SpatialDropout1D,失活率为0.1。它不是随机丢弃单个元素,而是随机把整个特征通道(embedding的某一维在所有时间步上的值)置零;作用对象是该层的输出,而不是可训练参数。该层命名为'embed_drop'。此时x维度不变
接下来是
# Quoted from SlotGatedSLU.build: the BiLSTM encoder. It returns the per-step
# outputs followed by the (h, c) states of the forward and backward LSTM.
state_outputs,_fw,fw_final_c,_bw,bw_final_c = keras.layers.Bidirectional(
keras.layers.LSTM(
self._params['lstm_units'],
dropout=self._params['lstm_dropout_rate'],
return_sequences=True,
return_state=True,
),
name='bilstm_encoder'
)(x)
其中, keras.layers.LSTM的参数解释为
第一个参数lstm_units 表示LSTM层,每一个时间步输出向量的维度
dropout为随机失活比例,取值为lstm_dropout_rate,作用在LSTM输入的线性变换上(按该比例随机丢弃输入单元)
return_sequences:返回每个时间步的hidden state
return_state :返回最后一个时间步的hidden state 和cell state
return_sequences 和 return_state :可同时使用,三者都输出
在keras.layers.Bidirectional中,其库里的源码为
def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):
可以看出默认模式为’concat’。
最后 state_outputs,_fw,fw_final_c,_bw,bw_final_c 的状态分别为
state_outputs的 最后一个维度变为了256,这是由于双向LSTM要拼接前向LSTM和后向LSTM的结果。
Ps:这儿我有一点不明白的地方,按理说应该变成(None,20,256),第二个参数应该是时间步,且由输入确定了为20.
然后
# Quoted from SlotGatedSLU.build: the slot branch reuses the per-step encoder
# outputs; the intent branch starts from the two final cell states.
slot_inputs = state_outputs #[batch_size,maxlen,2*lstm_units]
intent_input = keras.layers.concatenate(
[fw_final_c,bw_final_c],
name='final_state'
) #[batch_size,2*lstm_units]
把双向LSTM的编码结果赋值给slot_inputs ,再把双向LSTM的最后一步的细胞状态拼接,赋值给intent_input
意图识别模型
接下来是本论文的关键之一,意图识别模型
# Quoted from SlotGatedSLU.build: attention-pooled summary of the encoder outputs.
intent_attn = self._apply_intent_attn(state_outputs) #[batch_size,features_dim] per IntentAttention.compute_output_shape
跳转到_apply_intent_attn函数内
def _apply_intent_attn(self,inputs):
    """Summarise the encoder outputs with the IntentAttention layer."""
    attention_layer = IntentAttention(self._params['maxlen'], name='intent_attn')
    return attention_layer(inputs)
再跳转到IntentAttention函数内
class IntentAttention(Layer):
    """Attention pooling over the time axis of a 3D input.

    For an input ``x`` of shape (batch, steps, features) the layer computes
    a score ``e = tanh(x . W + b)`` per step, normalises ``exp(e)`` over the
    steps, and returns the weighted sum of the step vectors, i.e. one vector
    of size ``features`` per sample.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Configure the layer.

        Args:
            step_dim: Number of time steps of the input; used to reshape the
                scores in ``call``.
            W_regularizer: Regularizer for the scoring vector ``W``.
            b_regularizer: Regularizer for the per-step bias ``b``.
            W_constraint: Constraint for ``W``.
            b_constraint: Constraint for ``b``.
            bias: Whether to add the per-step bias to the scores.
            **kwargs: Forwarded to the base ``Layer``.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)
        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build()
        super(IntentAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, optionally, ``b`` (one per step)."""
        assert len(input_shape) == 3
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]
        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim
        # Flatten (batch, steps) so a single dot product scores every step,
        # then restore the (batch, steps) layout.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                              K.reshape(self.W, (features_dim, 1))), (-1, step_dim))
        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            # Masked steps get zero weight.
            a *= K.cast(mask, K.floatx())
        # epsilon guards against division by zero when every weight is 0.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """The time axis is summed away: (batch, features)."""
        return input_shape[0], self.features_dim

    def get_config(self):
        """Return every constructor argument so the layer can be re-created.

        Previously only ``step_dim`` was stored, so ``bias``, the
        regularizers and the constraints were lost when a model was saved
        and loaded again.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(IntentAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
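这部分attention代码是常见的Keras自定义注意力层写法,这里只说明它的作用:先对每个时间步计算打分 $e_t = \tanh(x_t \cdot W + b_t)$,再归一化得到权重 $a_t = \exp(e_t) / \sum_k \exp(e_k)$,最后对各时间步的编码加权求和 $\sum_t a_t x_t$。总之,它把整个序列压缩成一个向量,输出形状为(batch_size, features_dim),而不是一个序列。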
这之后就是
# Quoted from SlotGatedSLU.build: the intent head. The final cell states and
# the attention summary are concatenated, then passed through a ReLU dense
# layer and a softmax layer with intent_nums outputs.
intent_feats = keras.layers.concatenate(
[intent_input,intent_attn],
name='intent_feats'
)
intent_dense = keras.layers.Dense(
self._params['intent_dense_size'],
activation="relu",
name="intent_dense"
)(intent_feats)
intent_out = keras.layers.Dense(
self._params['intent_nums'],
activation="softmax",
name="intent_out"
)(intent_dense)
这部分代码对应论文中的公式为
其中论文中公式是intent_input向量与intent_attn向量相加,代码中是将他们进行拼接,维度变成512,而不是向量相加。
然后是接全连接层,激活函数为Relu,隐含层神经元个数设置为intent_dense_size(256),对应到公式中的矩阵,因为是relu函数,直接用一个矩阵与向量作內积好像也没什么毛病
最后是接一个全连接层,其输出维度设置为self._params[‘intent_nums’],即意图个数,激活函数选用softmax。
槽位填充模型
槽位填充模型是本论文的核心部分
# Quoted from SlotGatedSLU.build: with full_attention the slot gate receives
# the slot self-attention output, otherwise the raw encoder outputs.
if self._params['full_attention']:
slot_attn_out = self._apply_slot_attn(slot_inputs)
slot_feats = self._apply_slot_gate(
state_outputs,slot_attn_out,intent_attn)
else:
slot_feats = self._apply_slot_gate(
state_outputs,slot_inputs,intent_attn)
slot_attn_out 的生成和intent_attn有一点不一样,在_apply_slot_attn内部
def _apply_slot_attn(self,inputs):
    """Run a same-width ReLU layer on every time step, then SlotAttention."""
    # The feed-forward layer keeps the feature size of the encoder outputs.
    feature_dim = K.int_shape(inputs)[2]
    position_wise_ffn = keras.layers.TimeDistributed(
        keras.layers.Dense(feature_dim, activation='relu'),
        name='slot_ffn',
    )
    projected = position_wise_ffn(inputs)
    return SlotAttention()(projected)
先确定slot_attn_ffn_size 的值为256,然后用keras.layers.TimeDistributed 函数
keras.layers.TimeDistributed函数的用法 在这儿可以简单理解为对每个时间步的编码向量同时连接一个全连接层。这儿的时间步应该为20,尽管实参inputs其实指代的是slot_inputs,而slot_inputs的形状为
但是由于前面代码分析,索引为一的位置理应为20,这儿不知道为什么显示的为None
TimeDistributed函数和前面的Bidirectional函数都属于层封装器 wrappers,把某层当做其参数。官方文档所说的是
这个封装器将一个层应用于输入的每个时间片。
输入至少为 3D,且第一个维度应该是时间所表示的维度。
第一个维度这儿略微有点歧义,代码中inputs的第一个维度其实是batch,这儿理解为索引为1的维度。这儿应该是keras中英文文档的例子差异,建议观看英文的keras文档。
总之,经过TimeDistributed后,slot_ffn 的形状为
这儿时间步为20.
接下来就是求slot_atten ,这是求槽位信息的self-attention。进入到SlotAttention()
class SlotAttention(object):
    """Dot-product self-attention over an encoded sentence, built from two Lambda layers.

    Reference: https://www.aclweb.org/anthology/N18-2118/
    """

    def __call__(self, inputs):
        """Apply the attention to ``inputs`` and return the aligned sequence."""
        scores = keras.layers.Lambda(
            self._attention,
            output_shape=self._attention_output_shape,
            arguments=None,
            name='slot_attn_weights',
        )(inputs)
        aligned = keras.layers.Lambda(
            self._soft_alignment,
            output_shape=self._soft_alignment_output_shape,
            arguments=None,
            name='slot_attn',
        )([scores, inputs])
        return aligned

    def _attention(self, inputs):
        """Return the pairwise dot products between the positions of a sentence.

        Args:
            inputs: The encoded sentence tensor.

        Returns:
            The batch-wise product of ``inputs`` with its own transpose,
            with its last two axes swapped.
        """
        transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1))
        scores = K.batch_dot(x=inputs, y=transposed)
        return K.permute_dimensions(scores, (0, 2, 1))

    def _attention_output_shape(self, inputs):
        """Shape of the score tensor: axis 1 of the input shape, repeated twice."""
        batch, size = inputs[0], inputs[1]
        return (batch, size, size)

    def _soft_alignment(self, inputs):
        """Softmax-normalise the scores and use them to mix the sentence.

        Args:
            inputs: A two-element list: the attention scores and the encoded
                sentence they refer to.

        Returns:
            The batch-wise product of the normalised scores with the sentence.
        """
        scores, sentence = inputs[0], inputs[1]
        # Shift by the row maximum before exponentiating to avoid overflow.
        shifted = scores - K.max(scores, axis=-1, keepdims=True)
        unnormalised = K.exp(shifted)
        weights = unnormalised / K.sum(unnormalised, axis=-1, keepdims=True)
        return K.batch_dot(weights, sentence)

    def _soft_alignment_output_shape(self, inputs):
        """Shape of the aligned tensor: score rows by sentence feature size."""
        score_shape, sentence_shape = inputs[0], inputs[1]
        return (score_shape[0], score_shape[1], sentence_shape[2])
此处分析一下该代码,首先是执行__call__方法。定义了__call__方法的对象可以像函数一样直接传入实参调用,实际上就是执行对象内部的__call__方法,使得对象用起来更像一个函数。
首先执行keras.layers.Lambda函数,该函数的第一个参数必须是一个函数的地址,以此作为Lambda层。头一个参数和output_shape 可以理解为执行了
详情参考link
_attention(inputs)
_attention_output_shape(inputs)
其中,_attention函数内 K.batch_dot是对两个张量作內积,K.permute_dimensions重新排列了张量的轴。link
_soft_alignment函数先对注意力权重在最后一维上做softmax(先减去最大值以避免指数溢出),再用归一化后的权重与句子编码做batch_dot,也就是对各时间步的编码做加权求和,得到每个位置的上下文向量,这就是所谓的“软对齐”。原注释是Compute the soft alignment between the elements of two sentences。
回到_apply_slot_attn函数中
slot_atten的维度为
回到build函数中, slot_feats = self._apply_slot_gate ,进入到_apply_slot_gate
def _apply_slot_gate(self,hi,slot_c,intent_c):
    """Concatenate the encoder states with the SlotGate output."""
    gated = SlotGate(name='slot_gate')([slot_c, intent_c])
    return keras.layers.concatenate(
        [hi, gated],
        name='slot_feats',
    )
运行之,得到各个变量的维度
至此,完成了下式(在SlotGate函数中实现的)
和(由keras.layers.concatenate实现的)
# Quoted from SlotGatedSLU.build: the slot head, applied to every time step.
# Dropout (0.2), then a ReLU dense layer, then a softmax over slot_label_nums.
slot_feats_drop = keras.layers.TimeDistributed(
keras.layers.Dropout(0.2),
name='slot_feats_drop'
)(slot_feats)
slot_dense = keras.layers.TimeDistributed(
keras.layers.Dense(
self._params['slot_dense_size'],
activation='relu'
),
name='slot_dense'
)(slot_feats_drop)
slot_out = keras.layers.TimeDistributed(
keras.layers.Dense(
self._params['slot_label_nums'],
activation='softmax'
),
name='slot_out'
)(slot_dense)
接下来就和intent模型一致了,先随机失活,再接全连接层和softmax层
完成了
训练模型
回到主函数中
# Both outputs use categorical cross-entropy. The total loss is the weighted
# sum 1.0 * slot_out loss + 0.5 * intent_out loss.
model.compile(
    optimizer='adam',
    loss={'slot_out': 'categorical_crossentropy', 'intent_out': 'categorical_crossentropy'},
    loss_weights={'slot_out': 1.0, 'intent_out': 0.5},
    metrics={'intent_out': 'accuracy'}
)
# Model.summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
model.summary()

# Halve the learning rate after 4 epochs without improvement of the
# validation slot loss.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_slot_out_loss',
    factor=0.5,
    patience=4,
    verbose=1)
# Stop training after 8 epochs without improvement of the same quantity.
earlystop = keras.callbacks.EarlyStopping(
    monitor='val_slot_out_loss',
    patience=8,
    verbose=2,
    mode='min'
)
bast_model_filepath = './model_file/slotgate_model.h5'
# Make sure the checkpoint directory exists before training starts; not every
# Keras version creates it when saving.
os.makedirs(os.path.dirname(bast_model_filepath), exist_ok=True)
# Keep only the weights with the lowest validation slot loss.
checkpoint = keras.callbacks.ModelCheckpoint(
    bast_model_filepath,
    monitor='val_slot_out_loss',
    verbose=1,
    save_best_only=True,
    mode='min'
)
H = model.fit(
    x=train_X,
    y={"slot_out": train_slot_y, "intent_out": train_intent_y},
    validation_data=(
        valid_X,
        {"slot_out": valid_slot_y, "intent_out": valid_intent_y}
    ),
    batch_size=arg.batch_size,
    epochs=arg.max_epochs,
    callbacks=[reduce_lr, earlystop, checkpoint]
)
这一部分是固定套路。reduce_lr 是设置学习率衰减策略,注意model.compile里的loss,在设置损失函数的时候,可以是字符串形式,可以是目标函数形式,这儿是字符串形式,由于model有两个输出,因此写成了字典形式。loss_weights用于给各个输出的损失加权,模型最终优化的总损失是各输出损失的加权和,即 总损失 = 1.0 × slot_out的损失 + 0.5 × intent_out的损失。
# NOTE(review): the line below is commented out, so the prediction uses the
# weights of the last epoch rather than the saved best checkpoint -- confirm
# that this is intended.
# model.load_weights(bast_model_filepath)
# Read the test split and predict; the model returns the intent output first
# and the slot output second.
test_X, test_slot_y, test_intent_y = test_processor.get_data()
intent_pred,slot_pred = model.predict(test_X)
训练完成后,开始预测,各个变量形状如下,注意test_slot_y,test_intent_y 没有独热表示,而预测结果是每个类别上的softmax概率(形状与独热表示相同),需要用argmax转成类别id
# Turn the class probabilities into class ids and compare them with the gold
# intent ids; the accuracy is reported as a percentage.
intent_pred = np.argmax(intent_pred, axis=1)
intent_hits = intent_pred == test_intent_y
intent_accuracy = 100.0 * np.mean(intent_hits)
print("\n\n%s 数据集意图准确率:" % arg.dataset,intent_accuracy)
既然真实值没有独热表示,那么要把预测结果转化成真实值的数值表示形式,计算准确率。
# 槽位
# Slots
from metrics import *
tag2id = slot_vocab['vocab']
id2tag = {v: k for k, v in tag2id.items()}
y_true, y_pred = [], []
for true_ids, probs in zip(test_slot_y, slot_pred):
    pred_ids = np.argmax(probs, axis=1)
    # Keep exactly the positions whose gold id is not 0 (the original code
    # treats id 0 as padding) and apply that one mask to both sequences.
    # Filtering the predictions by their own value could drop or keep
    # different positions and misalign the two tag lists.
    keep = [pos for pos, true_id in enumerate(true_ids) if true_id != 0]
    y_true.append([id2tag[true_ids[pos]] for pos in keep])
    y_pred.append([id2tag[pred_ids[pos]] for pos in keep])
tag2id ,id2tag ,很好理解,是从词汇表中提取的。
tag2id如下图
id2tag如下图
在for循环中把slot_pred也变为数值表示形式,形状为(700,20),但是此时无论是test_slot_y还是slot_pred,他们都是数字,我们接下来通过for循环把数字转换成slot,这一操作是通过查找id2tag字典完成的。
计算各项评测指标
# Overall scores first, then the per-slot report.
f1 = f1_score(y_true, y_pred, suffix=False)
p = precision_score(y_true, y_pred, suffix=False)
r = recall_score(y_true, y_pred, suffix=False)
acc = accuracy_score(y_true, y_pred)
summary_template = "\nf1_score: {:.4f}, precision_score: {:.4f}, recall_score: {:.4f}, accuracy_score: {:.4f}"
print(summary_template.format(f1, p, r, acc))
per_slot_report = classification_report(y_true, y_pred, digits=4, suffix=False)
print(per_slot_report)
首先看f1值
def f1_score(y_true: object, y_pred: object, average: object = 'micro', suffix: object = False) -> object:
    """Return the entity-level F1 score of a tagging result.

    Entities are extracted from both label sequences with ``get_entities``.
    An entity counts as correct only if the same (type, start, end) triple
    occurs in both. Precision and recall are combined as::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.
        average : not used by this implementation.
        suffix : forwarded to ``get_entities``.

    Returns:
        score : float (0 when precision + recall is 0).

    Example:
        # >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> f1_score(y_true, y_pred)
        0.50
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    hits = len(gold & predicted)  # entities found in both sets
    precision = hits / len(predicted) if len(predicted) > 0 else 0
    recall = hits / len(gold) if len(gold) > 0 else 0

    if precision + recall > 0:
        return 2 * precision * recall / (precision + recall)
    return 0
再进入到get_entities函数
def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels, or a list of such sequences. Nested
            input is flattened, with an ``'O'`` inserted after every
            sub-list so entities cannot span two sentences.
        suffix (bool): when true the tag letter is the last character of a
            label and the type precedes the first ``'-'``; otherwise the tag
            letter is the first character and the type follows the last
            ``'-'``.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        # >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        # >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # for nested list
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ['O']]

    prev_tag = 'O'
    prev_type = ''
    begin_offset = 0
    chunks = []
    # The extra 'O' at the end closes an entity that runs to the last label.
    for i, chunk in enumerate(seq + ['O']):
        if len(chunk) == 0:
            # An empty label counts as outside, for both label layouts.
            # Before, this case was only handled when suffix was false and
            # raised IndexError otherwise.
            tag = 'O'
            type_ = 'O'
        elif suffix:
            tag = chunk[-1]
            type_ = chunk.split('-')[0]
        else:
            tag = chunk[0]
            type_ = chunk.split('-')[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i-1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks
首先判断seq里是否存在list类型的元素(即seq是否为嵌套列表),只要存在一个就为True;接下来通过列表推导式,在每一个子列表后面补一个'O'标签(是字母O,不是数字零,用来隔开相邻句子,防止实体跨句),再把它们拼接成一个大列表。
在这个大列表基础上查找实体元组,正如注释所说
Example:
# >>> from seqeval.metrics.sequence_labeling import get_entities
# >>> seq = [‘B-PER’, ‘I-PER’, ‘O’, ‘B-LOC’]
# >>> get_entities(seq)
[(‘PER’, 0, 1), (‘LOC’, 3, 3)]
元组类型为 实体类型+起始位置+结束位置。
返回到f1_score函数中,用set把返回结果去重,得到2个set类型的对象
# Quoted from f1_score: entity sets of the gold and the predicted labels.
true_entities = set(get_entities(y_true, suffix))
pred_entities = set(get_entities(y_pred, suffix))
nb_correct = len(true_entities & pred_entities) # size of the intersection of the two sets
再对两个set对象求交集,确定预测实体元组中预测正确的个数
# Quoted from f1_score: precision, recall and F1, each 0 when its denominator is 0.
p = nb_correct / nb_pred if nb_pred > 0 else 0
r = nb_correct / nb_true if nb_true > 0 else 0
score = 2 * p * r / (p + r) if p + r > 0 else 0
p求的是精确率,为预测正确正样本个数占预测为正样本个数之比
r求的是召回率,为预测正确的正样本个数与实际正样本个数之比
f1是精确率和召回率的调和平均值,即 $F_1 = \frac{2pr}{p + r}$。
f1求出来,返回到主函数中,precision_score和recall_score也就求出来了,接下来求accuracy_score
def accuracy_score(y_true, y_pred):
    """Return the token-level accuracy of a tagging result.

    If ``y_true`` contains lists, both arguments are flattened first. The
    score is the fraction of positions of ``y_true`` whose label equals the
    label at the same position of ``y_pred``; every label counts, including
    ``'O'``.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float, or 0 when ``y_true`` holds no labels (this used to
        raise ZeroDivisionError).

    Example:
        # >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> accuracy_score(y_true, y_pred)
        0.80
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_true = len(y_true)
    if nb_true == 0:
        # Same convention as the zero-denominator cases in f1_score.
        return 0

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    return nb_correct / nb_true
accuracy_score求法与f1_score类似,但是它求的是基于每个token上的准确率,该准确率也包含了“O”.
输出结果为
f1_score: 0.7092, precision_score: 0.6875, recall_score: 0.7324, accuracy_score: 0.8398
最后计算每个slot的分数,在classification_report函数中,digits=4意思是保留4位小数,返回结果类型为
precision recall f1-score support
spatial_relation 0.7432 0.7746 0.7586 71
playlist 0.6667 0.7287 0.6963 129
music_item 0.7757 0.7981 0.7867 104
city 0.6769 0.7333 0.7040 60
artist 0.6436 0.6075 0.6250 107
object_type 0.7607 0.7654 0.7631 162
timeRange 0.7193 0.7664 0.7421 107
movie_name 0.4706 0.5106 0.4898 47
object_select 0.6829 0.7000 0.6914 40
cuisine 0.8889 0.5714 0.6957 14
party_size_number 0.8200 0.8200 0.8200 50
genre 0.3333 0.4000 0.3636 5
sort 0.8438 0.8438 0.8438 32
album 0.0000 0.0000 0.0000 10
rating_value 0.7531 0.7625 0.7578 80
state 0.7586 0.7458 0.7521 59
object_name 0.4486 0.5646 0.5000 147
condition_description 0.7667 0.8214 0.7931 28
playlist_owner 0.8194 0.8429 0.8310 70
rating_unit 0.7750 0.7750 0.7750 40
entity_name 0.4000 0.5455 0.4615 33
movie_type 0.9394 0.9394 0.9394 33
year 0.6923 0.7500 0.7200 24
best_rating 0.7727 0.7907 0.7816 43
country 0.7857 0.7500 0.7674 44
condition_temperature 0.7826 0.7826 0.7826 23
restaurant_type 0.8438 0.8308 0.8372 65
location_name 0.8400 0.8750 0.8571 24
party_size_description 0.3846 0.5000 0.4348 10
poi 0.6000 0.7500 0.6667 8
service 0.9583 0.9583 0.9583 24
object_location_type 0.9091 0.9091 0.9091 22
track 0.2222 0.4444 0.2963 9
restaurant_name 0.4500 0.6000 0.5143 15
current_location 0.9333 1.0000 0.9655 14
served_dish 0.4500 0.7500 0.5625 12
geographic_poi 0.4286 0.5455 0.4800 11
object_part_of_series_type 0.6154 0.7273 0.6667 11
facility 0.6667 0.6667 0.6667 3
micro avg 0.6875 0.7324 0.7092 1790
macro avg 0.7014 0.7324 0.7148 1790
Process finished with exit code 0
再一次训练为
snips 数据集意图准确率: 97.0
f1_score: 0.8593, precision_score: 0.8346, recall_score: 0.8855, accuracy_score: 0.9359
precision recall f1-score support
timeRange 0.8654 0.8411 0.8531 107
artist 0.7500 0.7009 0.7246 107
party_size_description 0.7273 0.8000 0.7619 10
poi 0.6000 0.7500 0.6667 8
sort 0.9677 0.9375 0.9524 32
party_size_number 0.9804 1.0000 0.9901 50
entity_name 0.4884 0.6364 0.5526 33
state 0.9286 0.8814 0.9043 59
rating_value 0.9756 1.0000 0.9877 80
playlist 0.7616 0.8915 0.8214 129
object_type 0.9448 0.9506 0.9477 162
rating_unit 1.0000 1.0000 1.0000 40
object_name 0.6353 0.7347 0.6814 147
city 0.7576 0.8333 0.7937 60
music_item 0.9455 1.0000 0.9720 104
current_location 1.0000 1.0000 1.0000 14
year 1.0000 1.0000 1.0000 24
spatial_relation 0.9189 0.9577 0.9379 71
country 0.8039 0.9318 0.8632 44
service 0.9600 1.0000 0.9796 24
movie_name 0.6250 0.6383 0.6316 47
movie_type 1.0000 1.0000 1.0000 33
restaurant_type 0.9692 0.9692 0.9692 65
location_name 0.8846 0.9583 0.9200 24
object_location_type 1.0000 1.0000 1.0000 22
restaurant_name 0.4762 0.6667 0.5556 15
object_select 0.9756 1.0000 0.9877 40
playlist_owner 0.9286 0.9286 0.9286 70
best_rating 1.0000 0.9767 0.9882 43
object_part_of_series_type 1.0000 1.0000 1.0000 11
condition_description 0.9655 1.0000 0.9825 28
geographic_poi 0.7778 0.6364 0.7000 11
cuisine 0.7143 0.7143 0.7143 14
served_dish 0.6111 0.9167 0.7333 12
condition_temperature 1.0000 1.0000 1.0000 23
album 0.0714 0.1000 0.0833 10
track 0.1852 0.5556 0.2778 9
genre 0.5714 0.8000 0.6667 5
facility 1.0000 1.0000 1.0000 3
micro avg 0.8346 0.8855 0.8593 1790
macro avg 0.8502 0.8855 0.8657 1790
Process finished with exit code 0
基本达到了论文中的结果。
解析中遇到的问题
1.门控单元的建立没看懂
2.debug时点击step into my code,经常遇到如下情况
左下角,从__call__,modules.py跳到_attention,modules.py中,中间突然多了两个灰色层,跳过去的时候不进入灰色层,返回时进入了灰色层,这是为什么