代码解析

最新推荐文章于 2021-10-07 16:44:24 发布

Meloneating

最新推荐文章于 2021-10-07 16:44:24 发布

阅读量1.4k

点赞数 1

分类专栏： keras

本文链接：https://blog.csdn.net/Meloneating/article/details/116593067

版权

keras 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

Slot-Gated Modeling for Joint Slot Filling and Intent Prediction代码解析

预处理
主函数
建立模型
- 意图识别模型
- 槽位填充模型
训练模型
解析中遇到的问题

预处理

def createVocabulary(input_path, output_path, no_pad=False):
    """Build a frequency-sorted vocabulary file from a whitespace-tokenised text file.

    Every line of ``input_path`` is split on whitespace. Tokens made up only
    of digits are counted under the single token ``'0'``; when the token
    ``'_UNK'`` is met, the rest of that line is ignored. The vocabulary is
    written to ``output_path`` one token per line, most frequent first,
    preceded by ``'_PAD'`` and ``'_UNK'`` (or only ``'_UNK'`` when ``no_pad``
    is set).

    Args:
        input_path: path of the text file to read.
        output_path: path of the vocabulary file to write.
        no_pad: when it compares equal to ``False`` the ``'_PAD'`` entry is
            emitted; otherwise only ``'_UNK'`` is.

    Raises:
        TypeError: if either path is not a ``str``.
    """
    if not isinstance(input_path, str):
        raise TypeError('input_path should be string')
    if not isinstance(output_path, str):
        raise TypeError('output_path should be string')

    counts = {}
    with open(input_path, 'r', encoding='utf8') as src, \
         open(output_path, 'w+', encoding='utf8') as dst:
        for raw_line in src:
            for token in raw_line.rstrip('\r\n').split():
                if token == '_UNK':
                    break
                key = '0' if str.isdigit(token) else token
                counts[key] = counts.get(key, 0) + 1

        # Descending frequency; the sort is stable, so ties keep the order
        # in which the tokens were first seen.
        by_frequency = sorted(counts, key=counts.get, reverse=True)
        # The explicit comparison is kept on purpose: only values equal to
        # False (False, 0) enable the '_PAD' entry.
        specials = ['_PAD', '_UNK'] if no_pad == False else ['_UNK']  # noqa: E712

        dst.writelines(entry + '\n' for entry in specials + by_frequency)

先创建一个名为vocab的字典，用来统计seq.out文件中每句话里每个词对应的slot标签出现的次数。
在这里插入图片描述
再根据slot出现次数降序排列，并在最前面加入['_PAD', '_UNK']，写入到输出文件slot_vocab中

# Load the three vocabularies (input tokens, slot labels, intent labels)
# from files under arg.vocab_path. Each call returns a dict with the keys
# 'vocab' (token -> index) and 'rev' (index -> token list).
in_vocab = loadVocabulary(os.path.join(arg.vocab_path, 'in_vocab'))
slot_vocab = loadVocabulary(os.path.join(arg.vocab_path, 'slot_vocab'))
intent_vocab = loadVocabulary(os.path.join(arg.vocab_path, 'intent_vocab'))

def loadVocabulary(path):
    """Load a vocabulary file that holds one token per line.

    Args:
        path: path of the vocabulary file (UTF-8).

    Returns:
        A dict with two entries: ``'rev'`` is the list of tokens in file
        order, and ``'vocab'`` maps each token to its line index. If a token
        occurs more than once, the last occurrence wins.

    Raises:
        TypeError: if ``path`` is not a ``str``.
    """
    if not isinstance(path, str):
        raise TypeError('path should be a string')

    with open(path, encoding='utf8') as fd:
        rev = [raw_line.rstrip('\r\n') for raw_line in fd]

    vocab = {token: index for index, token in enumerate(rev)}
    return {'vocab': vocab, 'rev': rev}

返回一个字典，该字典有两个键值对，第一个键值对的值还是个字典，第二个键值对的值是个列表
以intent_vocab的结果为例
在这里插入图片描述

# One DataProcessor per data split. Each receives the utterance file, the
# slot-label file and the intent-label file of that split, the three shared
# vocabularies, and the fixed padded sequence length arg.maxlen.
train_processor = DataProcessor(
    os.path.join(full_train_path, arg.input_file), 
    os.path.join(full_train_path, arg.slot_file), 
    os.path.join(full_train_path, arg.intent_file), 
    in_vocab, slot_vocab, intent_vocab,
    arg.maxlen
    )

# Validation split.
valid_processor = DataProcessor(
    os.path.join(full_valid_path, arg.input_file), 
    os.path.join(full_valid_path, arg.slot_file), 
    os.path.join(full_valid_path, arg.intent_file), 
    in_vocab, slot_vocab, intent_vocab,
    arg.maxlen
    )

# Test split.
test_processor = DataProcessor(
    os.path.join(full_test_path, arg.input_file), 
    os.path.join(full_test_path, arg.slot_file), 
    os.path.join(full_test_path, arg.intent_file), 
    in_vocab, slot_vocab, intent_vocab,
    arg.maxlen
    )

class DataProcessor(object):
    """Read parallel utterance / slot / intent files and turn them into id arrays.

    The three files are expected to be line-aligned: line k of the slot file
    and of the intent file belong to line k of the utterance file.
    """

    def __init__(self, in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab,max_len):
        """Open the three data files and remember vocabularies and padding length.

        Args:
            in_path: path of the utterance file.
            slot_path: path of the slot-label file.
            intent_path: path of the intent-label file.
            in_vocab, slot_vocab, intent_vocab: vocabulary dicts passed on to
                ``sentenceToIds`` / ``padSentence``.
            max_len: length every sequence is padded to by ``padSentence``.
        """
        self.__fd_in = None
        self.__fd_slot = None
        self.__fd_intent = None
        try:
            self.__fd_in = open(in_path, 'r', encoding='utf8')
            self.__fd_slot = open(slot_path, 'r', encoding='utf8')
            self.__fd_intent = open(intent_path, 'r', encoding='utf8')
        except Exception:
            # Do not leak the handles that were already opened.
            self.close()
            raise
        self.__in_vocab = in_vocab
        self.__slot_vocab = slot_vocab
        self.__intent_vocab = intent_vocab
        self.max_len = max_len
        self.end = 0

    def close(self):
        """Close whichever of the three data files are open."""
        for fd in (self.__fd_in, self.__fd_slot, self.__fd_intent):
            if fd is not None:
                fd.close()

    def get_data(self):
        """Read every sample, convert it to ids and pad it.

        Returns:
            ``(in_data, slot_data, intents)``: the padded input ids, the
            padded slot-label ids and the intent id of each sample, all as
            numpy arrays.

        Raises:
            ValueError: if an utterance and its slot line have different
                token counts, or if an intent line is empty.

        The files are closed when this method returns or raises, so it can
        only be called once per instance.
        """
        intents = []     # intent label id per sample
        batch_in = []    # unpadded input id sequences
        batch_slot = []  # unpadded slot-label id sequences

        try:
            # Read until the utterance file is exhausted (no fixed sample cap).
            for line_no, inp in enumerate(self.__fd_in, start=1):
                inp = inp.rstrip()
                slot = self.__fd_slot.readline().rstrip()
                intent = self.__fd_intent.readline().rstrip()

                in_ids = sentenceToIds(inp, self.__in_vocab)
                slot_ids = sentenceToIds(slot, self.__slot_vocab)
                intent_ids = sentenceToIds(intent, self.__intent_vocab)

                if len(in_ids) != len(slot_ids):
                    # Misaligned data must fail loudly, not exit with status 0.
                    raise ValueError(
                        'line %d: %d input tokens but %d slot labels: %r / %r'
                        % (line_no, len(in_ids), len(slot_ids), inp, slot))
                if not intent_ids:
                    raise ValueError('line %d: empty intent label' % line_no)

                batch_in.append(np.array(in_ids))
                batch_slot.append(np.array(slot_ids))
                intents.append(intent_ids[0])
            self.end = 1

            in_data = np.asarray(
                [padSentence(list(i), self.max_len, self.__in_vocab) for i in batch_in])
            slot_data = np.asarray(
                [padSentence(list(s), self.max_len, self.__slot_vocab) for s in batch_slot])
            intents = np.asarray(intents)
        finally:
            self.close()

        return in_data, slot_data, intents

主函数

    train_X, train_slot_y, train_intent_y = train_processor.get_data()

跳入到get_data方法中

    def get_data(self):
        """Read every sample, convert it to ids and pad it.

        Returns:
            ``(in_data, slot_data, intents)``: the padded input ids, the
            padded slot-label ids and the intent id of each sample, all as
            numpy arrays.

        Raises:
            ValueError: if an utterance and its slot line have different
                token counts, or if an intent line is empty.

        The files are closed when this method returns or raises, so it can
        only be called once per instance.
        """
        intents = []     # intent label id per sample
        batch_in = []    # unpadded input id sequences
        batch_slot = []  # unpadded slot-label id sequences

        try:
            # Read until the utterance file is exhausted (no fixed sample cap).
            for line_no, inp in enumerate(self.__fd_in, start=1):
                inp = inp.rstrip()
                slot = self.__fd_slot.readline().rstrip()
                intent = self.__fd_intent.readline().rstrip()

                in_ids = sentenceToIds(inp, self.__in_vocab)
                slot_ids = sentenceToIds(slot, self.__slot_vocab)
                intent_ids = sentenceToIds(intent, self.__intent_vocab)

                if len(in_ids) != len(slot_ids):
                    # Misaligned data must fail loudly, not exit with status 0.
                    raise ValueError(
                        'line %d: %d input tokens but %d slot labels: %r / %r'
                        % (line_no, len(in_ids), len(slot_ids), inp, slot))
                if not intent_ids:
                    raise ValueError('line %d: empty intent label' % line_no)

                batch_in.append(np.array(in_ids))
                batch_slot.append(np.array(slot_ids))
                intents.append(intent_ids[0])
            self.end = 1

            in_data = np.asarray(
                [padSentence(list(i), self.max_len, self.__in_vocab) for i in batch_in])
            slot_data = np.asarray(
                [padSentence(list(s), self.max_len, self.__slot_vocab) for s in batch_slot])
            intents = np.asarray(intents)
        finally:
            self.close()

        return in_data, slot_data, intents

再跳入到

def sentenceToIds(data, vocab):
    """Map the words of a sentence to their vocabulary indices.

    Args:
        data: either a whitespace-separated string or a list of words.
        vocab: a dict whose ``'vocab'`` entry maps token -> index and
            contains the key ``'_UNK'``.

    Returns:
        A list with one index per word. Words made up only of digits are
        looked up as ``'0'``; words missing from the vocabulary get the
        index of ``'_UNK'``.

    Raises:
        TypeError: if ``vocab`` is not a dict, or ``data`` is neither a
            string nor a list.
    """
    if not isinstance(vocab, dict):
        raise TypeError('vocab should be a dict that contains vocab and rev')
    token_to_id = vocab['vocab']

    if isinstance(data, list):
        tokens = data
    elif isinstance(data, str):
        tokens = data.split()
    else:
        raise TypeError('data should be a string or a list contains words')

    return [
        token_to_id.get('0' if str.isdigit(token) else token, token_to_id['_UNK'])
        for token in tokens
    ]

方法中，该方法是把一句话中的每个单词转换成它在词汇表中的索引。其中，如果单词是纯数字形式的字符串，则统一替换为字符'0'，再去词汇表中查找'0'对应的索引（注意并不是'_PAD'）。
以下是词汇表
在这里插入图片描述
vocab.get(w, vocab['_UNK']) 方法，该方法会查找键w并返回该键对应的值；如果没有找到键w，则返回vocab['_UNK']，即'_UNK'在词汇表中的索引，此处为1。

回到get_data方法中，第一个for循环把输入文件中的每一句话，每一句话对应的槽位和意图分别放入到batch_in,batch_slots,intents的列表中，并确定了maxlen为35，超出了设置的20，因此第二个for循环就是把长度不足20的向后填充，超出20的向前截断。snip数据集中，共13084个样本。

返回到main方法中， train_X, train_slot_y, train_intent_y分别为2个秩为2的（13084,20）的ndarray数组和一个秩为1的数组。

    model_param['intent_nums'] = len(set(train_intent_y.flatten())) + 2
    model_param['slot_label_nums'] = len(set(train_slot_y.flatten())) + 2
    train_slot_y = keras.utils.to_categorical(train_slot_y,num_classes=model_param['slot_label_nums'])
    train_intent_y = keras.utils.to_categorical(train_intent_y,num_classes=model_param['intent_nums'])

    valid_X, valid_slot_y, valid_intent_y = valid_processor.get_data()
    valid_slot_y = keras.utils.to_categorical(valid_slot_y,num_classes=model_param['slot_label_nums'])
    valid_intent_y = keras.utils.to_categorical(valid_intent_y,num_classes=model_param['intent_nums'])

这儿把train_slot_y和train_intent_y转成独热编码，形状分别变为（13084,20,75）和（13084,9）；验证集的valid_slot_y和valid_intent_y也做同样的处理。

建立模型

主函数中为

    model = SlotGatedSLU(model_param).build()

进入build函数

class SlotGatedSLU(object):
    """Builder for a Keras slot-gated SLU model (joint intent / slot prediction).

    Paper: https://www.aclweb.org/anthology/N18-2118/

    All hyper-parameters are read from the ``params`` dict given to the
    constructor. Keys read in this class: 'maxlen', 'lstm_units',
    'lstm_dropout_rate', 'intent_dense_size', 'intent_nums',
    'full_attention', 'slot_dense_size', 'slot_label_nums',
    'char_max_features', 'char_embed_size', 'char_embedding_matrix',
    'word_max_features', 'word_embed_size', 'word_embedding_matrix'.
    """
    def __init__(self, params):
        super(SlotGatedSLU, self).__init__()
        self._params = params

    def build(self):
        """Assemble the network and return it as a ``keras.models.Model``.

        The model takes one input ('seq_input', shape ``(maxlen,)``) and has
        two outputs, in this order: 'intent_out' and 'slot_out'.
        """
        seq_input = keras.layers.Input(
            name='seq_input',
            shape=(self._params['maxlen'],)
            )
        x = self._make_embedding_layer(embed_type='char')(seq_input)
        x = keras.layers.SpatialDropout1D(
                0.1,
                name='embed_drop')(x)

        # With return_sequences and return_state both set, the wrapped BiLSTM
        # yields the per-step outputs plus four state tensors; only the two
        # named *_final_c are used below.
        # NOTE(review): presumably the order is (fw_h, fw_c, bw_h, bw_c) — confirm
        # against the Keras version in use.
        state_outputs,_fw,fw_final_c,_bw,bw_final_c = keras.layers.Bidirectional(
            keras.layers.LSTM(
                self._params['lstm_units'],
                dropout=self._params['lstm_dropout_rate'],
                return_sequences=True,
                return_state=True,
                ),
            name='bilstm_encoder'
            )(x)

        slot_inputs = state_outputs #[batch_size,maxlen,2*lstm_units]
        intent_input = keras.layers.concatenate(
            [fw_final_c,bw_final_c],
            name='final_state'
            ) #[batch_size,2*lstm_units]

        # Intent detection task
        # NOTE(review): the original comment gave this shape as [batch_size,maxlen];
        # IntentAttention sums over the time axis, so it looks like
        # [batch_size,2*lstm_units] — confirm.
        intent_attn = self._apply_intent_attn(state_outputs)
        intent_feats = keras.layers.concatenate(
            [intent_input,intent_attn],
            name='intent_feats'
            )
        intent_dense = keras.layers.Dense(
            self._params['intent_dense_size'], 
            activation="relu",
            name="intent_dense"
            )(intent_feats)
        intent_out = keras.layers.Dense(
            self._params['intent_nums'], 
            activation="softmax",
            name="intent_out"
            )(intent_dense)

        # Slot filling task
        # With 'full_attention' the slot gate receives the slot attention
        # output; otherwise it receives the raw BiLSTM outputs.
        if self._params['full_attention']:
            slot_attn_out = self._apply_slot_attn(slot_inputs)
            slot_feats = self._apply_slot_gate(
                state_outputs,slot_attn_out,intent_attn)
        else:
            slot_feats = self._apply_slot_gate(
                state_outputs,slot_inputs,intent_attn)
        slot_feats_drop = keras.layers.TimeDistributed(
            keras.layers.Dropout(0.2),
            name='slot_feats_drop'
            )(slot_feats)
        slot_dense = keras.layers.TimeDistributed(
            keras.layers.Dense(
                self._params['slot_dense_size'],
                activation='relu'
                ),
            name='slot_dense'
            )(slot_feats_drop)
        slot_out = keras.layers.TimeDistributed(
            keras.layers.Dense(
                self._params['slot_label_nums'],
                activation='softmax'
                ),
            name='slot_out'
            )(slot_dense)

        # Model
        model = keras.models.Model(
            inputs=seq_input, 
            outputs=[intent_out,slot_out]
            )
        return model

    def _make_embedding_layer(self,name='embedding',embed_type='char',**kwargs):
        """Create the trainable ``keras.layers.Embedding`` layer.

        ``embed_type`` selects which parameter set is used: 'char' reads the
        'char_*' entries of the params dict, anything else reads the 'word_*'
        entries. If the selected '*_embedding_matrix' entry is not None it is
        passed to the layer as its initial weights. Extra keyword arguments
        are forwarded to ``Embedding``.
        """

        def init_embedding(weights=None):
            if embed_type == "char":
                input_dim = self._params['char_max_features']
                output_dim = self._params['char_embed_size']
            else:
                input_dim = self._params['word_max_features']
                output_dim = self._params['word_embed_size']

            return keras.layers.Embedding(
                input_dim = input_dim,
                output_dim = output_dim,
                trainable = True,
                name = name,
                weights = weights,
                **kwargs)

        if embed_type == "char":
            embed_weights = self._params['char_embedding_matrix']
        else:
            embed_weights = self._params['word_embedding_matrix']

        if embed_weights is None:
            embedding = init_embedding()
        else:
            embedding = init_embedding(weights = [embed_weights])
        return embedding

    def _apply_intent_attn(self,inputs):
        """Apply an ``IntentAttention`` layer (step_dim = maxlen) to ``inputs``."""
        intent_attn = IntentAttention(self._params['maxlen'],name='intent_attn')(inputs)
        return intent_attn

    def _apply_slot_attn(self,inputs):
        """Apply a per-time-step ReLU dense layer, then ``SlotAttention``."""
        # Feed the BiLSTM outputs through a feed-forward layer that keeps the
        # feature dimension unchanged, then compute the attention.
        slot_attn_ffn_size = K.int_shape(inputs)[2]
        slot_ffn = keras.layers.TimeDistributed(
            keras.layers.Dense(
                slot_attn_ffn_size,
                activation='relu'
                ),
            name='slot_ffn'
            )(inputs)
        slot_atten = SlotAttention()(slot_ffn)
        return slot_atten

    def _apply_slot_gate(self,hi,slot_c,intent_c):
        """Compute the slot gate from ``[slot_c, intent_c]`` and concatenate it with ``hi``."""
        slot_gate = SlotGate(name='slot_gate')([slot_c,intent_c])
        slot_feats = keras.layers.concatenate(
            [hi,slot_gate],
            name='slot_feats'
            )
        return slot_feats

首先是输入层

        seq_input = keras.layers.Input(
            name='seq_input',
            shape=(self._params['maxlen'],)
            )

debug结果为
在这里插入图片描述
可以看到batch次数位置，维度为20

然后进入_make_embedding_layer函数，_make_embedding_layer内部还包含这一个函数init_embedding，因为设置的是embedding_matrix为空，故该函数用于创建每个token的embedding。

init_embedding函数通过keras.layers.Embedding返回

            return keras.layers.Embedding(
                input_dim = input_dim,
                output_dim = output_dim,
                trainable = True,
                name = name,
                weights = weights,
                **kwargs)

简单分析一下这个语句，input_dim指的是总词汇量，此处总词汇量设置为11250，实际上有11241.output_dim指的是每个token，即每个单词embedding后的维度，这儿设置为200，trainable说明embedding参数是可训练参数。name是该层的名字，设置为’embedding’，weights为None。实际上这儿总的可训练参数变为了11250*200=2250000.

返回到build函数中

x = self._make_embedding_layer(embed_type='char')(seq_input)

该语句调用了Embedding层对象的__call__方法，把seq_input作为输入张量。seq_input形状为（None，20），None代表batch维度，其大小由后面的model.fit给出；20代表每一句话经过padding后的长度，相当于在keras.layers.Embedding中设置了input_length=20.

此时x维度变为
在这里插入图片描述
再经过

        x = keras.layers.SpatialDropout1D(
                0.1,
                name='embed_drop')(x)

此处并不是对可训练参数做失活：SpatialDropout1D以0.1的概率把embedding输出中的整个特征通道（即某一个embedding维度在所有时间步上的取值）一起置零，该层命名为'embed_drop'。此时x维度不变

接下来是

        state_outputs,_fw,fw_final_c,_bw,bw_final_c = keras.layers.Bidirectional(
            keras.layers.LSTM(
                self._params['lstm_units'],
                dropout=self._params['lstm_dropout_rate'],
                return_sequences=True,
                return_state=True,
                ),
            name='bilstm_encoder'
            )(x)

其中， keras.layers.LSTM的参数解释为

第一个参数lstm_units 表示LSTM层，每一个时间步输出向量的维度
dropout为随机失活比例，作用在LSTM输入的线性变换上，取值为lstm_dropout_rate（这里为百分之十）
return_sequences：返回每个时间步的hidden state
return_state ：返回最后一个时间步的hidden state 和cell state
return_sequences 和 return_state ：可同时使用，三者都输出

在keras.layers.Bidirectional中，其库里的源码为

def __init__(self, layer, merge_mode='concat', weights=None, **kwargs):

可以看出默认模式为’concat’。

最后 state_outputs,_fw,fw_final_c,_bw,bw_final_c 的状态分别为
在这里插入图片描述
state_outputs的最后一个维度变为了256，这是由于双向LSTM要拼接前向LSTM和后向LSTM的结果。

Ps：这儿我有一点不明白的地方，按理说应该变成（None，20,256），第二个参数应该是时间步，且由输入确定了为20.

然后

        slot_inputs = state_outputs #[batch_size,maxlen,2*lstm_units]
        intent_input = keras.layers.concatenate(
            [fw_final_c,bw_final_c],
            name='final_state'
            ) #[batch_size,2*lstm_units]

把双向LSTM的编码结果赋值给slot_inputs ，再把双向LSTM的最后一步的细胞状态拼接，赋值给intent_input

意图识别模型

接下来是本论文的关键之一，意图识别模型

intent_attn = self._apply_intent_attn(state_outputs) #[batch_size,maxlen]

跳转到_apply_intent_attn函数内

    def _apply_intent_attn(self,inputs):
        intent_attn = IntentAttention(self._params['maxlen'],name='intent_attn')(inputs)
        return intent_attn

再跳转到IntentAttention函数内

class IntentAttention(Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input of shape (batch, step_dim, features_dim) the layer scores
    every time step with a learned vector ``W`` (plus an optional per-step
    bias ``b``), normalises the scores over the time axis and returns the
    weighted sum of the time steps, shape (batch, features_dim).

    Args:
        step_dim: number of time steps of the input; used to reshape the
            scores in ``call``.
        W_regularizer, b_regularizer: regularizers for ``W`` and ``b``.
        W_constraint, b_constraint: constraints for ``W`` and ``b``.
        bias: whether to add the per-step bias.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # NOTE(review): this flag is set before the base-class __init__ runs;
        # whether the base class overwrites it depends on the Keras version — confirm.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # set to the real feature size in build()
        super(IntentAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per step)."""
        assert len(input_shape) == 3
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape =(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is passed on to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score of every time step: x . W, reshaped to (batch, step_dim).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                            K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            # Masked steps get zero weight.
            a *= K.cast(mask, K.floatx())

        # Normalise over the time axis; epsilon guards against division by zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return every constructor argument so the layer can be re-created.

        Includes the bias flag, regularizers and constraints in addition to
        ``step_dim``; without them a reloaded layer would silently fall back
        to the constructor defaults.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(IntentAttention,self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

关于attention这部分代码有可能来自keras相关的开源实现，因此不做逐行解释。总之，该层先为每个时间步计算一个注意力权重，再对各时间步的编码向量加权求和（call中最后的K.sum(weighted_input, axis=1)），因此最终结果是每个样本一个向量，形状为（batch_size, features_dim），而不是一个序列。
在这里插入图片描述

这之后就是

        intent_feats = keras.layers.concatenate(
            [intent_input,intent_attn],
            name='intent_feats'
            )
        intent_dense = keras.layers.Dense(
            self._params['intent_dense_size'], 
            activation="relu",
            name="intent_dense"
            )(intent_feats)
        intent_out = keras.layers.Dense(
            self._params['intent_nums'], 
            activation="softmax",
            name="intent_out"
            )(intent_dense)

这部分代码对应论文中的公式为
$$y^I = \mathrm{softmax}\left(W^I_{hy}\,(c^I + h_T)\right)$$
其中论文中公式是intent_input向量与intent_attn向量相加，代码中是将他们进行拼接，维度变成512，而不是向量相加。

然后是接全连接层，激活函数为Relu，隐含层神经元个数设置为intent_dense_size（256），对应到公式中的矩阵，因为是relu函数，直接用一个矩阵与向量作內积好像也没什么毛病

最后是接一个全连接层，其输出维度设置为self._params[‘intent_nums’]，即意图个数，激活函数选用softmax。
在这里插入图片描述

槽位填充模型

槽位填充模型是本论文的核心部分

        if self._params['full_attention']:
            slot_attn_out = self._apply_slot_attn(slot_inputs)
            slot_feats = self._apply_slot_gate(
                state_outputs,slot_attn_out,intent_attn)
        else:
            slot_feats = self._apply_slot_gate(
                state_outputs,slot_inputs,intent_attn)

slot_attn_out 的生成和intent_attn有一点不一样，在_apply_slot_attn内部

    def _apply_slot_attn(self,inputs):
        # 将BILSTM编码输出先输入给一个前馈神经网络，不变换维度，然后计算attention
        slot_attn_ffn_size = K.int_shape(inputs)[2]
        slot_ffn = keras.layers.TimeDistributed(
            keras.layers.Dense(
                slot_attn_ffn_size,
                activation='relu'
                ),
            name='slot_ffn'
            )(inputs)
        slot_atten = SlotAttention()(slot_ffn)
        return slot_atten

先确定slot_attn_ffn_size 的值为256，然后用keras.layers.TimeDistributed 函数
keras.layers.TimeDistributed函数的用法在这儿可以简单理解为对每个时间步的编码向量同时连接一个全连接层。这儿的时间步应该为20，尽管实参inputs其实指代的是slot_inputs，而slot_inputs的形状为
在这里插入图片描述
但是由于前面代码分析，索引为一的位置理应为20，这儿不知道为什么显示的为None

link

TimeDistributed函数和前面的Bidirectional函数都属于层封装器 wrappers，把某层当做其参数。官方文档所说的是

这个封装器将一个层应用于输入的每个时间片。
输入至少为 3D，且第一个维度应该是时间所表示的维度。

第一个维度这儿略微有点歧义，代码中inputs的第一个维度其实是batch，这儿应理解为索引为1的维度。这儿应该是keras中英文文档的例子差异，建议观看英文的keras文档。

总之，经过TimeDistributed后，slot_ffn 的形状为
在这里插入图片描述
这儿时间步为20.

接下来就是求slot_atten ，这是求槽位信息的self-attention。进入到SlotAttention()

class SlotAttention(object):
    """Dot-product self-attention built from two Keras ``Lambda`` layers.

    Paper: https://www.aclweb.org/anthology/N18-2118/
    """
    def __call__(self, inputs):
        """Return the attended version of ``inputs``.

        A first Lambda layer ('slot_attn_weights') computes the pairwise
        dot-product scores; a second one ('slot_attn') turns them into
        softmax weights and applies them to ``inputs``.
        """
        weights_layer = keras.layers.Lambda(
            self._attention,
            output_shape=self._attention_output_shape,
            arguments=None,
            name='slot_attn_weights')
        scores = weights_layer(inputs)

        align_layer = keras.layers.Lambda(
            self._soft_alignment,
            output_shape=self._soft_alignment_output_shape,
            arguments=None,
            name='slot_attn')
        return align_layer([scores, inputs])

    def _attention(self, inputs):
        """Dot product of every time step of a sentence with every other one.

        Args:
            inputs: A sentence encoded by a BiLSTM.
        Returns:
            The tensor of raw (unnormalised) scores, with its last two axes
            swapped.
        """
        transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1))
        scores = K.batch_dot(x=inputs, y=transposed)
        return K.permute_dimensions(scores, (0, 2, 1))

    def _attention_output_shape(self, inputs):
        """Shape of the score tensor: (shape[0], shape[1], shape[1])."""
        batch = inputs[0]
        steps = inputs[1]
        return (batch, steps, steps)

    def _soft_alignment(self, inputs):
        """Softmax-normalise the scores and apply them to the sentence.

        Args:
            inputs: A list of two elements: the tensor of attention scores
                    and the encoded sentence the weights are applied to.
        Returns:
            A tensor containing the alignments.
        """
        scores = inputs[0]
        encoded = inputs[1]

        # Shift by the row maximum before exponentiating to avoid overflow.
        shifted = scores - K.max(scores, axis=-1, keepdims=True)
        unnormalised = K.exp(shifted)
        normaliser = K.sum(unnormalised, axis=-1, keepdims=True)
        weights = unnormalised / normaliser

        return K.batch_dot(weights, encoded)

    def _soft_alignment_output_shape(self, inputs):
        """Shape of the aligned tensor, from the two input shapes."""
        scores_shape = inputs[0]
        encoded_shape = inputs[1]
        return (scores_shape[0], scores_shape[1], encoded_shape[2])

此处分析一下该代码，首先是执行__call__方法。定义了__call__方法后，对象就可以像函数一样直接接实参调用，实际上就是执行对象内部的__call__方法，使得对象看起来更像一个函数了。

首先执行keras.layers.Lambda函数，该函数的第一个参数必须是一个函数的地址，以此作为Lambda层。头一个参数和output_shape 可以理解为执行了
详情参考link

_attention（inputs）
_soft_alignment_output_shape(inputs)

其中，_attention函数内 K.batch_dot是对两个张量作內积，K.permute_dimensions重新排列了张量的轴。link

_soft_alignment函数是求“计算两个句子元素之间的软对齐”，没看太懂，原话是Compute the soft alignment between the elements of two sentences。

回到_apply_slot_attn函数中
slot_atten的维度为
在这里插入图片描述

回到build函数中， slot_feats = self._apply_slot_gate ，进入到_apply_slot_gate

    def _apply_slot_gate(self,hi,slot_c,intent_c):
        slot_gate = SlotGate(name='slot_gate')([slot_c,intent_c])
        slot_feats = keras.layers.concatenate(
            [hi,slot_gate],
            name='slot_feats'
            )
        return slot_feats

运行之，得到各个变量的维度
在这里插入图片描述
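至此，完成了下式（在SlotGate函数中实现的）

$$g = \sum v \cdot \tanh\left(c_i^S + W \cdot c^I\right)$$

和（由keras.layers.concatenate实现的）

$$h_i + c_i^S \cdot g$$

（论文中此处是相加，代码中则是把hi与SlotGate的输出拼接起来。）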

        slot_feats_drop = keras.layers.TimeDistributed(
            keras.layers.Dropout(0.2),
            name='slot_feats_drop'
            )(slot_feats)
        slot_dense = keras.layers.TimeDistributed(
            keras.layers.Dense(
                self._params['slot_dense_size'],
                activation='relu'
                ),
            name='slot_dense'
            )(slot_feats_drop)
        slot_out = keras.layers.TimeDistributed(
            keras.layers.Dense(
                self._params['slot_label_nums'],
                activation='softmax'
                ),
            name='slot_out'
            )(slot_dense)

接下来就和intent模型一致了，先随机失活，再接全连接层和softmax层
完成了
在这里插入图片描述

训练模型

回到主函数中

    model.compile(
        optimizer='adam',
        loss={'slot_out':'categorical_crossentropy', 'intent_out':'categorical_crossentropy'},
        loss_weights={'slot_out': 1.0, 'intent_out': 0.5},
        metrics={'intent_out':'accuracy'}
        )

    print(model.summary())

    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor='val_slot_out_loss', 
        factor=0.5, 
        patience=4, 
        verbose=1)

    earlystop = keras.callbacks.EarlyStopping(
        monitor='val_slot_out_loss', 
        patience=8, 
        verbose=2, 
        mode='min'
        )
    bast_model_filepath = './model_file/slotgate_model.h5'
    checkpoint = keras.callbacks.ModelCheckpoint(
        bast_model_filepath, 
        monitor='val_slot_out_loss', 
        verbose=1, 
        save_best_only=True,
        mode='min'
        )

    H = model.fit(
            x=train_X,
            y={"slot_out": train_slot_y, "intent_out": train_intent_y},
            validation_data=(
                valid_X,
                {"slot_out": valid_slot_y, "intent_out": valid_intent_y}
                ),
            batch_size=arg.batch_size,
            epochs=arg.max_epochs,
            callbacks=[reduce_lr,earlystop,checkpoint]
            )

这一部分是固定套路。reduce_lr 是设置学习率衰减策略，注意model.compile里的loss，在设置损失函数的时候，可以是字符串形式，可以是目标函数形式，这儿是字符串形式，由于model有两个输出，因此写成了字典形式。loss_weights给出了各个输出的损失在总损失中的权重，这儿总损失 = 1.0 × slot_out的损失 + 0.5 × intent_out的损失。

    # model.load_weights(bast_model_filepath)
    test_X, test_slot_y, test_intent_y = test_processor.get_data()
    intent_pred,slot_pred = model.predict(test_X)

训练完成后，开始预测，各个变量形状如下，注意test_slot_y，test_intent_y 没有独热表示，而预测结果是softmax输出的在各个类别上的概率分布（形状与独热表示相同）
在这里插入图片描述

    intent_pred = np.argmax(intent_pred,axis=1)
    intent_accuracy = (intent_pred==test_intent_y)
    intent_accuracy = np.mean(intent_accuracy)*100.0
    print("\n\n%s 数据集意图准确率：" % arg.dataset,intent_accuracy)

既然真实值没有独热表示，那么要把预测结果转化成真实值的数值表示形式，计算准确率。
在这里插入图片描述

    # Slot evaluation: turn gold ids and predicted distributions into tag lists.
    from metrics import *
    tag2id = slot_vocab['vocab']
    id2tag = {v:k for k,v in tag2id.items()}
    y_true, y_pred = [],[]

    for t_ids,p_probs in zip(test_slot_y,slot_pred):
        p_ids = np.argmax(p_probs,axis=1)
        true_tags, pred_tags = [],[]
        # Walk gold and predicted ids position by position so both lists stay
        # aligned. Filtering each sequence for id 0 on its own would shift the
        # tokens whenever the model predicts id 0 on a real token or a
        # non-zero id on a padded position.
        for t_id,p_id in zip(t_ids,p_ids):
            if t_id == 0:
                # Id 0 is treated as padding, as in the original filter; skip it.
                continue
            true_tags.append(id2tag[t_id])
            # A padding prediction on a real token is scored as 'O'.
            pred_tags.append(id2tag[p_id] if p_id != 0 else 'O')

        y_true.append(true_tags)
        y_pred.append(pred_tags)

tag2id ,id2tag ，很好理解，是从词汇表中提取的。
tag2id如下图
在这里插入图片描述
id2tag如下图

在for循环中把slot_pred也变为数值表示形式，形状为（700,20），但是此时无论是test_slot_y还是slot_pred，他们都是数字，我们接下来通过for循环把数字转换成slot，这一操作是通过查找id2tag字典完成的。
在这里插入图片描述

计算各项评测指标

    f1 = f1_score(y_true,y_pred,suffix=False)
    p = precision_score(y_true,y_pred,suffix=False)
    r = recall_score(y_true,y_pred,suffix=False)
    acc = accuracy_score(y_true,y_pred)
    print("\nf1_score: {:.4f}, precision_score: {:.4f}, recall_score: {:.4f}, accuracy_score: {:.4f}".format(f1,p,r,acc))
    print(classification_report(y_true, y_pred, digits=4, suffix=False))

首先看f1值

def f1_score(y_true: object, y_pred: object, average: object = 'micro', suffix: object = False) -> object:
    """Entity-level F1 score of a tagger.

    Entities are extracted from both label sequences with ``get_entities``;
    an entity counts as correct only if the identical entity tuple occurs in
    the gold set. The score is::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.
        average : accepted but not used by this implementation.
        suffix : forwarded to ``get_entities``.

    Returns:
        score : float, or 0 when precision + recall is 0.

    Example:
        # >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> f1_score(y_true, y_pred)
        0.50
    """
    gold = set(get_entities(y_true, suffix))
    predicted = set(get_entities(y_pred, suffix))

    hits = len(gold & predicted)  # entities found in both sets

    precision = hits / len(predicted) if len(predicted) > 0 else 0
    recall = hits / len(gold) if len(gold) > 0 else 0

    total = precision + recall
    if total > 0:
        return 2 * precision * recall / total
    return 0

再进入到get_entities函数

def get_entities(seq, suffix=False):
    """Extract entity chunks from a label sequence.

    Args:
        seq (list): sequence of labels, or a list of such sequences. Nested
            input is flattened, with an 'O' appended after every inner list.
        suffix: when true, the tag letter is the last character of a label
            and the type is the part before the first '-'; otherwise the tag
            letter is the first character and the type is the part after the
            last '-'.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        # >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        # >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # Flatten nested input into one list of labels.
    if any(isinstance(element, list) for element in seq):
        flattened = []
        for sentence in seq:
            flattened.extend(sentence + ['O'])
        seq = flattened

    chunks = []
    prev_tag, prev_type = 'O', ''
    begin_offset = 0
    # A trailing 'O' is appended for the final iteration.
    for position, label in enumerate(seq + ['O']):
        if suffix:
            tag, type_ = label[-1], label.split('-')[0]
        else:
            try:
                tag, type_ = label[0], label.split('-')[-1]
            except IndexError:
                # An empty label is treated as 'O'.
                tag, type_ = 'O', 'O'

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, position - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = position

        prev_tag, prev_type = tag, type_

    return chunks

首先判断seq里是否存在list类型的元素（即seq是否为嵌套列表），只要存在一个就为true；接下来通过列表推导式，在每一个子列表后面补一个'O'，再把所有子列表拼接成一个大列表。
在这里插入图片描述
在这个大列表基础上查找实体元组，正如注释所说

Example:
# >>> from seqeval.metrics.sequence_labeling import get_entities
# >>> seq = [‘B-PER’, ‘I-PER’, ‘O’, ‘B-LOC’]
# >>> get_entities(seq)
[(‘PER’, 0, 1), (‘LOC’, 3, 3)]

元组类型为实体类型+起始位置+结束位置。

返回到f1_score函数中，用set把返回结果去重，得到2个set类型的对象

    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities) # 求两个集合交集的个数

再对两个set对象求交集，确定预测实体元组中预测正确的个数

    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    score = 2 * p * r / (p + r) if p + r > 0 else 0

p求的是精确率，为预测正确正样本个数占预测为正样本个数之比
r求的是召回率，为预测正确的正样本个数与实际正样本个数之比
f1是他们的调和平均值，即 2 * p * r / (p + r)。

f1求出来，返回到主函数中，precision_score和recall_score也就求出来了，接下来求accuracy_score

def accuracy_score(y_true, y_pred):
    """Token-level accuracy score.

    Computes the fraction of positions at which the predicted label equals
    the ground-truth label. Every token is counted, including 'O' tokens.
    Nested input (a list of label sequences) is flattened first.

    Args:
        y_true : 1d or 2d list. Ground truth (correct) target values.
        y_pred : 1d or 2d list. Estimated targets as returned by a tagger.
            Must have the same nesting as y_true.

    Returns:
        score : float. Number of matching tokens divided by len(y_true)
            after flattening; 0 when y_true contains no tokens.

    Example:
        # >>> from seqeval.metrics import accuracy_score
        # >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        # >>> accuracy_score(y_true, y_pred)
        0.8
    """
    # Flatten a list of sentences into one flat token list. The nesting
    # decision is made from y_true only and then applied to both inputs.
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    nb_true = len(y_true)

    # Guard against empty input instead of raising ZeroDivisionError; this
    # follows the same "... if n > 0 else 0" convention used for the
    # precision / recall / F1 computation.
    score = nb_correct / nb_true if nb_true > 0 else 0

    return score

accuracy_score求法与f1_score类似，但是它求的是基于每个token上的准确率，该准确率也包含了“O”.
输出结果为

f1_score: 0.7092, precision_score: 0.6875, recall_score: 0.7324, accuracy_score: 0.8398

最后计算每个slot的分数：在classification_report函数中，digits=4的意思是保留4位小数，返回的报告形如：

                       precision    recall  f1-score   support

      spatial_relation     0.7432    0.7746    0.7586        71
              playlist     0.6667    0.7287    0.6963       129
            music_item     0.7757    0.7981    0.7867       104
                  city     0.6769    0.7333    0.7040        60
                artist     0.6436    0.6075    0.6250       107
           object_type     0.7607    0.7654    0.7631       162
             timeRange     0.7193    0.7664    0.7421       107
            movie_name     0.4706    0.5106    0.4898        47
         object_select     0.6829    0.7000    0.6914        40
               cuisine     0.8889    0.5714    0.6957        14
     party_size_number     0.8200    0.8200    0.8200        50
                 genre     0.3333    0.4000    0.3636         5
                  sort     0.8438    0.8438    0.8438        32
                 album     0.0000    0.0000    0.0000        10
          rating_value     0.7531    0.7625    0.7578        80
                 state     0.7586    0.7458    0.7521        59
           object_name     0.4486    0.5646    0.5000       147
 condition_description     0.7667    0.8214    0.7931        28
        playlist_owner     0.8194    0.8429    0.8310        70
           rating_unit     0.7750    0.7750    0.7750        40
           entity_name     0.4000    0.5455    0.4615        33
            movie_type     0.9394    0.9394    0.9394        33
                  year     0.6923    0.7500    0.7200        24
           best_rating     0.7727    0.7907    0.7816        43
               country     0.7857    0.7500    0.7674        44
 condition_temperature     0.7826    0.7826    0.7826        23
       restaurant_type     0.8438    0.8308    0.8372        65
         location_name     0.8400    0.8750    0.8571        24
party_size_description     0.3846    0.5000    0.4348        10
                   poi     0.6000    0.7500    0.6667         8
               service     0.9583    0.9583    0.9583        24
  object_location_type     0.9091    0.9091    0.9091        22
                 track     0.2222    0.4444    0.2963         9
       restaurant_name     0.4500    0.6000    0.5143        15
      current_location     0.9333    1.0000    0.9655        14
           served_dish     0.4500    0.7500    0.5625        12
        geographic_poi     0.4286    0.5455    0.4800        11

object_part_of_series_type     0.6154    0.7273    0.6667        11
              facility     0.6667    0.6667    0.6667         3

             micro avg     0.6875    0.7324    0.7092      1790
             macro avg     0.7014    0.7324    0.7148      1790

Process finished with exit code 0

再一次训练为

snips 数据集意图准确率： 97.0

f1_score: 0.8593, precision_score: 0.8346, recall_score: 0.8855, accuracy_score: 0.9359
                       precision    recall  f1-score   support

             timeRange     0.8654    0.8411    0.8531       107
                artist     0.7500    0.7009    0.7246       107
party_size_description     0.7273    0.8000    0.7619        10
                   poi     0.6000    0.7500    0.6667         8
                  sort     0.9677    0.9375    0.9524        32
     party_size_number     0.9804    1.0000    0.9901        50
           entity_name     0.4884    0.6364    0.5526        33
                 state     0.9286    0.8814    0.9043        59
          rating_value     0.9756    1.0000    0.9877        80
              playlist     0.7616    0.8915    0.8214       129
           object_type     0.9448    0.9506    0.9477       162
           rating_unit     1.0000    1.0000    1.0000        40
           object_name     0.6353    0.7347    0.6814       147
                  city     0.7576    0.8333    0.7937        60
            music_item     0.9455    1.0000    0.9720       104
      current_location     1.0000    1.0000    1.0000        14
                  year     1.0000    1.0000    1.0000        24
      spatial_relation     0.9189    0.9577    0.9379        71
               country     0.8039    0.9318    0.8632        44
               service     0.9600    1.0000    0.9796        24
            movie_name     0.6250    0.6383    0.6316        47
            movie_type     1.0000    1.0000    1.0000        33
       restaurant_type     0.9692    0.9692    0.9692        65
         location_name     0.8846    0.9583    0.9200        24
  object_location_type     1.0000    1.0000    1.0000        22
       restaurant_name     0.4762    0.6667    0.5556        15
         object_select     0.9756    1.0000    0.9877        40
        playlist_owner     0.9286    0.9286    0.9286        70
           best_rating     1.0000    0.9767    0.9882        43

object_part_of_series_type     1.0000    1.0000    1.0000        11
 condition_description     0.9655    1.0000    0.9825        28
        geographic_poi     0.7778    0.6364    0.7000        11
               cuisine     0.7143    0.7143    0.7143        14
           served_dish     0.6111    0.9167    0.7333        12
 condition_temperature     1.0000    1.0000    1.0000        23
                 album     0.0714    0.1000    0.0833        10
                 track     0.1852    0.5556    0.2778         9
                 genre     0.5714    0.8000    0.6667         5
              facility     1.0000    1.0000    1.0000         3

             micro avg     0.8346    0.8855    0.8593      1790
             macro avg     0.8502    0.8855    0.8657      1790

Process finished with exit code 0

基本达到了论文中的结果。

解析中遇到的问题

1.门控单元的建立没看懂
2.debug时点击step into my code，经常遇到如下情况
在这里插入图片描述
左下角的调用栈中，从__call__（modules.py）跳到_attention（modules.py）时，中间突然多了两个灰色层：跳过去的时候不进入灰色层，返回时却进入了灰色层，这是为什么？

Meloneating

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录