I. Paper Reading
II. Code Walkthrough
1. Processing the input data
class LEBertProcessor(Processor):
    def __init__(self, args, tokenizer):
By default, the author loads only the first 10,000 words (not characters) of the pretrained word embeddings to build the trie and to update its maximum depth. The method below returns word_embed_dict (a word-to-vector dict), word_list (the first 10,000 words), and word_embed_dim (the embedding dimension).
@classmethod
def load_word_embedding(cls, word_embed_path, max_scan_num):
    """
    todo: many entries are single characters; consider whether to drop them
    Load the first max_scan_num word vectors and return the word list
    :return:
    """
    logger.info('loading word embedding from pretrain')
    word_embed_dict = dict()
    word_list = list()
    with open(word_embed_path, 'r', encoding='utf8') as f:
        for idx, line in tqdm(enumerate(f)):
            # only scan the first max_scan_num word vectors
            if idx > max_scan_num:
                break
            items = line.strip().split()
            if idx == 0:
                # the first line of the file is a header: "<num_words> <embedding_dim>"
                assert len(items) == 2
                num_embed, word_embed_dim = items
                num_embed, word_embed_dim = int(num_embed), int(word_embed_dim)
            else:
                assert len(items) == word_embed_dim + 1
                word = items[0]
                embedding = np.empty([1, word_embed_dim])
                embedding[:] = items[1:]
                word_embed_dict[word] = embedding
                word_list.append(word)
    logger.info('word_embed_dim:{}'.format(word_embed_dim))
    logger.info('size of word_embed_dict:{}'.format(len(word_embed_dict)))
    logger.info('size of word_list:{}'.format(len(word_list)))
    return word_embed_dict, word_list, word_embed_dim
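The asserts above imply an embedding file whose first line is a "<num_words> <dim>" header, followed by one word and its vector per line. As a minimal sketch (the words and numbers below are made up for illustration), such a file could be prepared and loaded like this:

# Toy embedding file in the layout load_word_embedding expects (hypothetical content).
sample = """3 4
中国 0.1 0.2 0.3 0.4
重庆 0.5 0.6 0.7 0.8
药店 0.9 1.0 1.1 1.2
"""
with open('toy_word_embedding.txt', 'w', encoding='utf8') as f:
    f.write(sample)

# word_embed_dict, word_list, word_embed_dim = LEBertProcessor.load_word_embedding(
#     'toy_word_embedding.txt', max_scan_num=10000)
# word_embed_dim would be 4 and word_list would be ['中国', '重庆', '药店']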
Building the trie: max_depth is the length of the longest inserted word.
import collections


class TrieNode:
    def __init__(self):
        self.children = collections.defaultdict(TrieNode)
        self.is_word = False


class Trie:
    """
    In fact, this Trie is a letter tree.
    root is a fake node; its only function is to mark the beginning of a word, like <bow>.
    The first layer holds every possible first letter of a word; for example, for '中国'
    the first letter is '中'.
    The second layer holds every possible second letter, and so on.
    """
    def __init__(self, use_single=True):
        self.root = TrieNode()
        self.max_depth = 0
        if use_single:
            self.min_len = 0
        else:
            self.min_len = 1

    def insert(self, word):
        current = self.root
        deep = 0
        for letter in word:
            current = current.children[letter]
            deep += 1
        current.is_word = True
        if deep > self.max_depth:
            self.max_depth = deep

    def search(self, word):
        current = self.root
        for letter in word:
            current = current.children.get(letter)
            if current is None:
                return False
        return current.is_word

    def enumerateMatch(self, chars, space=""):
        """
        Args:
            chars: the character sequence to match (a mutable list; characters are removed from its end)
        Return:
            the matched words; if multi-character words exist, single-character words are filtered out
        """
        matched = []
        while len(chars) > self.min_len:
            if self.search(chars):
                matched.insert(0, space.join(chars[:]))  # shorter words always come first
            del chars[-1]
        if len(matched) > 1 and len(matched[0]) == 1:  # filter single-character words
            matched = matched[1:]
        return matched
@classmethod
def build_trie_tree(cls, word_list, save_path):
    """
    # todo: consider not adding single characters to the trie
    Build the trie
    :return:
    """
    logger.info('building trie tree')
    trie_tree = Trie()
    for word in word_list:
        trie_tree.insert(word)
    write_pickle(trie_tree, save_path)
    return trie_tree
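A quick usage sketch of the Trie above, with a handful of made-up lexicon words rather than the 10,000-word embedding vocabulary:

# Minimal usage sketch of the Trie defined above; the inserted words are toy examples.
trie = Trie()
for w in ['中', '中国', '中国人']:
    trie.insert(w)

print(trie.max_depth)                         # 3, the length of the longest inserted word
print(trie.search('中国'))                     # True
# enumerateMatch expects a mutable character list, since it deletes from the end.
print(trie.enumerateMatch(list('中国人民')))    # ['中国', '中国人'] -- the single-character match '中' is filtered out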
Getting the input data:
def get_input_data(self, file):
    lines = load_lines(file)
    features = []
    cls_token_id = self.tokenizer.cls_token_id
    sep_token_id = self.tokenizer.sep_token_id
    pad_token_id = self.tokenizer.pad_token_id
    o_label_id = self.label_vocab.convert_token_to_id('O')
    pad_label_id = self.label_vocab.convert_token_to_id('[PAD]')
    for line in tqdm(lines):
        data = json.loads(line)
        text = data['text']
        labels = data['label']
        char_index2words = self.get_char2words(text)
        # add [CLS] at the beginning and [SEP] at the end
        input_ids = [cls_token_id] + self.tokenizer.convert_tokens_to_ids(text) + [sep_token_id]
        label_ids = [o_label_id] + self.label_vocab.convert_tokens_to_ids(labels) + [o_label_id]
        word_ids_list = []
        word_pad_id = self.word_vocab.convert_token_to_id('[PAD]')
        # for each character, keep at most max_word_num (default 3) matched words,
        # convert them to word ids, and pad the missing slots
        for words in char_index2words:
            words = words[:self.max_word_num]
            word_ids = self.word_vocab.convert_tokens_to_ids(words)
            word_pad_num = self.max_word_num - len(words)
            word_ids = word_ids + [word_pad_id] * word_pad_num
            word_ids_list.append(word_ids)
        # pad the positions of [CLS] and [SEP]
        word_ids_list = [[word_pad_id] * self.max_word_num] + word_ids_list + [[word_pad_id] * self.max_word_num]
        # truncate
        if len(input_ids) > self.max_seq_len:
            input_ids = input_ids[: self.max_seq_len]
            label_ids = label_ids[: self.max_seq_len]
            word_ids_list = word_ids_list[: self.max_seq_len]
        input_mask = [1] * len(input_ids)
        token_type_ids = [0] * len(input_ids)
        assert len(input_ids) == len(label_ids) == len(word_ids_list)
        # padding up to max_seq_len
        padding_length = self.max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        input_mask += [0] * padding_length
        token_type_ids += [0] * padding_length
        label_ids += [pad_label_id] * padding_length
        word_ids_list += [[word_pad_id] * self.max_word_num] * padding_length

        text = ''.join(text)
        input_ids = torch.LongTensor(input_ids)
        label_ids = torch.LongTensor(label_ids)
        input_mask = torch.LongTensor(input_mask)
        token_type_ids = torch.LongTensor(token_type_ids)
        word_ids = torch.LongTensor(word_ids_list)
        word_mask = (word_ids != word_pad_id).long()  # 1 for real words, 0 for padded slots
        feature = {
            'text': text, 'input_ids': input_ids, 'attention_mask': input_mask, 'token_type_ids': token_type_ids,
            'word_ids': word_ids, 'word_mask': word_mask, 'label_ids': label_ids
        }
        features.append(feature)
    return features
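For intuition, here is a toy re-run of just the per-character padding step, with hypothetical word ids and max_word_num = 3 (not repo code). Every example ends up with a word_ids tensor of shape [max_seq_len, max_word_num]:

# Toy illustration of the per-character word-id padding in get_input_data,
# assuming max_word_num = 3 and a hypothetical '[PAD]' word id of 0.
max_word_num = 3
word_pad_id = 0
char_index2word_ids = [[17, 42], [17], []]  # made-up ids for three characters
word_ids_list = []
for word_ids in char_index2word_ids:
    word_ids = word_ids[:max_word_num]
    word_ids_list.append(word_ids + [word_pad_id] * (max_word_num - len(word_ids)))
print(word_ids_list)  # [[17, 42, 0], [17, 0, 0], [0, 0, 0]]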
Getting the word list for each character. max_depth is the length of the longest word in the trie built from the first 10,000 words, and it bounds the match window.
def get_char2words(self, text):
    """
    Get, for each character, the list of lexicon words that cover it
    :param text:
    :return:
    """
    text_len = len(text)
    char_index2words = [[] for _ in range(text_len)]
    for idx in range(text_len):
        sub_sent = text[idx:idx + self.trie_tree.max_depth]  # speed up by only looking ahead max_depth characters
        words = self.trie_tree.enumerateMatch(sub_sent)  # find all lexicon words starting at text[idx]
        for word in words:
            start_pos = idx
            end_pos = idx + len(word)
            for i in range(start_pos, end_pos):
                char_index2words[i].append(word)
    # todo truncation
    # for i, words in enumerate(char_index2words):
    #     char_index2words[i] = char_index2words[i][:self.max_word_num]
    return char_index2words
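Putting the trie and this matching loop together, a self-contained sketch of the mapping (toy lexicon words; the repo builds the trie from the embedding vocabulary):

# End-to-end sketch of the character-to-words mapping, reusing the Trie class above.
trie = Trie()
for w in ['重庆', '重庆人', '药店']:  # toy lexicon
    trie.insert(w)

text = list('重庆人和药店')
char_index2words = [[] for _ in range(len(text))]
for idx in range(len(text)):
    sub_sent = text[idx:idx + trie.max_depth]
    for word in trie.enumerateMatch(sub_sent):
        for i in range(idx, idx + len(word)):
            char_index2words[i].append(word)

print(char_index2words)
# [['重庆', '重庆人'], ['重庆', '重庆人'], ['重庆人'], [], ['药店'], ['药店']]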
2. Word embedding construction (WordEmbeddingAdapter)
import torch
import torch.nn.functional as F
from torch import nn


class WordEmbeddingAdapter(nn.Module):

    def __init__(self, config):
        super(WordEmbeddingAdapter, self).__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.tanh = nn.Tanh()
        self.linear1 = nn.Linear(config.word_embed_dim, config.hidden_size)
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)

        attn_W = torch.zeros(config.hidden_size, config.hidden_size)
        self.attn_W = nn.Parameter(attn_W)
        self.attn_W.data.normal_(mean=0.0, std=config.initializer_range)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, layer_output, word_embeddings, word_mask):
        """
        :param layer_output: output of a BERT layer, [b_size, len_input, d_model]
        :param word_embeddings: word vectors matched to each character, [b_size, len_input, num_word, d_word]
        :param word_mask: attention mask over each character's matched words, [b_size, len_input, num_word]
        """
        # transform
        # to keep the word vectors and character vectors in the same dimension, the word vectors
        # of each char-words pair go through a non-linear transformation into the model dimension
        word_outputs = self.linear1(word_embeddings)
        word_outputs = self.tanh(word_outputs)
        word_outputs = self.linear2(word_outputs)
        word_outputs = self.dropout(word_outputs)  # word_outputs: [b_size, len_input, num_word, d_model]

        # compute attention weights between each character vector and its matched word vectors
        # via a bilinear map, then take the weighted sum
        # layer_output.unsqueeze(2): [b_size, len_input, 1, d_model]
        scores = torch.matmul(layer_output.unsqueeze(2), self.attn_W)  # [b_size, len_input, 1, d_model]
        scores = torch.matmul(scores, torch.transpose(word_outputs, 2, 3))  # [b_size, len_input, 1, num_word]
        scores = scores.squeeze(2)  # [b_size, len_input, num_word]
        scores = scores.masked_fill(word_mask == 0, -1e9)  # set the attention of padded word slots to a very small number (word_mask is 1 for real words)
        scores = F.softmax(scores, dim=-1)  # [b_size, len_input, num_word]
        attn = scores.unsqueeze(-1)  # [b_size, len_input, num_word, 1]

        weighted_word_embedding = torch.sum(word_outputs * attn, dim=2)  # [N, L, D]  weighted sum of each character's matched word vectors
        layer_output = layer_output + weighted_word_embedding

        layer_output = self.dropout(layer_output)
        layer_output = self.layer_norm(layer_output)
        return layer_output
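As a sanity check of the shapes, here is a minimal sketch that runs the adapter on random tensors under a toy config (the hyperparameter values are illustrative, not the ones used with the pretrained BERT in the repo). The bilinear score for a character vector h_c and a transformed word vector x_w is essentially h_c · attn_W · x_w.

# Shape-check sketch for WordEmbeddingAdapter, using a made-up config object.
import torch
from types import SimpleNamespace

config = SimpleNamespace(hidden_dropout_prob=0.1, word_embed_dim=200,
                         hidden_size=768, initializer_range=0.02, layer_norm_eps=1e-12)
adapter = WordEmbeddingAdapter(config)

b_size, len_input, num_word = 2, 8, 3
layer_output = torch.randn(b_size, len_input, config.hidden_size)
word_embeddings = torch.randn(b_size, len_input, num_word, config.word_embed_dim)
word_mask = torch.ones(b_size, len_input, num_word, dtype=torch.long)  # 1 = real word, 0 = padded slot

out = adapter(layer_output, word_embeddings, word_mask)
print(out.shape)  # torch.Size([2, 8, 768])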