基于中文哪吒NEZHA的FLAT的命名实体识别实现与探讨（二）

最新推荐文章于 2024-04-11 17:50:18 发布

chenmingwei000

最新推荐文章于 2024-04-11 17:50:18 发布

阅读量770

点赞数

分类专栏：实体识别 bert

本文链接：https://blog.csdn.net/chenmingwei000/article/details/107896515

版权

bert 同时被 2 个专栏收录

5 篇文章 1 订阅

订阅专栏

实体识别

3 篇文章 0 订阅

订阅专栏

接下来解释如何利用匹配到的词汇与模型进行融合的代码，获取在训练数据中的单词id以及词汇在语句中的开始位置和结束位置，字符对应的标签对应关系，代码如下：

 if args.do_train and args.do_eval:
        # Load the training data; the candidate lexicon words are converted
        # to ids in the same pass (hence the gazetteer and its alphabet are
        # handed to the reader).
        train_examples = processor.get_train_examples(args.data_dir,data.gaz,data.gaz_alphabet)

具体的操作过程如下：

   def _read_data(self, input_file,gaz,gaz_alphabet):
        """Read '---'-delimited BIO NER data and attach lexicon matches.

        Every data line holds one character and its label separated by a
        tab; a line that contains only ``---`` closes the current sentence.
        Lines starting with ``-DOCSTART-`` and any other line that is not a
        character/label pair are ignored.

        Args:
            input_file: Path of the UTF-8 encoded data file.
            gaz: Lexicon matcher.  ``gaz.enumerateMatchList(chars)`` is called
                with the characters from the current position to the end of
                the sentence and is expected to return the lexicon words that
                start at that position (TODO confirm against the gazetteer
                implementation).
            gaz_alphabet: Maps a lexicon word to its id via ``get_index``.

        Returns:
            One entry per sentence:
            ``[labels joined by '|', characters joined by '|', word_ids,
            word_spans]`` where ``word_spans[k]`` is the inclusive
            ``[start, end]`` character span of ``word_ids[k]``.

        Side effects:
            Every label that is kept is added to ``self.labels``.

        NOTE(review): a final sentence that is not followed by a ``---`` line
        is discarded, because sentences are only emitted on the separator.
        """
        lines = []
        words = []
        labels = []
        with codecs.open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                # Skip document markers before they can be parsed as data.
                if line.startswith("-DOCSTART-"):
                    continue
                tokens = line.split('\t')
                if len(tokens) == 2:
                    words.append(tokens[0])
                    labels.append(tokens[-1].strip())
                    continue
                # This data set uses '---' to separate sentences.
                if line.strip() != '---' or not words:
                    continue

                # Keep only complete (character, label) pairs.  Working on
                # plain lists avoids the label misalignment that running the
                # BERT tokenizer over the whole sentence would cause.
                label = []
                word = []
                for tag, char in zip(labels, words):
                    if tag and char:
                        label.append(tag)
                        self.labels.add(tag)
                        word.append(char)

                words_start_end = []  # inclusive [start, end] span per matched word
                words_ids = []        # id of every matched lexicon word, same order
                # Match over the filtered characters so the spans index into
                # exactly the character list that is emitted below.
                for idx in range(len(word)):
                    for entity in gaz.enumerateMatchList(word[idx:]):
                        # The match was enumerated from ``idx``, so that is its
                        # start.  Searching the sentence text for the word
                        # would return its first occurrence and give repeated
                        # words a wrong span.  Assumes one character per
                        # token, so len(entity) is the token count.
                        words_start_end.append([idx, idx + len(entity) - 1])
                        words_ids.append(gaz_alphabet.get_index(entity))

                lines.append(['|'.join(label), '|'.join(word),words_ids,words_start_end])
                words = []
                labels = []
        return lines

以上代码最终得到的是字符及其对应的 label 标签、语句中匹配到的词汇 ids，以及这些词汇在句子中的开始和结束位置。然后写入 TFRecord 中，代码如下：

# Build the feature file only when it is not on disk yet; an existing
# train_file is reused as-is (presumably the TFRecord file mentioned in the
# text -- confirm against filed_based_convert_examples_to_features).
if not os.path.exists(train_file):
    filed_based_convert_examples_to_features(
        train_examples, label_list, args.max_seq_length, tokenizer, train_file, args.output_dir)

针对每一个样本使用函数：
feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode)构造feature

具体函数如下：

 # Lexicon words matched in this sentence (their ids) and their [start, end]
 # character spans.  Both default to empty lists so that a sentence without
 # any lexicon match does not hit an unbound name further down.
 tokens_b = []
 tokens_start_end = []
 if example.words_ids:
     # Use the word ids prepared by the reader as tokens_b, together with
     # the matching start/end positions.
     tokens_b = example.words_ids
     tokens_start_end = example.words_start_end

 # Feed the sentence to the BERT tokenizer one character at a time.  A
 # character may be missing from the BERT vocabulary or be dropped, so
 # working per character keeps tokens and labels aligned.
 labellist = example.label.split('|')
 textlist = example.text.split('|')
 assert len(textlist) == len(labellist)
 tokens = []
 labels = []
 for i, word in enumerate(textlist):
     # For Chinese, tokenizing means splitting into characters, but a
     # character that is not in BERT's vocab.txt (e.g. Chinese quotation
     # marks) still goes through WordPiece; all of this could be replaced by
     # list(input).
     token = tokenizer.tokenize(word)
     tokens.extend(token)
     label_1 = labellist[i]
     for m in range(len(token)):
         if m == 0:
             labels.append(label_1)
         else:
             # Normally unreachable: only hit when one character is split
             # into several word pieces; the extra pieces get label "X".
             labels.append("X")
 assert len(tokens) == len(labels)

 if tokens_b:
     # The words are appended after the characters, so the pair must not
     # exceed the maximum length.
     # Account for [CLS], [SEP], [SEP] with "- 3"
     _truncate_seq_pair(tokens, tokens_b, tokens_start_end, max_seq_length - 3)

 # From here on: the usual wrapping of the characters in [CLS] ... [SEP]
 # and the conversion to ids.
 ntokens = []
 segment_ids = []
 if mode != 'test':
     label_ids = []
     # "O" or "[CLS]" makes no difference here.  "O" would save one label,
     # but marking sentence start and end with their own tags is fine too.
     label_ids.append(label_map["[CLS]"])
 ntokens.append("[CLS]")  # sentence start marker
 segment_ids.append(0)
 # append("O") or append("[CLS]") not sure!
 for i, token in enumerate(tokens):
     ntokens.append(token)
     segment_ids.append(0)
     if mode != 'test':
         label_ids.append(label_map[labels[i]])
 ntokens.append("[SEP]")  # sentence end marker
 segment_ids.append(0)
 # append("O") or append("[SEP]") not sure!
 if mode != 'test':
     label_ids.append(label_map["[SEP]"])

 # Convert the tokens in ntokens to their vocabulary ids.
 input_ids = tokenizer.convert_tokens_to_ids(ntokens)

接下来的重点是生成词汇的开始位置列表以及结束位置列表：

  初始化开始位置与结束位置
 # Head/tail position lists: every slot starts out as its own index.
 postion_head = list(range(max_seq_length))
 postion_tail = list(range(max_seq_length))
 # Indices are 0-based, so the character tokens occupy slots
 # 0 .. len(input_ids)-1 and the first lexicon word goes into slot
 # len(input_ids).
 index_begin = len(input_ids)
 # NOTE(review): assumes len(input_ids) + len(tokens_start_end) stays within
 # max_seq_length (presumably guaranteed by the earlier truncation) --
 # otherwise the assignments below raise IndexError.
 for words_index in tokens_start_end:
      # [CLS] was inserted at position 0, so every character position
      # shifts by one.
      postion_head[index_begin] = words_index[0]+1
      postion_tail[index_begin] = words_index[1]+1
      index_begin+=1

这样上面的代码就把对应词汇的开始下标和结束下标替换进去了，对应论文中的位置标记数据，例如：
token [CLS, X1, X2, X3, …, SEP]
head [0 ,1,2,…,67,1,4,…400]
tail [0,1,2,…,67,2,5,…400]
67对应SEP，对应后边的便是词汇在句子中的开始和结束位置。