For sequence labeling tasks, BERT's tokenizer can shift the alignment between labels and characters (it drops whitespace and may merge ASCII spans), so we tokenize at the char level instead.
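To see the offset problem concretely, here is a quick sketch (assuming the same local vocab path as the script below). BERT's BasicTokenizer discards whitespace and its WordPiece step may merge ASCII characters, so the tokens stop lining up 1:1 with the characters; the exact token split may vary with the vocab, but the mismatch is the point:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./pretrained_models/bert-base-chinese/vocab.txt')
text = "加速 62 秒"
print(len(text))                 # 8 characters, so we would want 8 labels
print(tokenizer.tokenize(text))  # whitespace is gone and digits may merge,
                                 # e.g. ['加', '速', '62', '秒'] -- the 1:1 char-to-token mapping is broken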
import torch
from transformers import BertTokenizer, BertModel
def fine_grade_tokenize(raw_text, tokenizer):
    """
    For sequence labeling, the BERT tokenizer can shift label alignment,
    so tokenize at the char level instead.
    """
    tokens = []
    for _ch in raw_text:
        if _ch in [' ', '\t', '\n']:
            # Whitespace would be dropped by BERT's tokenizer; keep a placeholder
            # so every character still occupies exactly one position.
            tokens.append('[BLANK]')
        elif not len(tokenizer.tokenize(_ch)):
            # Characters the tokenizer cannot handle (they would vanish) also get
            # a placeholder. For real use, add '[BLANK]'/'[INV]' to the tokenizer
            # as special tokens, otherwise they will not encode as single tokens.
            tokens.append('[INV]')
        else:
            tokens.append(_ch)
    return tokens
sentence = "62号汽车故障报告综合情况:故障现象:加速后,丢开油门,发动机熄火。"
len_sentence = len(sentence)
print("len_sentence = ", len_sentence)
bert_tokenizer = BertTokenizer.from_pretrained('./pretrained_models/bert-base-chinese/vocab.txt')
tokens = fine_grade_tokenize(sentence, bert_tokenizer)
print("len(tokens) = {0}; \ntokens = \n{1}".format(len(tokens), tokens))
encode_dict = bert_tokenizer.encode_plus(text=tokens,
                                         is_split_into_words=True,  # renamed from is_pretokenized in transformers v4
                                         return_token_type_ids=True,
                                         return_attention_mask=True,
                                         max_length=36,
                                         padding='max_length',      # replaces the deprecated pad_to_max_length=True
                                         truncation=True)
print("\nencode_dict = \n", encode_dict)
token_ids = torch.tensor(encode_dict['input_ids'])
attention_masks = torch.tensor(encode_dict['attention_mask'])
token_type_ids = torch.tensor(encode_dict['token_type_ids'])
print("\nlen(token_ids) = {0}; \ntoken_ids = \n{1}".format(len(token_ids), token_ids))
zip_result = list(zip(tokens, token_ids[1:-1].numpy()))  # drop [CLS]/[SEP] so the ids pair 1:1 with the chars
print("\nlen(zip_result) = {0}; \nzip_result = \n{1}".format(len(zip_result), zip_result))
# Add a batch dimension for the BERT forward pass
token_ids = token_ids.unsqueeze(0)
attention_masks = attention_masks.unsqueeze(0)
token_type_ids = token_type_ids.unsqueeze(0)
bert_module = BertModel.from_pretrained("./pretrained_models/bert-base-chinese")
bert_outputs = bert_module(input_ids=token_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
seq_out = bert_outputs[0]  # last_hidden_state, shape (batch, seq_len, hidden=768)
print("\nEmbedding after BERT encoding:\nseq_out.shape = {0}; \nseq_out = \n{1}".format(seq_out.shape, seq_out))
Output:
len_sentence = 34
len(tokens) = 34;
tokens =
['6', '2', '号', '汽', '车', '故', '障', '报', '告', '综', '合', '情', '况', ':', '故', '障', '现', '象', ':', '加', '速', '后', ',', '丢', '开', '油', '门', ',', '发', '动', '机', '熄', '火', '。']
After encoding with BERT (bert-base-chinese), 101 ([CLS]) and 102 ([SEP]) are added at the start and end, respectively.
encode_dict =
{
'input_ids': [101, 127, 123, 1384, 3749, 6756, 3125, 7397, 2845, 1440, 5341, 1394, 2658, 1105, 131, 3125, 7397, 4385, 6496, 131, 1217, 6862, 1400, 8024, 696, 2458, 3779, 7305, 8024, 1355, 1220, 3322, 4219, 4125, 511, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
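That 101 and 102 really are the [CLS]/[SEP] ids can be confirmed directly from the tokenizer's special-token attributes (a quick sanity check):
print(bert_tokenizer.cls_token, bert_tokenizer.cls_token_id)  # [CLS] 101
print(bert_tokenizer.sep_token, bert_tokenizer.sep_token_id)  # [SEP] 102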
len(token_ids) = 36;
token_ids =
tensor([ 101, 127, 123, 1384, 3749, 6756, 3125, 7397, 2845, 1440, 5341, 1394,
2658, 1105, 131, 3125, 7397, 4385, 6496, 131, 1217, 6862, 1400, 8024,
696, 2458, 3779, 7305, 8024, 1355, 1220, 3322, 4219, 4125, 511, 102])
len(zip_result) = 34;
zip_result =
[('6', 127), ('2', 123), ('号', 1384), ('汽', 3749), ('车', 6756), ('故', 3125), ('障', 7397), ('报', 2845), ('告', 1440), ('综', 5341), ('合', 1394), ('情', 2658), ('况', 1105), (':', 131), ('故', 3125), ('障', 7397), ('现', 4385), ('象', 6496), (':', 131), ('加', 1217), ('速', 6862), ('后', 1400), (',', 8024), ('丢', 696), ('开', 2458), ('油', 3779), ('门', 7305), (',', 8024), ('发', 1355), ('动', 1220), ('机', 3322), ('熄', 4219), ('火', 4125), ('。', 511)]
Embedding after BERT encoding:
seq_out.shape = torch.Size([1, 36, 768]);
seq_out =
tensor([[[ 0.4830, 0.4836, 0.0063, ..., 0.8388, -0.1990, 0.2916],
[-0.1549, -0.6636, 0.8651, ..., -0.9683, 0.5224, 0.0259],
[ 0.9820, -0.3003, -0.2826, ..., 0.6285, -0.2102, -0.1150],
...,
[ 0.5009, -1.2509, -1.1049, ..., 1.0608, -0.9074, 0.1253],
[ 0.8523, 0.2491, 0.0017, ..., -0.2093, -0.4663, -0.0143],
[ 0.8245, 0.3761, -0.1383, ..., 0.9094, -0.6616, -0.0506]]],
grad_fn=<NativeLayerNormBackward0>)
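Because the char-level tokens align 1:1 with the input characters, seq_out can feed a per-character classification head directly once the [CLS]/[SEP] positions are stripped. A minimal sketch, where num_labels is a hypothetical label-set size (e.g. BIO tags for NER):
import torch.nn as nn

num_labels = 5                           # hypothetical label set, e.g. BIO tags
classifier = nn.Linear(768, num_labels)  # one logit vector per character
char_out = seq_out[:, 1:-1, :]           # drop [CLS]/[SEP]: (1, 34, 768)
logits = classifier(char_out)            # (1, 34, num_labels)
print(logits.shape)                      # one prediction per original character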