For sequence labeling tasks, BERT's tokenizer can shift the alignment between labels and characters (it drops whitespace and may merge ASCII spans), so we tokenize at the char level instead.
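To see the offset problem concretely, here is a quick sketch (assuming the same local vocab path as the script below). BERT's BasicTokenizer discards whitespace and its WordPiece step may merge ASCII characters, so the tokens stop lining up 1:1 with the characters; the exact token split may vary with the vocab, but the mismatch is the point:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('./pretrained_models/bert-base-chinese/vocab.txt')
text = "加速 62 秒"
print(len(text))                 # 8 characters, so we would want 8 labels
print(tokenizer.tokenize(text))  # whitespace is gone and digits may merge,
                                 # e.g. ['加', '速', '62', '秒'] -- the 1:1 char-to-token mapping is broken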
import torch
from transformers import BertTokenizer, BertModel
def fine_grade_tokenize(raw_text, tokenizer):
    """
    For sequence labeling, the BERT tokenizer can shift label alignment,
    so tokenize at the char level instead.
    """
    tokens = []
    for _ch in raw_text:
        if _ch in [' ', '\t', '\n']:
            # Whitespace would be dropped by BERT's tokenizer; keep a placeholder
            # so every character still occupies exactly one position.
            tokens.append('[BLANK]')
        elif not len(tokenizer.tokenize(_ch)):
            # Characters the tokenizer cannot handle (they would vanish) also get
            # a placeholder. For real use, add '[BLANK]'/'[INV]' to the tokenizer
            # as special tokens, otherwise they will not encode as single tokens.
            tokens.append('[INV]')
        else:
            tokens.append(_ch)
    return tokens
sentence = "62号汽车故障报告综合情况:故障现象:加速后,丢开油门,发动机熄火。"
len_sentence = len(sentence)
print("len_sentence = ", len_sentence)
bert_tokenizer = BertTokenizer.from_pretrained('./pretrained_models/bert-base-chinese/vocab.txt')
tokens = fine_grade_tokenize(sentence, bert_tokenizer)
print("len(tokens) = {0}; \ntokens = \n{1}".format(len(tokens), tokens))
encode_dict = bert_tokenizer.encode_plus(text=tokens,
                                         is_split_into_words=True,  # renamed from is_pretokenized in transformers v4
                                         return_token_type_ids=True,
                                         return_attention_mask=True,
                                         max_length=36,
                                         padding='max_length',      # replaces the deprecated pad_to_max_length=True
                                         truncation=True)
print("\nencode_dict = \n", encode_dict)
token_ids = torch.tensor(encode_dict['input_ids'])
attention_masks = torch.tensor(encode_dict['attention_mask'])
token_type_ids = torch.tensor(encode_dict['token_type_ids'])
print("\nlen(token_ids) = {0}; \ntoken_ids = \n{1}".format(len(token_ids), token_ids))
zip_result = list(zip(tokens, token_ids[1:-1].numpy()))  # drop [CLS]/[SEP] so the ids pair 1:1 with the chars
print("\nlen(zip_result) = {0}; \nzip_result = \n{1}".format(len(zip_result), zip_result))
# Add a batch dimension for the BERT forward pass
token_ids = token_ids.unsqueeze(0)
attention_masks = attention_masks.unsqueeze(0)
token_type_ids = token_type_ids.unsqueeze(0)
bert_module = BertModel.from_pretrained("./pretrained_models/bert-base-chinese")
bert_outputs = bert_module(input_ids=token_ids, attention_mask=attention_masks, token_type_ids=token_type_ids)
seq_out = bert_outputs[0]  # last_hidden_state, shape (batch, seq_len, hidden=768)
print("\nEmbedding after BERT encoding:\nseq_out.shape = {0}; \nseq_out = \n{1}".format(seq_out.shape, seq_out))
Output:
len_sentence = 34
len(tokens) = 34;
tokens =
['6', '2', '号', '汽', '车', '故', '障', '报', '告', '综', '合', '情', '况', ':', '故', '障', '现', '象', ':', '加', '速', '后', ',', '丢', '开', '油', '门', ',', '发', '动', '机', '熄', '火', '。']
After encoding with BERT (bert-base-chinese), 101 ([CLS]) and 102 ([SEP]) are added at the start and end, respectively.
encode_dict =
{
'input_ids': [101, 127, 123, 1384, 3749, 6756, 3125, 7397, 2845, 1440, 5341, 1394, 2658, 1105, 131, 3125, 7397, 4385, 6496, 131, 1217, 6862, 1400, 8024, 696, 2458, 3779, 7305, 8024, 1355, 1220, 3322, 4219, 4125, 511, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
}
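That 101 and 102 really are the [CLS]/[SEP] ids can be confirmed directly from the tokenizer's special-token attributes (a quick sanity check):
print(bert_tokenizer.cls_token, bert_tokenizer.cls_token_id)  # [CLS] 101
print(bert_tokenizer.sep_token, bert_tokenizer.sep_token_id)  # [SEP] 102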
len(token_ids) = 36;
token_ids =
tensor([ 101, 127, 123, 1384, 3749, 6756, 3125, 7397, 2845, 1440, 5341, 1394,
2658, 1105, 131, 3125, 7397, 4385, 6496, 131, 1217, 6862, 1400, 8024,
696, 2458, 3779, 7305, 8024, 1355, 1220, 3322, 4219, 4125, 511, 102])
len(zip_result) = 34;
zip_result =
[('6', 127), ('2', 123), ('号', 1384), ('汽', 3749), ('车', 6756), ('故', 3125), ('障', 7397), ('报', 2845), ('告', 1440), ('综', 5341), ('合', 1394), ('情', 2658), ('况', 1105), (':', 131), ('故', 3125), ('障', 7397), ('现', 4385), ('象', 6496), (':', 131), ('加', 1217), ('速', 6862), ('后', 1400), (',', 8024), ('丢', 696), ('开', 2458), ('油', 3779), ('门', 7305), (',', 8024), ('发', 1355), ('动', 1220), ('机', 3322), ('熄', 4219), ('火', 4125), ('。', 511)]
Embedding after BERT encoding:
seq_out.shape = torch.Size([1, 36, 768]);
seq_out =
tensor([[[ 0.4830, 0.4836, 0.0063, ..., 0.8388, -0.1990, 0.2916],
[-0.1549, -0.6636, 0.8651, ..., -0.9683, 0.5224, 0.0259],
[ 0.9820, -0.3003, -0.2826, ..., 0.6285, -0.2102, -0.1150],
...,
[ 0.5009, -1.2509, -1.1049, ..., 1.0608, -0.9074, 0.1253],
[ 0.8523, 0.2491, 0.0017, ..., -0.2093, -0.4663, -0.0143],
[ 0.8245, 0.3761, -0.1383, ..., 0.9094, -0.6616, -0.0506]]],
grad_fn=<NativeLayerNormBackward0>)
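Because the char-level tokens align 1:1 with the input characters, seq_out can feed a per-character classification head directly once the [CLS]/[SEP] positions are stripped. A minimal sketch, where num_labels is a hypothetical label-set size (e.g. BIO tags for NER):
import torch.nn as nn

num_labels = 5                           # hypothetical label set, e.g. BIO tags
classifier = nn.Linear(768, num_labels)  # one logit vector per character
char_out = seq_out[:, 1:-1, :]           # drop [CLS]/[SEP]: (1, 34, 768)
logits = classifier(char_out)            # (1, 34, num_labels)
print(logits.shape)                      # one prediction per original character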