A small experiment with add_tokens and add_special_tokens
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from transformers import BertTokenizer
def show_token_info(input_str, my_tokenizer):
    # print the base vocabulary size and the tokens the string is split into
    print("vocab_size:", my_tokenizer.vocab_size)
    encode_info = my_tokenizer([input_str])
    convert_token_list = my_tokenizer.convert_ids_to_tokens(encode_info['input_ids'][0])
    print("Tokens:", convert_token_list)
print("******Tokenizer******")
input_txt = "甄天真真天真1234"
model_path = "/Users/daliy/PythonCode/pretraining/LanguageModel/chinese-roberta-wwm-ext"
tokenizer_bert = BertTokenizer.from_pretrained(model_path)
show_token_info(input_txt, tokenizer_bert)
print("******add_tokens******")
# add new tokens to the vocabulary
words = ['天真', '甄天真']
tokenizer_bert.add_tokens(words)
show_token_info(input_txt, tokenizer_bert)
print("******add_special_tokens******")
# add special tokens (here: the ten digits)
token_list = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
special_tokens_dict = {'additional_special_tokens': token_list}
tokenizer_bert.add_special_tokens(special_tokens_dict)
show_token_info(input_txt, tokenizer_bert)
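A side note that is easy to miss: the script above only changes the tokenizer. If the expanded tokenizer is then used with a model, the embedding matrix has to grow as well, because the added tokens get ids beyond the original vocabulary. A minimal sketch of that step, assuming the same model_path is loaded with BertModel (this is not part of the experiment above):

from transformers import BertModel

model = BertModel.from_pretrained(model_path)
# len(tokenizer_bert) counts the base vocabulary plus every added token,
# so this grows the embedding table; the new rows are randomly initialized
model.resize_token_embeddings(len(tokenizer_bert))

Without the resize, any input containing the new tokens would index past the end of the embedding table.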
Back to the tokenizer itself. Just look at the output and draw your own conclusions:
******Tokenizer******
vocab_size: 21128
Tokens: ['[CLS]', '甄', '天', '真', '真', '天', '真', '1234', '[SEP]']
******add_tokens******
vocab_size: 21128
Tokens: ['[CLS]', '甄天真', '真', '天真', '1234', '[SEP]']
******add_special_tokens******
vocab_size: 21128
Tokens: ['[CLS]', '甄天真', '真', '天真', '1', '2', '3', '4', '[SEP]']
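Three things stand out in the output. The words passed to add_tokens ('甄天真', '天真') now come out as single tokens, because added tokens are matched as whole strings before the underlying WordPiece tokenization runs. '1234' falls apart into single digits, because each digit was registered as an additional special token and the matcher splits the text around them. And vocab_size never moves: it only reports the size of the base vocabulary, while added tokens are tracked separately. A minimal check of that last point, reusing tokenizer_bert from the script above:

print("base vocab:", tokenizer_bert.vocab_size)   # 21128, never includes added tokens
print("full size:", len(tokenizer_bert))          # base vocabulary plus the added tokens
print(tokenizer_bert.get_added_vocab())           # maps each added token to its new id

So when the real vocabulary size is needed, e.g. for resize_token_embeddings, use len(tokenizer) rather than vocab_size.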