from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

path = "./"

# Byte-level BPE can represent every input byte, so <unk> should never actually
# be produced, but we keep it registered for safety.
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# The ByteLevel decoder needs no configuration; it simply reverses the byte-level mapping.
tokenizer.decoder = decoders.ByteLevel()
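# Quick sanity check (illustrative): the byte-level pre-tokenizer splits on the
# GPT-2 regex and marks a leading space with "Ġ", so the output should look
# something like [('1', (0, 1)), ('Ġ+', (1, 3)), ('Ġ2', (3, 5)), ...].
print(tokenizer.pre_tokenizer.pre_tokenize_str("1 + 2 = 3"))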
trainer = BpeTrainer(
    # Special tokens get the first IDs in list order: <s>=0, <pad>=1, </s>=2, <unk>=3.
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],  # add "<mask>" here if needed
    show_progress=True,
    # Tunable; left at the library defaults here, e.g.:
    # vocab_size=600,
    # min_frequency=10,
)
tokenizer.train(files=[path + "spacemath.txt"], trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))
tokenizer.save(path="tokenizer.json", pretty=True)
from transformers import PreTrainedTokenizerFast

# Load the trained tokenizer as a Hugging Face fast tokenizer
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
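# Worth adding for downstream TrOCR training (not in the original snippet):
# PreTrainedTokenizerFast does not pick up special-token roles from
# tokenizer.json by itself, so register them explicitly; otherwise
# tokenizer.pad_token and friends are None.
tokenizer.add_special_tokens(
    {"bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "unk_token": "<unk>"}
)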
# Check the special-token IDs
print("ID for '<s>':", tokenizer.convert_tokens_to_ids("<s>"))
print("ID for '</s>':", tokenizer.convert_tokens_to_ids("</s>"))
print(tokenizer.encode("asd123"))
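# Round-trip check (illustrative): the ByteLevel decoder should restore the
# original text from the IDs, modulo any special tokens.
ids = tokenizer.encode("asd123")
print(tokenizer.decode(ids, skip_special_tokens=True))  # expect "asd123"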
Training the tokenizer for TrOCR.