# Without further ado, here's the code:
import sentencepiece as spm
def export_vocab_to_file(model_path: str, output_file: str) -> None:
    """Export a SentencePiece model's vocabulary to a tab-separated text file.

    Each output line has the form ``<id>\t<piece>\t<score>``, one line per
    vocabulary entry, in id order.

    Args:
        model_path: Path to a trained SentencePiece model file
            (e.g. ``tokenizer.model``).
        output_file: Path of the UTF-8 text file to write the vocabulary to
            (overwritten if it already exists).
    """
    # Load the SentencePiece model.
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)

    # Walk every id in the vocabulary and write one TSV line per piece.
    with open(output_file, 'w', encoding='utf-8') as f:
        for piece_id in range(sp.get_piece_size()):
            piece = sp.id_to_piece(piece_id)
            score = sp.get_score(piece_id)
            f.write(f'{piece_id}\t{piece}\t{score}\n')
# Usage example: assumes the model file is named 'tokenizer.model'.
# Guarded so importing this module does not trigger the export as a side effect.
if __name__ == '__main__':
    export_vocab_to_file('tokenizer.model', 'vocab.txt')