# Without further ado, here's the code:
import sentencepiece as spm
def export_vocab_to_file(model_path: str, output_file: str) -> None:
    """Export a SentencePiece model's vocabulary to a tab-separated text file.

    Each output line has the form ``<id>\t<piece>\t<score>``, one line per
    vocabulary entry, in id order.

    Args:
        model_path: Path to a trained SentencePiece model file
            (e.g. ``tokenizer.model``).
        output_file: Path of the UTF-8 text file to write the vocabulary to
            (overwritten if it already exists).
    """
    # Load the SentencePiece model.
    sp = spm.SentencePieceProcessor()
    sp.load(model_path)

    # Walk every id in the vocabulary and write one TSV line per piece.
    with open(output_file, 'w', encoding='utf-8') as f:
        for piece_id in range(sp.get_piece_size()):
            piece = sp.id_to_piece(piece_id)
            score = sp.get_score(piece_id)
            f.write(f'{piece_id}\t{piece}\t{score}\n')
# Usage example: assumes the model file is named 'tokenizer.model'.
# Guarded so importing this module does not trigger the export as a side effect.
if __name__ == '__main__':
    export_vocab_to_file('tokenizer.model', 'vocab.txt')