class WordpieceTokenizer(object):
  """Runs WordPiece tokenization."""

  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
    self.vocab = vocab
    self.unk_token = unk_token
    self.max_input_chars_per_word = max_input_chars_per_word

  def tokenize(self, text):
    """Tokenizes a piece of text into its word pieces.

    This uses a greedy longest-match-first algorithm to perform tokenization
    using the given vocabulary.

    For example:
      input = "unaffable"
      output = ["un", "##aff", "##able"]

    Args:
      text: A single token or whitespace-separated tokens. This should have
        already been passed through `BasicTokenizer`.

    Returns:
      A list of wordpiece tokens.
    """
    # `convert_to_unicode` and `whitespace_tokenize` are helper functions
    # defined elsewhere in BERT's tokenization.py.
    text = convert_to_unicode(text)

    output_tokens = []
    for token in whitespace_tokenize(text):
      chars = list(token)
      # Overly long tokens are mapped straight to the unknown token.
      if len(chars) > self.max_input_chars_per_word:
        output_tokens.append(self.unk_token)
        continue

      is_bad = False
      start = 0
      sub_tokens = []
      while start < len(chars):
        end = len(chars)
        cur_substr = None
        # Greedy longest-match-first: shrink the candidate substring from the
        # right until it is found in the vocabulary.
        while start < end:
          substr = "".join(chars[start:end])
          if start > 0:
            substr = "##" + substr
          if substr in self.vocab:
            cur_substr = substr
            break
          end -= 1
        # No piece matched at this position, so the whole word becomes UNK.
        if cur_substr is None:
          is_bad = True
          break
        sub_tokens.append(cur_substr)
        start = end

      if is_bad:
        output_tokens.append(self.unk_token)
      else:
        output_tokens.extend(sub_tokens)
    return output_tokens
bert-tokenization-WordpieceTokenizer
This article takes a close look at BERT's tokenization, explaining how WordpieceTokenizer works and the role it plays in BERT pre-training. A worked example shows how WordpieceTokenizer splits words into word pieces, how out-of-vocabulary words fall back to the unknown token ([UNK]), and how this behaviour relates to the vocabulary-building strategy.
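As a quick usage sketch (not part of the original article's code): the snippet below assumes the tokenization.py module from the google-research/bert repository is importable, and uses a tiny made-up vocabulary that is just large enough to reproduce the docstring example.

import tokenization  # tokenization.py from the google-research/bert repository

# Toy vocabulary for illustration only; real BERT vocabularies are loaded
# from vocab.txt (roughly 30k entries) via tokenization.load_vocab().
vocab = {"[UNK]", "un", "##aff", "##able"}

wordpiece = tokenization.WordpieceTokenizer(vocab=vocab)

print(wordpiece.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wordpiece.tokenize("xyzzy"))      # ['[UNK]'] - nothing matches, whole word collapses to UNK

Note that a word either splits entirely into known pieces or is emitted as a single [UNK]; partial matches are discarded, which is why the greedy loop tracks is_bad rather than keeping whatever pieces it found so far.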