BERT's WordpieceTokenizer splits a word into character-level pieces. My understanding: it decomposes a word into the smaller pieces that exist in the vocab. The matching is greedy and works from the end of the word backwards: the candidate span is shrunk one character at a time from the right until it matches a vocab entry; the matched piece is kept, and the search then continues on the remaining characters. A simplified, self-contained version of the logic:

def tokenize(text):
    unk_token = '<unk>'
    vocab = ["un", "##aff", "##able"]
    output_tokens = []
    for token in [text]:
        chars = list(token)
        n_chars = len(chars)
        if n_chars > 300:
            output_tokens.append(unk_token)
            continue
        is_bad = False
        start = 0
        sub_tokens = []
        while start < n_chars:
            end = n_chars
            cur_substr = None
            while start < end:
                substr = "".join(chars[start:end])
                if start > 0:
                    # Now it's a subword
                    substr = "##" + substr
                if substr in vocab:
                    cur_substr = substr
                    break
                end -= 1
            if cur_substr is None:
                is_bad = True
                break
            sub_tokens.append(cur_substr)
            start = end
        if is_bad:
            output_tokens.append(unk_token)
        else:
            output_tokens.extend(sub_tokens)
    return output_tokens

print(tokenize('unaffable'))
# Output: ["un", "##aff", "##able"]
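If no prefix of the remaining characters matches the vocabulary, the whole word falls back to the unknown token. A quick check with the same demo function (the input 'xyz' is just an arbitrary out-of-vocabulary word):

print(tokenize('xyz'))
# Output: ['<unk>']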
The source code below is from gluonnlp/data/fast_bert_tokenizer.pyx:
from typing import List

# Note: `whitespace_tokenize` is provided by the surrounding module (not shown in this excerpt).

cdef class WordpieceTokenizer:
    """Runs WordPiece tokenization."""
    cdef public vocab
    cdef public str unk_token
    cdef public long max_input_chars_per_word

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text) -> List[str]:
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        :param text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.
        :returns: A list of wordpiece tokens.
        """
        cdef long max_input_chars_per_word = self.max_input_chars_per_word
        cdef:
            bint is_bad
            long start
            long end
            Py_ssize_t n_chars
        output_tokens = []
        for token in whitespace_tokenize(text):
            chars = list(token)
            n_chars = len(chars)
            if n_chars > max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue
            is_bad = False
            start = 0
            sub_tokens = []
            while start < n_chars:
                end = n_chars
                cur_substr = None
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        # Now it's a subword
                        substr = "##" + substr
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                if cur_substr is None:
                    is_bad = True
                    break
                sub_tokens.append(cur_substr)
                start = end
            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        return output_tokens
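The tokenize method above calls a whitespace_tokenize helper that is not included in this excerpt. Below is a minimal sketch of what it does, assuming it follows the original BERT reference implementation (strip the text, then split on whitespace), plus how the resulting word-level tokens feed into the per-word WordPiece loop, here driven by the demo tokenize() from the top of this post:

from typing import List

def whitespace_tokenize(text: str) -> List[str]:
    # Strip the text and split it on whitespace; empty input yields an empty list.
    text = text.strip()
    return text.split() if text else []

words = whitespace_tokenize("  unaffable unaffable  ")
print(words)  # ['unaffable', 'unaffable']
print([piece for w in words for piece in tokenize(w)])
# ['un', '##aff', '##able', 'un', '##aff', '##able']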