GPT2 - tokenizer 内部实现
内部类的继承
from transformers import GPT2Tokenizer
== transformers/tokenization_utils_fast.py
== transformers/models/gpt2/tokenization_gpt2.py
== transformers/tokenization_utils_base.py
== transformers/tokenization_utils.py
内部实现步骤
以输入为一个简单的文本:
Human: I am trying to write a fairy tale. What is the most popular plot?\n
Assistant: The most popular plot might be “a princess goes to a faraway land.\n
Human: Do you like the plot of Hansel and Gretel?\n
Assistant: Not particularly.<|endoftext|>
执行函数
chosen_token = tokenizer(chosen,
max_length=max_length,#512
padding="max_length",
truncation=True,
return_tensors="pt")
跳转到函数
ransformers/tokenization_utils_base.py
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair_target: Optional[
Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
sequences.
Args:
text (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
"""
# To avoid duplicating
all_kwargs = dict(
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
)
print("transformers/tokenization_utils_base.py")
all_kwargs.update(kwargs)
if text is None and text_target is None:
raise ValueError("You need to specify either `text` or `text_target`.")
if text is not None:
# The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
# input mode in this case.
print("self._in_target_context_manager: ", self._in_target_context_manager)
# _in_target_context_manager: False
if not self._in_target_context_manager:
self._switch_to_input_mode()
#
encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
print("line 2509 codings: ", encodings)
exit()
if text_target is not None:
self._switch_to_target_mode()
target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
# Leave back tokenizer in input mode
self._switch_to_input_mode()
if text_target is None:
return encodings
elif text is None:
return target_encodings
else:
encodings["labels"] = target_encodings["input_ids"]
return encodings
首先,先执行**call**函数判断text 内容,如果不为空,将上下文管理器将以普通文本而不是text_target的形式发送输入,但是在这种情况下我们不应该改变输入模式。接下来调用switch_to_input_mode :将标记器置于输入模式的私有方法(当它具有不同的输入/输出模式时).
<不重要>
这个位置的上下文管理器中的内容不清楚
接下来转id 的过程。调用 _call_one,接下来调用self.encode_plus, 再调用_encode_plus
def _call_one(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
# Input type checking for clearer error
def _is_valid_text_input(t):
if isinstance(t, str):
# Strings are fine
return True
elif isinstance(t, (list, tuple)):
# List are fine as long as they are...
if len(t) == 0:
# ... empty
return True
elif isinstance(t[0], str):
# ... list of strings
return True
elif isinstance(t[0], (list, tuple)):
# ... list with an empty list or with a list of strings
return len(t[0]) == 0 or isinstance(t[0][0], str)
else:
return False
else:
return False
if not _is_valid_text_input(text):
raise ValueError(
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
print("2556 _is_valid_text_input(text)", _is_valid_text_input(text))
if text_pair is not None and not _is_valid_text_input(text_pair):
raise ValueError(
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
"or `List[List[str]]` (batch of pretokenized examples)."
)
# False
if is_split_into_words:
is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
else:
is_batched = isinstance(text, (list, tuple))
print("2568 is_split_into_words(text)", is_split_into_words)
print("2569 is_batched: ", is_batched)
print("2597 text: ", text)
if is_batched:
if isinstance(text_pair, str):
raise TypeError(
"when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
" `text`."
)
if text_pair is not None and len(text) != len(text_pair):
raise ValueError(
f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
f" {len(text_pair)}."
)
batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
return self.batch_encode_plus(
batch_text_or_text_pairs=batch_text_or_text_pairs,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
else:
return self.encode_plus(
text=text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
"""
Tokenize and prepare for the model a sequence or a pair of sequences.
<Tip warning={true}>
This method is deprecated, `__call__` should be used instead.
</Tip>
Args:
text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
`tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
text_pair (`str`, `List[str]` or `List[int]`, *optional*):
Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
method).
"""
print(" === 2665 encode_plus == ")
# Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
padding=padding,
truncation=truncation,
max_length=max_length,
pad_to_multiple_of=pad_to_multiple_of,
verbose=verbose,
**kwargs,
)
print(" == 2676 padding_strategy == ", padding_strategy)
print(" == 2677 truncation_strategy == ", truncation_strategy)
print(" == 2678 max_length == ", max_length)
print(" == 2679 kwargs == ", kwargs)
# print(" === 2708 === ", text)
# exit()
return self._encode_plus(
text=text,
text_pair=text_pair,
add_special_tokens=add_special_tokens,
padding_strategy=padding_strategy,
truncation_strategy=truncation_strategy,
max_length=max_length,
stride=stride,
is_split_into_words=is_split_into_words,
pad_to_multiple_of=pad_to_multiple_of,
return_tensors=return_tensors,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
**kwargs,
)
def _encode_plus(
self,
text: Union[TextInput, PreTokenizedInput, EncodedInput],
text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
add_special_tokens: bool = True,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
max_length: Optional[int] = None,
stride: int = 0,
is_split_into_words: bool = False,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
**kwargs
) -> BatchEncoding:
print("==== 2738 _encode_plus =====")
# exit()
raise NotImplementedError
之后最重要的就是_encode_plus的执行顺序,接下来执行transformers/tokenization_utils.py中_encode_plus。 在该模型的内部:首先执行get_input_ids(text) 获取ids
def get_input_ids(text):
if isinstance(text, str):
tokens = self.tokenize(text, **kwargs)
return self.convert_tokens_to_ids(tokens)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
if is_split_into_words:
tokens = list(
itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
)
return self.convert_tokens_to_ids(tokens)
else:
return self.convert_tokens_to_ids(text)
elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
return text
else:
if is_split_into_words:
raise ValueError(
f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
" `is_split_into_words=True`."
)
else:
raise ValueError(
f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
" integers."
)
在该函数中调用tokenize, 完成从utf-8 转为 bytes 的过程
text: Human: I am trying to write a fairy tale. What is the most popular plot?
Assistant: The most popular plot might be “a princess goes to a faraway land.
Human: Do you like the plot of Hansel and Gretel?
Assistant: Not particularly.<|endoftext|>
tokenize: [‘Human’, ‘:’, ‘ĠI’, ‘Ġam’, ‘Ġtrying’, ‘Ġto’, ‘Ġwrite’, ‘Ġa’, ‘Ġfairy’, ‘Ġtale’, ‘.’, ‘ĠWhat’, ‘Ġis’, ‘Ġthe’, ‘Ġmost’, ‘Ġpopular’, ‘Ġplot’, ‘?’, ‘Ċ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘ĠAssistant’, ‘:’, ‘ĠThe’, ‘Ġmost’, ‘Ġpopular’, ‘Ġplot’, ‘Ġmight’, ‘Ġbe’, ‘ĠâĢ’, ‘ľ’, ‘a’, ‘Ġprincess’, ‘Ġgoes’, ‘Ġto’, ‘Ġa’, ‘Ġfar’, ‘away’, ‘Ġland’, ‘.’, ‘Ċ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘ĠHuman’, ‘:’, ‘ĠDo’, ‘Ġyou’, ‘Ġlike’, ‘Ġthe’, ‘Ġplot’, ‘Ġof’, ‘ĠHan’, ‘sel’, ‘Ġand’, ‘ĠGret’, ‘e
l’, ‘?’, ‘Ċ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘Ġ’, ‘ĠAssistant’, ‘:’, ‘ĠNot’, ‘Ġparticularly’, ‘.’, ‘<|endoftext|>’]
代码实现:
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[str]`: The list of tokens.
"""
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
all_special_tokens_extended = dict(
(str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
)
print("utils.py tokenizer", text)
"""函数重构:调用tokenization_gpt2.py: prepare_for_tokenization
输入:
Human: I am trying to write a fairy tale. What is the most popular plot?
Assistant: The most popular plot might be “a princess goes to a faraway land.
Human: Do you like the plot of Hansel and Gretel?
Assistant: Not particularly.<|endoftext|>
输出:Human: I am trying to write a fairy tale. What is the most popular plot?
Assistant: The most popular plot might be “a princess goes to a faraway land.
Human: Do you like the plot of Hansel and Gretel?
Assistant: Not particularly.<|endoftext|>
"""
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
if kwargs:
logger.warning(f"Keyword arguments {kwargs} not recognized.")
# TODO: should this be in the base class?
# False
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
# 不进行切割<|endoftext|>
no_split_token = set(self.unique_no_split_tokens)
# diy 设计分割符号按照自己的设计进行切割
# 见3-2
tokens = self.tokens_trie.split(text)
# ["This is something", "<special_token_1>", " else"]
for i, token in enumerate(tokens):
if token in no_split_token:
tok_extended = all_special_tokens_extended.get(token, None)
left = tokens[i - 1] if i > 0 else None
right = tokens[i + 1] if i < len(tokens) - 1 else None
if isinstance(tok_extended, AddedToken):
if tok_extended.rstrip and right:
# A bit counter-intuitive but we strip the left of the string
# since tok_extended.rstrip means the special token is eating all white spaces on its right
tokens[i + 1] = right.lstrip()
# Strip white spaces on the left
if tok_extended.lstrip and left:
tokens[i - 1] = left.rstrip() # Opposite here
else:
# We strip left and right by default
if right:
tokens[i + 1] = right.lstrip()
if left:
tokens[i - 1] = left.rstrip()
# ["This is something", "<special_token_1>", "else"]
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
# ["This", " is", " something", "<special_token_1>", "else"]
print("tokenized_text: ", tokenized_text)
return tokenized_text
Tree 函数实现内部,转换成为树结构的数据;按照指定内容进行分割;例如指定不可分割内容: ||,
输入文本 :"Human: I am trying to write a fairy tale. <|endoftext|> What is the most popular plot? <|endoftext|> "
输出文本:['Human: I am trying to write a fairy tale. ‘, ‘<|endoftext|>’, ’ What is the most popular plot?’, ‘<|endoftext|>’]
分割之后的文本对空格进行处理,删除前后的空格或者不删除前后的空格, 在这里是不删除前后的空格,空格作为token的一部分
输出文本:['Human: I am trying to write a fairy tale. ', ‘<|endoftext|>’, ’ What is the most popular plot? ', ‘<|endoftext|>’, ’ ']
class Trie:
"""
Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass
Loose reference https://en.wikipedia.org/wiki/Trie
"""
def __init__(self):
self.data = {}
def add(self, word: str):
"""
Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation.
The special key `""` is used to represent termination.
This function is idempotent, adding twice the same word will leave the trie unchanged
Example:
```python
>>> trie = Trie()
>>> trie.add("Hello 友達")
>>> trie.data
{"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
>>> trie.add("Hello")
>>> trie.data
{"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
```
"""
if not word:
# Prevent empty string
return
ref = self.data
for char in word:
ref[char] = char in ref and ref[char] or {}
ref = ref[char]
ref[""] = 1
#输入
def split(self, text: str) -> List[str]:
"""
Will look for the words added to the trie within `text`. Output is the original string splitted along the
boundaries of the words found.
This trie will match the longest possible word first !
Example:
```python
>>> trie = Trie()
>>> trie.split("[CLS] This is a extra_id_100")
["[CLS] This is a extra_id_100"]
>>> trie.add("[CLS]")
>>> trie.add("extra_id_1")
>>> trie.add("extra_id_100")
>>> trie.split("[CLS] This is a extra_id_100")
["[CLS]", " This is a ", "extra_id_100"]
```
"""
# indexes are counted left of the chars index.
# "hello", index 0, is left of h, index 1 is between h and e.
# index 5 is right of the "o".
# States are going to capture every possible start (indexes as above)
# as keys, and have as values, a pointer to the position in the trie
# where we're at. This is a partial match for now.
# This enables to keep track of multiple matches while we're iterating
# the string
# If the trie contains, "blowing", and "lower" and we encounter the
# string "blower", we need to split into ["b", "lower"].
# This is where we need to keep track of multiple possible starts.
states = OrderedDict()
# This will contain every indices where we need
# to cut.
# We force to cut at offset 0 and len(text) (added later)
# 所有cut 的index
offsets = [0]
# This is used by the lookahead which needs to skip over
# some text where the full match exceeded the place in the initial
# for loop
# 需要跳过的内容
skip = 0
# Main loop, Giving this algorithm O(n) complexity
for current, current_char in enumerate(text):
if skip and current < skip:
# Prevents the lookahead for matching twice
# like extra_id_100 and id_100
continue
# This will track every state
# that stop matching, we need to stop tracking them.
# If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then
# fail on "b", we need to remove 0 from the valid states.
to_remove = set()
# Whenever we found a match, we need to drop everything
# this is a greedy algorithm, it will match on the first found token
reset = False
# In this case, we already have partial matches (But unfinished)
for start, trie_pointer in states.items():
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
# Lookahead to match longest first
# Important in case of extra_id_1 vs extra_id_100
# Here we are also actively looking for other earlier partial
# matches
# "[CLS]", "L", we need to match CLS even if L is special
for lookstart, looktrie_pointer in states.items():
if lookstart > start:
# This partial match is later, we can stop looking
break
elif lookstart < start:
# This partial match is earlier, the trie pointer
# was already updated, so index is + 1
lookahead_index = current + 1
end = current + 1
else:
# Here lookstart == start and
# looktrie_pointer == trie_pointer
# It wasn't updated yet so indices are current ones
lookahead_index = current
end = current
next_char = text[lookahead_index] if lookahead_index < len(text) else None
if "" in looktrie_pointer:
start = lookstart
end = lookahead_index
skip = lookahead_index
while next_char in looktrie_pointer:
looktrie_pointer = looktrie_pointer[next_char]
lookahead_index += 1
if "" in looktrie_pointer:
start = lookstart
end = lookahead_index
skip = lookahead_index
if lookahead_index == len(text):
# End of string
break
next_char = text[lookahead_index]
# End lookahead
# Storing and resetting
offsets.append(start)
offsets.append(end)
reset = True
break
elif current_char in trie_pointer:
# The current character being looked at has a match within the trie
# update the pointer (it will be stored back into states later).
trie_pointer = trie_pointer[current_char]
# Storing back the new pointer into the states.
# Partial matches got longer by one.
states[start] = trie_pointer
else:
# The new character has not match in the trie, we need
# to stop keeping track of this partial match.
# We can't do it directly within the loop because of how
# python iteration works
to_remove.add(start)
# Either clearing the full start (we found a real match)
# Or clearing only the partial matches that didn't work.
if reset:
states = {}
else:
for start in to_remove:
del states[start]
# If this character is a starting character within the trie
# start keeping track of this partial match.
if current >= skip and current_char in self.data:
states[current] = self.data[current_char]
# We have a cut at the end with states.
for start, trie_pointer in states.items():
if "" in trie_pointer:
# This is a final match, we need to reset and
# store the results in `offsets`.
end = len(text)
offsets.append(start)
offsets.append(end)
# Longest cut is always the one with lower start so the first
# item so we need to break.
break
print("offeset: ", offsets)
print("text: ", text)
return self.cut_text(text, offsets)
def cut_text(self, text, offsets):
# We have all the offsets now, we just need to do the actual splitting.
# We need to eventually add the first part of the string and the eventual
# last part.
offsets.append(len(text))
tokens = []
start = 0
for end in offsets:
if start > end:
logger.error(
"There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it"
" anyway."
)
continue
elif start == end:
# This might happen if there's a match at index 0
# we're also preventing zero-width cuts in case of two
# consecutive matches
continue
tokens.append(text[start:end])
start = end
return tokens
接下来就是token转byte:
for i, token in enumerate(tokens):
if token in no_split_token:
tok_extended = all_special_tokens_extended.get(token, None)
left = tokens[i - 1] if i > 0 else None
right = tokens[i + 1] if i < len(tokens) - 1 else None
if isinstance(tok_extended, AddedToken):
if tok_extended.rstrip and right:
# A bit counter-intuitive but we strip the left of the string
# since tok_extended.rstrip means the special token is eating all white spaces on its right
tokens[i + 1] = right.lstrip()
# Strip white spaces on the left
if tok_extended.lstrip and left:
tokens[i - 1] = left.rstrip() # Opposite here
else:
# We strip left and right by default
if right:
tokens[i + 1] = right.lstrip()
if left:
tokens[i - 1] = left.rstrip()
在token转byte的过程, no_split_token 原样输出,其余token 进行转换BYTE转换。
输入文本: ['Human: I am trying to write a fairy tale. ', ‘<|endoftext|>’, ’ What is the most popular plot? ', ‘<|endoftext|>’, ’ ']
输出内容: [‘Human’, ‘:’, ‘ĠI’, ‘Ġam’, ‘Ġtrying’, ‘Ġto’, ‘Ġwrite’, ‘Ġa’, ‘Ġfairy’, ‘Ġtale’, ‘.’, ‘Ġ’, ‘Ġ’, ‘<|endoftext|>’, ‘Ġ’, ‘ĠWhat’, ‘Ġis’, ‘Ġthe’, ‘Ġmost’, ‘Ġpopular’, ‘Ġplot’, ‘?’, ‘Ġ’, ‘Ġ’, ‘<|endoftext|>’, ‘Ġ’, ‘Ġ’]
转BYTE的代码transformers/models/gpt2/tokenization_gpt2.py 中 line 297 _tokenize实现:
def _tokenize(self, text):
"""Tokenize a string."""
print("gpt2 _tokenizer text: ", text)
bpe_tokens = []
for token in re.findall(self.pat, text):
# 将token中的内容编码成为utf-8, 转换为UTF-8之后再进行bytes编码
token = "".join(
self.byte_encoder[b] for b in token.encode("utf-8")
) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
# exit()
return bpe_tokens
在上面的处理过程中:
输入token: Human
token 转成utf-8编码: b’Human’
utf-8编码转bytes 编码:
72 == H
117 == u
109 == m
97 == a
110 == n
将所有字节映射到unicode字符串,避免BPE的控制标记(在本例中为空格)
Human
最后在经过bpe 编码输出结果: [‘Human’, ‘:’, ‘ĠI’, ‘Ġam’, ‘Ġtrying’, ‘Ġto’, ‘Ġwrite’, ‘Ġa’, ‘Ġfairy’, ‘Ġtale’, ‘.’, ‘Ġ’, ‘Ġ’]
其中,byte_encoder 是utf-8与byte的映射表
self.byte_encoder: {33: '!', 34: '"', 35: '#', 36: '$', 37: '%', 38: '&', 39: "'", 40: '(', 41: ')', 42: '*', 43: '+', 44: ',', 45: '-', 46: '.', 47: '/', 48: '0', 49: '1', 50: '2', 51: '3', 52: '4', 53: '5', 54: '6', 55: '7', 56: '8', 57: '9', 58: ':', 59: ';', 60: '<', 61: '=', 62: '>', 63: '?', 64: '@', 65: 'A', 66: 'B', 67: 'C', 68: 'D', 69: 'E', 70: 'F', 71: 'G', 72: 'H', 73: 'I', 74: 'J', 75: 'K', 76: 'L', 77: 'M', 78: 'N', 79: 'O', 80: 'P', 81: 'Q', 82: 'R', 83: 'S', 84: 'T', 85: 'U', 86: 'V', 87: 'W', 88: 'X', 89: 'Y', 90: 'Z', 91: '[', 92: '\\', 93: ']', 94: '^', 95: '_', 96: '`', 97: 'a', 98: 'b', 99: 'c', 100: 'd', 101: 'e', 102: 'f', 103: 'g', 104: 'h', 105: 'i', 106: 'j', 107: 'k', 108: 'l', 109: 'm', 110: 'n', 111: 'o', 112: 'p', 113: 'q', 114: 'r', 115: 's', 116: 't', 117: 'u', 118: 'v', 119: 'w', 120: 'x', 121: 'y', 122: 'z', 123: '{', 124: '|', 125: '}', 126: '~', 161: '¡', 162: '¢', 163: '£', 164: '¤', 165: '¥', 166: '¦', 167: '§', 168: '¨', 169: '©', 170: 'ª', 171: '«', 172: '¬', 174: '®', 175: '¯', 176: '°', 177: '±', 178: '²', 179: '³', 180: '´', 181: 'µ', 182: '¶', 183: '·', 184: '¸', 185: '¹', 186: 'º', 187: '»', 188: '¼', 189: '½', 190: '¾', 191: '¿', 192: 'À', 193: 'Á', 194: 'Â', 195: 'Ã', 196: 'Ä', 197: 'Å', 198: 'Æ', 199: 'Ç', 200: 'È', 201: 'É', 202: 'Ê', 203: 'Ë', 204: 'Ì', 205: 'Í', 206: 'Î', 207: 'Ï', 208: 'Ð', 209: 'Ñ', 210: 'Ò', 211: 'Ó', 212: 'Ô', 213: 'Õ', 214: 'Ö', 215: '×', 216: 'Ø', 217: 'Ù', 218: 'Ú', 219: 'Û', 220: 'Ü', 221: 'Ý', 222: 'Þ', 223: 'ß', 224: 'à', 225: 'á', 226: 'â', 227: 'ã', 228: 'ä', 229: 'å', 230: 'æ', 231: 'ç', 232: 'è', 233: 'é', 234: 'ê', 235: 'ë', 236: 'ì', 237: 'í', 238: 'î', 239: 'ï', 240: 'ð', 241: 'ñ', 242: 'ò', 243: 'ó', 244: 'ô', 245: 'õ', 246: 'ö', 247: '÷', 248: 'ø', 249: 'ù', 250: 'ú', 251: 'û', 252: 'ü', 253: 'ý', 254: 'þ', 255: 'ÿ', 0: 'Ā', 1: 'ā', 2: 'Ă', 3: 'ă', 4: 'Ą', 5: 'ą', 6: 'Ć', 7: 'ć', 8: 'Ĉ', 9: 'ĉ', 10: 'Ċ', 11: 'ċ', 12: 'Č', 13: 'č', 14: 'Ď', 15: 'ď', 16: 'Đ', 17: 'đ', 18: 'Ē', 19: 'ē', 20: 'Ĕ', 21: 'ĕ', 22: 'Ė', 23: 'ė', 24: 'Ę', 25: 'ę', 26: 'Ě', 27: 'ě', 28: 'Ĝ', 29: 'ĝ', 30: 'Ğ', 31: 'ğ', 32: 'Ġ', 127: 'ġ', 128: 'Ģ', 129: 'ģ', 130: 'Ĥ', 131: 'ĥ', 132: 'Ħ', 133: 'ħ', 134: 'Ĩ', 135: 'ĩ', 136: 'Ī', 137: 'ī', 138: 'Ĭ', 139: 'ĭ', 140: 'Į', 141: 'į', 142: 'İ', 143: 'ı', 144: 'IJ', 145: 'ij', 146: 'Ĵ', 147: 'ĵ', 148: 'Ķ', 149: 'ķ', 150: 'ĸ', 151: 'Ĺ', 152: 'ĺ', 153: 'Ļ', 154: 'ļ', 155: 'Ľ', 156: 'ľ', 157: 'Ŀ', 158: 'ŀ', 159: 'Ł', 160: 'ł', 173: 'Ń'}
最后将bep 编码之后的数据转token: transformers/tokenization_utils.py line 581, convert_tokens_to_ids
def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
"""
Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
for token in tokens:
ids.append(self._convert_token_to_id_with_added_voc(token))
return ids
内部执行的具体操作:
convert_tokens_to_ids tokens: [‘Human’, ‘:’, ‘ĠI’, ‘Ġam’, ‘Ġtrying’, ‘Ġto’, ‘Ġwrite’, ‘Ġa’, ‘Ġfairy’, ‘Ġtale’, ‘.’, ‘Ġ’, ‘Ġ’, ‘<|endoftext|>’, ‘Ġ’, ‘ĠWhat’, ‘Ġis’, ‘Ġthe’, ‘Ġmost’, ‘Ġpopular’, ‘Ġplot’, ‘?’, ‘Ġ’, ‘Ġ’, ‘<|endoftext|>’, ‘Ġ’, ‘Ġ’]
Human == <|endoftext|> == 50256 == 20490
: == <|endoftext|> == 50256 == 25
ĠI == <|endoftext|> == 50256 == 314
Ġam == <|endoftext|> == 50256 == 716
Ġtrying == <|endoftext|> == 50256 == 2111
Ġto == <|endoftext|> == 50256 == 284
Ġwrite == <|endoftext|> == 50256 == 3551
Ġa == <|endoftext|> == 50256 == 257
Ġfairy == <|endoftext|> == 50256 == 25607
Ġtale == <|endoftext|> == 50256 == 12838
. == <|endoftext|> == 50256 == 13
Ġ == <|endoftext|> == 50256 == 220
Ġ == <|endoftext|> == 50256 == 220
<|endoftext|> == <|endoftext|> == 50256 == 50256
Ġ == <|endoftext|> == 50256 == 220
ĠWhat == <|endoftext|> == 50256 == 1867
Ġis == <|endoftext|> == 50256 == 318
Ġthe == <|endoftext|> == 50256 == 262
Ġmost == <|endoftext|> == 50256 == 749
Ġpopular == <|endoftext|> == 50256 == 2968
Ġplot == <|endoftext|> == 50256 == 7110
? == <|endoftext|> == 50256 == 30
Ġ == <|endoftext|> == 50256 == 220
Ġ == <|endoftext|> == 50256 == 220
<|endoftext|> == <|endoftext|> == 50256 == 50256
Ġ == <|endoftext|> == 50256 == 220
Ġ == <|endoftext|> == 50256 == 220
最后得到ids
[20490, 25, 314, 716, 2111, 284, 3551, 257, 25607, 12838, 13, 220, 220, 50256, 220, 1867, 318, 262, 749, 2968, 7110, 30, 220, 220, 50256, 220, 220]
prepare_for_model 函数 transformers/tokenization_utils_base.py line 3099中实现