Transformers 源码解析(三十二)

.\models\bert\tokenization_bert.py

# 指定编码为 UTF-8

# 版权声明,版权归Google AI Language Team和HuggingFace Inc.团队所有,使用Apache License 2.0授权
# 只有在遵守许可证的情况下才能使用此文件
# 可以在以下网址获取许可证的副本:http://www.apache.org/licenses/LICENSE-2.0

# 如果适用法律要求或书面同意,软件将按“原样”分发,不提供任何明示或暗示的保证或条件
# 请参阅许可证以了解详细信息

"""Bert的标记化类。"""

# 导入所需模块
import collections  # 导入collections模块
import os  # 导入os模块
import unicodedata  # 导入unicodedata模块
from typing import List, Optional, Tuple  # 导入类型提示所需的模块

# 从tokenization_utils.py中导入预训练的标记器和一些辅助函数
from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace

# 导入日志记录功能
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义词汇文件的名称,这里是一个包含词汇的文本文件
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}

# Mapping from pretrained checkpoint names to the URLs of their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
        "vocab_file": {
            "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
            "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
            "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
            "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-multilingual-uncased": (
                "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
            "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
            "google-bert/bert-large-uncased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-cased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-cased-finetuned-mrpc": (
                "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-german-dbmdz-uncased": (
                "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
            ),
            "TurkuNLP/bert-base-finnish-cased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt"
            ),
            "TurkuNLP/bert-base-finnish-uncased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt"
            ),
            "wietsedv/bert-base-dutch-cased": (
                "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt"
            ),
        }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "google-bert/bert-base-uncased": 512,  # 设置预训练模型的位置嵌入尺寸
    "google-bert/bert-large-uncased": 512,
    "google-bert/bert-base-cased": 512,
    "google-bert/bert-large-cased": 512,
    "google-bert/bert-base-multilingual-uncased": 512,
    "google-bert/bert-base-multilingual-cased": 512,
    "google-bert/bert-base-chinese": 512,
    "google-bert/bert-base-german-cased": 512,
    "google-bert/bert-large-uncased-whole-word-masking": 512,
    "google-bert/bert-large-cased-whole-word-masking": 512,
    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": 512,
    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": 512,
    "google-bert/bert-base-cased-finetuned-mrpc": 512,
    "google-bert/bert-base-german-dbmdz-cased": 512,
    "google-bert/bert-base-german-dbmdz-uncased": 512,
    "TurkuNLP/bert-base-finnish-cased-v1": 512,
    "TurkuNLP/bert-base-finnish-uncased-v1": 512,
    "wietsedv/bert-base-dutch-cased": 512,
}

PRETRAINED_INIT_CONFIGURATION = {
    "google-bert/bert-base-uncased": {"do_lower_case": True},  # 配置预训练模型初始化参数
    "google-bert/bert-large-uncased": {"do_lower_case": True},
    "google-bert/bert-base-cased": {"do_lower_case": False},
    "google-bert/bert-large-cased": {"do_lower_case": False},
    "google-bert/bert-base-multilingual-uncased": {"do_lower_case": True},
    "google-bert/bert-base-multilingual-cased": {"do_lower_case": False},
    "google-bert/bert-base-chinese": {"do_lower_case": False},
    "google-bert/bert-base-german-cased": {"do_lower_case": False},
    "google-bert/bert-large-uncased-whole-word-masking": {"do_lower_case": True},
    "google-bert/bert-large-cased-whole-word-masking": {"do_lower_case": False},
    "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
    "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
    "google-bert/bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
    "google-bert/bert-base-german-dbmdz-cased": {"do_lower_case": False},
    "google-bert/bert-base-german-dbmdz-uncased": {"do_lower_case": True},
    "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
    "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
    "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()  # 创建一个有序字典用于存储词汇表
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()  # 读取词汇文件中的所有行
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")  # 去除每个词汇的换行符
        vocab[token] = index  # 将词汇添加到字典中,键为词汇,值为索引
    return vocab  # 返回加载后的词汇表字典


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()  # 去除文本首尾空白字符
    if not text:
        return []  # 如果文本为空,则返回空列表
    tokens = text.split()  # 使用空格分割文本生成词汇列表
    return tokens  # 返回分割后的词汇列表
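
A quick sketch of how these two helpers behave; the `vocab.txt` contents mentioned in the comments are hypothetical:

```python
# Hypothetical usage of the helpers above.
# Suppose a file vocab.txt contains the four lines: [PAD], [UNK], hello, world.
# load_vocab("vocab.txt") would then return
# OrderedDict([("[PAD]", 0), ("[UNK]", 1), ("hello", 2), ("world", 3)]),
# i.e. each token is numbered by the line it appears on.

print(whitespace_tokenize("  hello   world \n"))  # ['hello', 'world']
print(whitespace_tokenize("   "))                 # []
```
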


class BertTokenizer(PreTrainedTokenizer):
    r"""
    Construct a BERT tokenizer. Based on WordPiece.
    """
    # 从`PreTrainedTokenizer`继承,该类包含大多数主要方法。用户应参考这个超类以获取关于这些方法的更多信息。

    # 参数:
    # vocab_file (`str`):
    #     包含词汇表的文件。
    # do_lower_case (`bool`, *可选*, 默认为 `True`):
    #     在标记化时是否将输入转换为小写。
    # do_basic_tokenize (`bool`, *可选*, 默认为 `True`):
    #     是否在使用WordPiece之前进行基本的标记化。
    # never_split (`Iterable`, *可选*):
    #     在标记化时永远不会分割的一组标记。仅在 `do_basic_tokenize=True` 时有效。
    # unk_token (`str`, *可选*, 默认为 `"[UNK]"`):
    #     未知标记。词汇表中不存在的标记无法转换为ID,并将被设置为此标记。
    # sep_token (`str`, *可选*, 默认为 `"[SEP]"`):
    #     分隔符标记,在构建来自多个序列的序列时使用,例如用于序列分类或用于文本和问题的问题回答。在使用特殊标记构建的序列的最后一个标记也会使用此标记。
    # pad_token (`str`, *可选*, 默认为 `"[PAD]"`):
    #     用于填充的标记,例如在批处理不同长度的序列时使用。
    # cls_token (`str`, *可选*, 默认为 `"[CLS]"`):
    #     分类器标记,在进行序列分类(整个序列的分类而不是每个标记的分类)时使用。在使用特殊标记构建的序列的第一个标记。
    # mask_token (`str`, *可选*, 默认为 `"[MASK]"`):
    #     用于屏蔽值的标记。这是在使用掩蔽语言建模训练模型时使用的标记。模型将尝试预测此标记。
    # tokenize_chinese_chars (`bool`, *可选*, 默认为 `True`):
    #     是否标记化中文字符。
    #     对于日文,这可能应该停用(参见此[问题](https://github.com/huggingface/transformers/issues/328))。
    # strip_accents (`bool`, *可选*):
    #     是否删除所有重音符号。如果未指定此选项,则将根据 `lowercase` 的值确定(与原始BERT相同)。
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # 初始化方法,用于初始化一个Tokenizer对象
    def __init__(
        self,
        vocab_file,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs,
    ):
        # 检查给定的词汇文件是否存在,如果不存在则抛出异常
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        # 加载词汇表文件到self.vocab中
        self.vocab = load_vocab(vocab_file)
        # 根据加载的词汇表构建从id到token的有序字典
        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
        # 是否进行基本的tokenize操作
        self.do_basic_tokenize = do_basic_tokenize
        # 如果需要进行基本tokenize,则初始化BasicTokenizer对象
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )

        # 初始化WordpieceTokenizer对象,使用加载的词汇表和未知token
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))

        # 调用父类的初始化方法,传递相同的参数和额外的参数
        super().__init__(
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

    # 属性方法,返回是否进行小写处理的标志位
    @property
    def do_lower_case(self):
        return self.basic_tokenizer.do_lower_case

    # 属性方法,返回词汇表的大小
    @property
    def vocab_size(self):
        return len(self.vocab)

    # 方法,返回包含所有词汇和特殊token编码的字典
    def get_vocab(self):
        return dict(self.vocab, **self.added_tokens_encoder)

    # 方法,对文本进行tokenize操作,返回token列表
    def _tokenize(self, text, split_special_tokens=False):
        split_tokens = []
        # 如果需要进行基本tokenize操作
        if self.do_basic_tokenize:
            # 遍历基本tokenizer的tokenize结果
            for token in self.basic_tokenizer.tokenize(
                text, never_split=self.all_special_tokens if not split_special_tokens else None
            ):
                # 如果token在不分割集合中,则直接加入split_tokens列表
                if token in self.basic_tokenizer.never_split:
                    split_tokens.append(token)
                else:
                    # 否则,使用WordpieceTokenizer对token进行进一步的分词处理,并加入split_tokens列表
                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
        else:
            # 否则,直接使用WordpieceTokenizer对整个text进行tokenize操作
            split_tokens = self.wordpiece_tokenizer.tokenize(text)
        return split_tokens

    # 方法,根据token获取对应的id
    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    # 方法,根据id获取对应的token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.ids_to_tokens.get(index, self.unk_token)
    def convert_tokens_to_string(self, tokens):
        """
        Converts a sequence of tokens (string) into a single string by joining them,
        removing '##' and stripping leading/trailing whitespace.

        Args:
            tokens (List[str]): List of tokens to be converted.

        Returns:
            str: The concatenated string of tokens.
        """
        out_string = " ".join(tokens).replace(" ##", "").strip()
        return out_string
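
To make the division of labour between `_tokenize`, `_convert_token_to_id` and `convert_tokens_to_string` concrete, a small usage sketch; it assumes the `transformers` package is installed and downloads the `google-bert/bert-base-uncased` vocabulary on first use:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

tokens = tokenizer.tokenize("A WordPiece tokenization example.")
ids = tokenizer.convert_tokens_to_ids(tokens)
text = tokenizer.convert_tokens_to_string(tokens)

print(tokens)  # sub-word pieces; continuation pieces are prefixed with "##"
print(ids)     # the vocabulary indices looked up via _convert_token_to_id
print(text)    # joining the pieces back removes the " ##" markers
```
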

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Builds model inputs from a sequence or a pair of sequences for sequence classification tasks
        by adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence.

        Returns:
            List[int]: List of input IDs with the appropriate special tokens added.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + token_ids_1 + sep
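
The special-token layout is purely positional, so it can be checked directly; a sketch that assumes the same pretrained vocabulary is available:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("first sentence"))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("second one"))

single = tokenizer.build_inputs_with_special_tokens(ids_a)
pair = tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)

assert single == [tokenizer.cls_token_id] + ids_a + [tokenizer.sep_token_id]
assert pair == [tokenizer.cls_token_id] + ids_a + [tokenizer.sep_token_id] + ids_b + [tokenizer.sep_token_id]
```
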

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves a mask indicating whether each token in the input list is a special token
        (1 for special token, 0 for sequence token). This is used when preparing tokens for a model.

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence.
            already_has_special_tokens (bool, optional): Whether the input token lists already include special tokens.

        Returns:
            List[int]: A list of integers representing the mask.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates token type IDs from token lists representing sequences or pairs of sequences.

        Args:
            token_ids_0 (List[int]): List of token IDs for the first sequence.
            token_ids_1 (Optional[List[int]]): Optional list of token IDs for the second sequence.

        Returns:
            List[int]: List of token type IDs.
        """
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs representing the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List representing the token type IDs for the given sequence(s).
        """
        # Define separator and classification tokens
        sep = [self.sep_token_id]  # Separator token ID
        cls = [self.cls_token_id]  # Classification token ID
        
        # If token_ids_1 is None, return a mask with zeros corresponding to the first sequence only
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]  # Create and return mask with zeros
        
        # If token_ids_1 is provided, return a mask with zeros for the first sequence and ones for the second sequence
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
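
The resulting segment ids and the special-tokens mask can likewise be predicted from the sequence lengths alone; a short check, under the same assumption that the pretrained vocabulary is available:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
ids_a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("first sentence"))
ids_b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("second one"))

type_ids = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
# zeros cover "[CLS] A [SEP]", ones cover "B [SEP]"
assert type_ids == [0] * (len(ids_a) + 2) + [1] * (len(ids_b) + 1)

mask = tokenizer.get_special_tokens_mask(ids_a, ids_b)
# 1 marks the [CLS] and the two [SEP] positions, 0 marks ordinary tokens
assert mask == [1] + [0] * len(ids_a) + [1] + [0] * len(ids_b) + [1]
```
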

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # Initialize index counter
        index = 0
        
        # Determine vocabulary file path
        if os.path.isdir(save_directory):
            # If save_directory is a directory, construct file path inside the directory
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            # Otherwise, treat save_directory as the full file path
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        
        # Write vocabulary to the specified file
        with open(vocab_file, "w", encoding="utf-8") as writer:
            # Iterate through vocabulary items sorted by index
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                # Check for non-consecutive indices in the vocabulary
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index  # Update index to current token's index
                writer.write(token + "\n")  # Write token to file
                index += 1  # Increment index for the next token
        
        # Return the path to the saved vocabulary file
        return (vocab_file,)
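
A small sketch of saving the vocabulary to a temporary directory, reusing a pretrained tokenizer (assumed available):

```python
import os
import tempfile

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

with tempfile.TemporaryDirectory() as tmp_dir:
    (vocab_path,) = tokenizer.save_vocabulary(tmp_dir, filename_prefix="demo")
    print(os.path.basename(vocab_path))  # demo-vocab.txt
    with open(vocab_path, encoding="utf-8") as f:
        first_line = f.readline().strip()
    print(first_line == tokenizer.ids_to_tokens[0])  # True: tokens are written in index order
```
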
# 定义一个名为 BasicTokenizer 的类,用于执行基本的分词(如分割标点符号、转换为小写等)。
class BasicTokenizer(object):
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    # 初始化方法,设置类的属性
    def __init__(
        self,
        do_lower_case=True,          # 是否将输入转换为小写,默认为True
        never_split=None,            # 永远不分割的 token 集合,默认为 None
        tokenize_chinese_chars=True, # 是否分割中文字符,默认为 True
        strip_accents=None,          # 是否去除所有重音符号,默认根据 lowercase 决定
        do_split_on_punc=True,       # 是否在基本标点符号处分割,默认为 True
    ):
        # 如果 never_split 为 None,则设为一个空列表
        if never_split is None:
            never_split = []
        # 设置实例的属性值
        self.do_lower_case = do_lower_case                  # 是否小写化输入
        self.never_split = set(never_split)                 # 永远不分割的 token 集合,转为集合类型
        self.tokenize_chinese_chars = tokenize_chinese_chars # 是否分割中文字符
        self.strip_accents = strip_accents                  # 是否去除重音符号
        self.do_split_on_punc = do_split_on_punc            # 是否在基本标点符号处分割
    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # 使用 never_split 参数更新当前对象的 never_split 集合(若提供的话)
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        # 清理文本,如去除无用空白等
        text = self._clean_text(text)

        # 以下部分是为了支持多语言和中文模型而添加的代码(2018 年 11 月 1 日起)
        # 现在英语模型也应用了这一代码,但由于英语模型未经过中文数据的训练,
        # 这段代码对英语模型基本没有影响(尽管英语词汇表中包含了一些中文单词,
        # 这是因为英语维基百科中包含了一些中文词汇)。
        if self.tokenize_chinese_chars:
            # 对包含中文字符的文本进行特殊处理,分词
            text = self._tokenize_chinese_chars(text)
        # 将文本中的 Unicode 标准化为 NFC 格式(避免同一字符的不同 Unicode 编码被视为不同字符)
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        # 使用空白符分割文本,得到原始 token 列表
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        # 遍历每个原始 token
        for token in orig_tokens:
            # 如果 token 不在 never_split 集合中
            if token not in never_split:
                # 如果设置为小写处理,则将 token 转换为小写
                if self.do_lower_case:
                    token = token.lower()
                    # 如果需要去除重音符号,则执行去除重音符号的操作
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                # 如果需要去除重音符号,则执行去除重音符号的操作
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            # 将处理后的 token 通过标点符号分割函数进一步分割
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        # 使用空白符重新组合处理后的 token,并分割为最终的输出 token 列表
        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        # 返回最终的输出 token 列表
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        # 将文本中的字符标准化为 NFD 格式
        text = unicodedata.normalize("NFD", text)
        output = []
        # 遍历文本中的每个字符
        for char in text:
            # 获取字符的 Unicode 分类
            cat = unicodedata.category(char)
            # Skip nonspacing combining marks (Unicode category "Mn"), i.e. the accent characters
            if cat == "Mn":
                continue
            # 否则将字符添加到输出列表中
            output.append(char)
        # 将输出列表中的字符连接成字符串并返回
        return "".join(output)
    def _run_split_on_punc(self, text, never_split=None):
        """按照标点符号分割文本。

        Args:
            text (str): 要分割的文本。
            never_split (set): 不应该被分割的文本集合。

        Returns:
            list: 分割后的文本列表。

        """
        # 如果不需要按标点符号分割,或者文本在不分割的集合中,则直接返回原文本列表
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        # 将文本转换为字符列表
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # 如果是标点符号,则作为新词开始
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                # 如果不是标点符号,根据start_new_word标记将字符添加到当前词列表中
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        # 将列表中的字符列表连接为字符串,并返回分割后的文本列表
        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """在每个CJK字符周围添加空格。

        Args:
            text (str): 要处理的文本。

        Returns:
            str: 处理后的文本。

        """
        output = []
        for char in text:
            cp = ord(char)
            # 如果是CJK字符,添加空格前后包裹该字符
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        # 将列表中的字符连接为一个字符串,并返回处理后的文本
        return "".join(output)

    def _is_chinese_char(self, cp):
        """检查CP是否是CJK字符的码点。

        Args:
            cp (int): 要检查的字符的Unicode码点。

        Returns:
            bool: 如果是CJK字符则返回True,否则返回False。

        """
        # 这里的CJK字符定义来自于CJK统一表意文字块的Unicode范围
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """对文本进行无效字符移除和空白字符清理。

        Args:
            text (str): 要清理的文本。

        Returns:
            str: 清理后的文本。

        """
        output = []
        for char in text:
            cp = ord(char)
            # 移除无效字符和控制字符,以及替换空白字符为单个空格
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        # 将列表中的字符连接为一个字符串,并返回清理后的文本
        return "".join(output)
class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
        # 初始化WordpieceTokenizer对象,设置词汇表、未知标记和单词的最大字符数
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """
        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
        tokenization using the given vocabulary.

        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.

        Args:
            text: A single token or whitespace separated tokens. This should have
                already been passed through *BasicTokenizer*.

        Returns:
            A list of wordpiece tokens.
        """
        # 初始化输出token列表
        output_tokens = []
        # 使用whitespace_tokenize函数将文本分割成单词或标记
        for token in whitespace_tokenize(text):
            # 将token转换为字符列表
            chars = list(token)
            # 如果token的长度超过最大输入字符数,则将未知标记添加到输出token列表中
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            # 初始化标志变量和起始位置
            is_bad = False
            start = 0
            sub_tokens = []
            # 循环直到处理完所有字符
            while start < len(chars):
                end = len(chars)
                cur_substr = None
                # 使用最长匹配算法找到合适的子串
                while start < end:
                    substr = "".join(chars[start:end])
                    if start > 0:
                        substr = "##" + substr
                    # 如果找到了匹配词汇表的子串,则更新当前子串并跳出循环
                    if substr in self.vocab:
                        cur_substr = substr
                        break
                    end -= 1
                # 如果未找到合适的子串,则标记为无效
                if cur_substr is None:
                    is_bad = True
                    break
                # 将找到的子串添加到sub_tokens列表中
                sub_tokens.append(cur_substr)
                start = end

            # 如果标记为无效,则将未知标记添加到输出token列表中;否则将sub_tokens列表中的token添加到输出token列表中
            if is_bad:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(sub_tokens)
        # 返回最终的token列表
        return output_tokens
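
A demo of the greedy longest-match-first behaviour with a tiny hand-written vocabulary (the vocabulary below is made up for illustration):

```python
tiny_vocab = {"un": 0, "##aff": 1, "##able": 2, "runn": 3, "##ing": 4, "[UNK]": 5}
wordpiece = WordpieceTokenizer(vocab=tiny_vocab, unk_token="[UNK]")

print(wordpiece.tokenize("unaffable"))    # ['un', '##aff', '##able']
print(wordpiece.tokenize("running"))      # ['runn', '##ing']
print(wordpiece.tokenize("unknownword"))  # ['[UNK]'] -- matching fails partway, so the whole word becomes [UNK]
```
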

.\models\bert\tokenization_bert_fast.py

# coding=utf-8
# 上面是指定脚本的编码格式为 UTF-8

# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# 版权声明,指明了代码的版权归属

#
# Licensed under the Apache License, Version 2.0 (the "License");
# 根据 Apache License, Version 2.0 许可证,可以自由使用本代码
# you may not use this file except in compliance with the License.
# 除非遵循许可证规定,否则不能使用该文件

# You may obtain a copy of the License at
# 可以在以下链接获取许可证的副本
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 除非适用法律要求或书面同意,否则根据许可证分发的软件是基于“原样”分发的,没有任何形式的保证或条件。
# 请参阅许可证以获取详细的权限和限制信息。

"""Fast Tokenization classes for Bert."""
# 用于 Bert 的快速标记化类

import json
from typing import List, Optional, Tuple

from tokenizers import normalizers

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import logging
from .tokenization_bert import BertTokenizer

# 获取日志记录器对象
logger = logging.get_logger(__name__)

# 定义词汇文件的名称映射
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# 预训练模型所需的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
        "vocab_file": {
            "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/vocab.txt",
            "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/vocab.txt",
            "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/vocab.txt",
            "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-multilingual-uncased": (
                "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-multilingual-cased": "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/vocab.txt",
            "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/vocab.txt",
            "google-bert/bert-large-uncased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-cased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
            ),
            "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-cased-finetuned-mrpc": (
                "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt"
            ),
            "google-bert/bert-base-german-dbmdz-cased": "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/vocab.txt",
            "google-bert/bert-base-german-dbmdz-uncased": (
                "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt"
            ),
            "TurkuNLP/bert-base-finnish-cased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt"
            ),
            "TurkuNLP/bert-base-finnish-uncased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt"
            ),
            "wietsedv/bert-base-dutch-cased": (
                "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt"
            )
        },
        # Mapping from model names to the URLs of their tokenizer.json files
        "tokenizer_file": {
            "google-bert/bert-base-uncased": "https://huggingface.co/google-bert/bert-base-uncased/resolve/main/tokenizer.json",
            "google-bert/bert-large-uncased": "https://huggingface.co/google-bert/bert-large-uncased/resolve/main/tokenizer.json",
            "google-bert/bert-base-cased": "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json",
            "google-bert/bert-large-cased": "https://huggingface.co/google-bert/bert-large-cased/resolve/main/tokenizer.json",
            "google-bert/bert-base-multilingual-uncased": (
                "https://huggingface.co/google-bert/bert-base-multilingual-uncased/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-base-multilingual-cased": (
                "https://huggingface.co/google-bert/bert-base-multilingual-cased/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-base-chinese": "https://huggingface.co/google-bert/bert-base-chinese/resolve/main/tokenizer.json",
            "google-bert/bert-base-german-cased": "https://huggingface.co/google-bert/bert-base-german-cased/resolve/main/tokenizer.json",
            "google-bert/bert-large-uncased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-large-cased-whole-word-masking": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-large-uncased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-large-cased-whole-word-masking-finetuned-squad": (
                "https://huggingface.co/google-bert/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-base-cased-finetuned-mrpc": (
                "https://huggingface.co/google-bert/bert-base-cased-finetuned-mrpc/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-base-german-dbmdz-cased": (
                "https://huggingface.co/google-bert/bert-base-german-dbmdz-cased/resolve/main/tokenizer.json"
            ),
            "google-bert/bert-base-german-dbmdz-uncased": (
                "https://huggingface.co/google-bert/bert-base-german-dbmdz-uncased/resolve/main/tokenizer.json"
            ),
            "TurkuNLP/bert-base-finnish-cased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/tokenizer.json"
            ),
            "TurkuNLP/bert-base-finnish-uncased-v1": (
                "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/tokenizer.json"
            ),
            "wietsedv/bert-base-dutch-cased": (
                "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/tokenizer.json"
            )
        }
}

# Define the BertTokenizerFast class, which inherits from PreTrainedTokenizerFast
class BertTokenizerFast(PreTrainedTokenizerFast):
    # docstring: 构建一个“快速”BERT tokenizer,使用 HuggingFace 的 tokenizers 库支持,基于 WordPiece
    r"""
    Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.
    """
    # Constructor: set up the vocabulary, special tokens, and normalization options of the fast tokenizer
    def __init__(
        self,
        vocab_file=None,  # Path of the vocabulary file used to build the tokenizer's vocabulary
        tokenizer_file=None,  # Optional path of a serialized tokenizers-library file (tokenizer.json)
        do_lower_case=True,  # Whether to lowercase the input when tokenizing
        unk_token="[UNK]",  # Token used for words that are not in the vocabulary
        sep_token="[SEP]",  # Separator token used when building a sequence from multiple segments
        pad_token="[PAD]",  # Padding token used when batching sequences of different lengths
        cls_token="[CLS]",  # Classifier token placed at the start of a sequence
        mask_token="[MASK]",  # Mask token used for masked language modeling
        tokenize_chinese_chars=True,  # Whether to tokenize Chinese characters individually
        strip_accents=None,  # Whether to strip all accents; if unset, it follows the `lowercase` behaviour
        **kwargs,  # Extra keyword arguments forwarded to the base class
    ):
        # 调用父类的构造函数,初始化模型的tokenizer
        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )

        # 获取当前tokenizer的规范化器状态并转换为JSON格式
        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
        # 检查是否有用户设置的规范化器状态与当前初始化参数不匹配,如果不匹配则进行更新
        if (
            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
        ):
            # 获取当前规范化器的类并进行实例化
            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
            # 更新规范化器的参数
            normalizer_state["lowercase"] = do_lower_case
            normalizer_state["strip_accents"] = strip_accents
            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
            # 将更新后的规范化器应用于当前的tokenizer对象
            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)

        # 更新当前对象的小写处理标志
        self.do_lower_case = do_lower_case
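
A sketch of the normalizer adjustment above: loading a cased checkpoint with `do_lower_case=True` forces the backend normalizer to be rebuilt (assumes the `tokenizers` backend and the checkpoint are available):

```python
from transformers import BertTokenizerFast

fast_tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-cased", do_lower_case=True)

# The saved normalizer state of the cased checkpoint has lowercase=False, which disagrees with the
# do_lower_case=True passed here, so __init__ re-instantiates the backend normalizer accordingly.
print(fast_tokenizer.do_lower_case)                                 # True
print(type(fast_tokenizer.backend_tokenizer.normalizer).__name__)   # e.g. BertNormalizer
print(fast_tokenizer.tokenize("Hello World"))                       # pieces are lower-cased before WordPiece
```
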

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        # 构建带有特殊标记的模型输入序列,用于序列分类任务
        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        # 如果存在第二个序列token_ids_1,则连接第二个序列的特殊标记
        if token_ids_1 is not None:
            output += token_ids_1 + [self.sep_token_id]

        return output

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of token IDs representing the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of token IDs representing the second sequence in sequence-pair tasks.

        Returns:
            `List[int]`: List of token type IDs according to the given sequence(s).
        """
        # Define the separator token ID and the classification token ID
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # If only one sequence is provided, return a mask with 0s for the first sequence
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]

        # If both sequences are provided, concatenate their lengths with separator and classification tokens
        # Return a mask with 0s for the first sequence and 1s for the second sequence
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]


    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary files associated with the tokenizer's model to a specified directory.

        Args:
            save_directory (str):
                Directory where the vocabulary files will be saved.
            filename_prefix (Optional[str]):
                Optional prefix to prepend to the saved vocabulary file names.

        Returns:
            Tuple[str]: Tuple containing the filenames of the saved vocabulary files.
        """
        # Call the model's save method to save the vocabulary files to the specified directory
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        
        # Return the filenames as a tuple
        return tuple(files)

.\models\bert\tokenization_bert_tf.py

# Import required standard-library modules
import os
from typing import List, Union

# Import TensorFlow
import tensorflow as tf

# Import the BERT tokenizer layer and helpers from TensorFlow Text
from tensorflow_text import BertTokenizer as BertTokenizerLayer
from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs

# Import the Keras helper from the TF modeling utilities
from ...modeling_tf_utils import keras

# Import the (slow) Python BERT tokenizer
from .tokenization_bert import BertTokenizer


# Define a Keras layer that performs BERT tokenization inside the TF graph
class TFBertTokenizer(keras.layers.Layer):
    """
    This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the
    `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings
    from an existing standard tokenizer object.

    In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run
    when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options
    than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes
    straight from `tf.string` inputs to outputs.
    """
    # 初始化函数,用于创建一个 Tokenizer 对象
    def __init__(
        self,
        vocab_list: List,                   # 词汇表列表,包含了 Tokenizer 所需的词汇
        do_lower_case: bool,                # 是否将输入文本转换为小写进行分词
        cls_token_id: int = None,           # 分类器标记的 ID,在序列分类中用作序列的第一个标记
        sep_token_id: int = None,           # 分隔符标记的 ID,在构建序列时用于多序列的分隔
        pad_token_id: int = None,           # 填充标记的 ID,在批处理不同长度的序列时使用
        padding: str = "longest",           # 填充类型,可以是"longest"或"max_length"
        truncation: bool = True,            # 是否对序列进行截断,使其不超过最大长度
        max_length: int = 512,              # 序列的最大长度,用于填充和截断
        pad_to_multiple_of: int = None,     # 如果设置,序列将填充到此值的倍数
        return_token_type_ids: bool = True, # 是否返回 token_type_ids
        return_attention_mask: bool = True, # 是否返回 attention_mask
        use_fast_bert_tokenizer: bool = True,  # 是否使用 FastBertTokenizer 类(Tensorflow Text)进行分词
        **tokenizer_kwargs,                 # 其他可能传递给 tokenizer 的参数
        ):
            super().__init__()
            # 调用父类的初始化方法

            if use_fast_bert_tokenizer:
                # 如果使用快速的 BERT 分词器
                self.tf_tokenizer = FastBertTokenizer(
                    vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case, **tokenizer_kwargs
                )
            else:
                # 否则使用静态词汇表创建查找表
                lookup_table = tf.lookup.StaticVocabularyTable(
                    tf.lookup.KeyValueTensorInitializer(
                        keys=vocab_list,
                        key_dtype=tf.string,
                        values=tf.range(tf.size(vocab_list, out_type=tf.int64), dtype=tf.int64),
                        value_dtype=tf.int64,
                    ),
                    num_oov_buckets=1,
                )
                # 使用查找表创建 BERT 分词器层
                self.tf_tokenizer = BertTokenizerLayer(
                    lookup_table, token_out_type=tf.int64, lower_case=do_lower_case, **tokenizer_kwargs
                )

            self.vocab_list = vocab_list
            self.do_lower_case = do_lower_case
            # 设置特殊 token 的索引,如果未提供则从 vocab_list 中获取
            self.cls_token_id = vocab_list.index("[CLS]") if cls_token_id is None else cls_token_id
            self.sep_token_id = vocab_list.index("[SEP]") if sep_token_id is None else sep_token_id
            self.pad_token_id = vocab_list.index("[PAD]") if pad_token_id is None else pad_token_id
            # 初始化用于截断最长序列的 paired_trimmer
            self.paired_trimmer = ShrinkLongestTrimmer(max_length - 3, axis=1)  # Allow room for special tokens
            self.max_length = max_length
            self.padding = padding
            self.truncation = truncation
            self.pad_to_multiple_of = pad_to_multiple_of
            self.return_token_type_ids = return_token_type_ids
            self.return_attention_mask = return_attention_mask
    @classmethod
    def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs):  # noqa: F821
        """
        Initialize a `TFBertTokenizer` from an existing `Tokenizer`.

        Args:
            tokenizer (`PreTrainedTokenizerBase`):
                The tokenizer to use to initialize the `TFBertTokenizer`.

        Examples:

        ```
        from transformers import AutoTokenizer, TFBertTokenizer

        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
        tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer)
        ```
        """
        # Retrieve the 'do_lower_case' parameter from kwargs; if not provided, use tokenizer's setting
        do_lower_case = kwargs.pop("do_lower_case", None)
        do_lower_case = tokenizer.do_lower_case if do_lower_case is None else do_lower_case
        # Retrieve the 'cls_token_id' parameter from kwargs; if not provided, use tokenizer's setting
        cls_token_id = kwargs.pop("cls_token_id", None)
        cls_token_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id
        # Retrieve the 'sep_token_id' parameter from kwargs; if not provided, use tokenizer's setting
        sep_token_id = kwargs.pop("sep_token_id", None)
        sep_token_id = tokenizer.sep_token_id if sep_token_id is None else sep_token_id
        # Retrieve the 'pad_token_id' parameter from kwargs; if not provided, use tokenizer's setting
        pad_token_id = kwargs.pop("pad_token_id", None)
        pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id

        # Get the vocabulary dictionary from the tokenizer and sort it by indices
        vocab = tokenizer.get_vocab()
        vocab = sorted(vocab.items(), key=lambda x: x[1])
        # Extract just the vocabulary tokens into a list
        vocab_list = [entry[0] for entry in vocab]
        # Instantiate a new TFBertTokenizer using the retrieved parameters and vocab_list
        return cls(
            vocab_list=vocab_list,
            do_lower_case=do_lower_case,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            pad_token_id=pad_token_id,
            **kwargs,
        )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
        """
        Instantiate a `TFBertTokenizer` from a pre-trained tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                The name or path to the pre-trained tokenizer.

        Examples:

        ```
        from transformers import TFBertTokenizer

        tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")
        ```
        """
        try:
            # Attempt to create a BertTokenizer instance from the provided pretrained_model_name_or_path
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        except:  # noqa: E722
            # If the above fails, fall back to using BertTokenizerFast
            from .tokenization_bert_fast import BertTokenizerFast

            tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
        # Call from_tokenizer to create a TFBertTokenizer instance using the obtained tokenizer
        return cls.from_tokenizer(tokenizer, **kwargs)

    def unpaired_tokenize(self, texts):
        # If do_lower_case is True, convert texts to lowercase using case_fold_utf8
        if self.do_lower_case:
            texts = case_fold_utf8(texts)
        # Tokenize texts using tf_tokenizer's tokenize method
        tokens = self.tf_tokenizer.tokenize(texts)
        # Merge dimensions from 1 to -1 in tokens
        return tokens.merge_dims(1, -1)

    def call(
        self,
        text,
        text_pair=None,
        padding=None,
        truncation=None,
        max_length=None,
        pad_to_multiple_of=None,
        return_token_type_ids=None,
        return_attention_mask=None,
    ):
        # call() tokenizes `text` (and the optional `text_pair`), applies truncation and padding,
        # and returns the model input tensors; its body is omitted in this excerpt.
        ...

    # 定义一个方法,用于获取配置信息的字典
    def get_config(self):
        # 返回包含各种配置项的字典
        return {
            "vocab_list": self.vocab_list,       # 返回实例的词汇表列表
            "do_lower_case": self.do_lower_case, # 返回是否执行小写转换的布尔值
            "cls_token_id": self.cls_token_id,   # 返回类别标记的 ID
            "sep_token_id": self.sep_token_id,   # 返回分隔标记的 ID
            "pad_token_id": self.pad_token_id,   # 返回填充标记的 ID
        }
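
An end-to-end usage sketch of the in-graph tokenizer; it assumes `tensorflow` and `tensorflow_text` are installed and downloads the checkpoint's vocabulary:

```python
import tensorflow as tf

from transformers import TFBertTokenizer

tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# Because this is a Keras layer, it can consume a batch of tf.string tensors inside the graph.
batch = tf.constant(["Hello TensorFlow!", "In-graph tokenization keeps preprocessing inside the model."])
outputs = tf_tokenizer(batch)

print(sorted(outputs.keys()))      # ['attention_mask', 'input_ids', 'token_type_ids'] with the default flags
print(outputs["input_ids"].shape)  # (2, sequence_length)
```
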

.\models\bert\__init__.py

# 从 typing 模块导入 TYPE_CHECKING 类型检查工具
from typing import TYPE_CHECKING

# 从 ...utils 中导入必要的模块和异常类
from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
    is_tensorflow_text_available,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
)

# 定义一个字典 _import_structure,用于组织各模块需要导入的内容列表
_import_structure = {
    "configuration_bert": ["BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BertConfig", "BertOnnxConfig"],
    "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"],
}

# 检查是否安装了 tokenizers 库,如果未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_tokenizers_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 tokenizers,则添加 tokenization_bert_fast 模块到 _import_structure 字典
    _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"]

# 检查是否安装了 torch 库,如果未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_torch_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 torch,则添加 modeling_bert 模块到 _import_structure 字典
    _import_structure["modeling_bert"] = [
        "BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "BertForMaskedLM",
        "BertForMultipleChoice",
        "BertForNextSentencePrediction",
        "BertForPreTraining",
        "BertForQuestionAnswering",
        "BertForSequenceClassification",
        "BertForTokenClassification",
        "BertLayer",
        "BertLMHeadModel",
        "BertModel",
        "BertPreTrainedModel",
        "load_tf_weights_in_bert",
    ]

# 检查是否安装了 TensorFlow 库,如果未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_tf_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 TensorFlow,则添加 modeling_tf_bert 模块到 _import_structure 字典
    _import_structure["modeling_tf_bert"] = [
        "TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST",
        "TFBertEmbeddings",
        "TFBertForMaskedLM",
        "TFBertForMultipleChoice",
        "TFBertForNextSentencePrediction",
        "TFBertForPreTraining",
        "TFBertForQuestionAnswering",
        "TFBertForSequenceClassification",
        "TFBertForTokenClassification",
        "TFBertLMHeadModel",
        "TFBertMainLayer",
        "TFBertModel",
        "TFBertPreTrainedModel",
    ]

# 检查是否安装了 TensorFlow Text 库,如果未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_tensorflow_text_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # 如果安装了 TensorFlow Text,则添加 tokenization_bert_tf 模块到 _import_structure 字典
    _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"]

# 检查是否安装了 Flax 库,如果未安装则抛出 OptionalDependencyNotAvailable 异常
try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    pass
else:
    # If Flax is installed, add the modeling_flax_bert classes to the _import_structure dict
    _import_structure["modeling_flax_bert"] = [
        "FlaxBertForCausalLM",                   # FlaxBert用于因果语言建模的模型类
        "FlaxBertForMaskedLM",                   # FlaxBert用于遮蔽语言建模的模型类
        "FlaxBertForMultipleChoice",             # FlaxBert用于多选题的模型类
        "FlaxBertForNextSentencePrediction",     # FlaxBert用于下一句预测的模型类
        "FlaxBertForPreTraining",                # FlaxBert用于预训练的模型类
        "FlaxBertForQuestionAnswering",          # FlaxBert用于问答的模型类
        "FlaxBertForSequenceClassification",     # FlaxBert用于序列分类的模型类
        "FlaxBertForTokenClassification",        # FlaxBert用于标记分类的模型类
        "FlaxBertModel",                         # FlaxBert模型的基础模型类
        "FlaxBertPreTrainedModel",               # FlaxBert预训练模型的基础模型类
    ]
# 如果在类型检查模式下
if TYPE_CHECKING:
    # 导入 BERT 配置相关的模块和类
    from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig, BertOnnxConfig
    # 导入 BERT 的分词器相关模块和类
    from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer

    # 尝试检查 tokenizers 是否可用,如果不可用则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_tokenizers_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,导入快速的 BERT 分词器
        from .tokenization_bert_fast import BertTokenizerFast

    # 尝试检查 torch 是否可用,如果不可用则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_torch_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,导入 BERT 相关的模型和类
        from .modeling_bert import (
            BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            BertForMaskedLM,
            BertForMultipleChoice,
            BertForNextSentencePrediction,
            BertForPreTraining,
            BertForQuestionAnswering,
            BertForSequenceClassification,
            BertForTokenClassification,
            BertLayer,
            BertLMHeadModel,
            BertModel,
            BertPreTrainedModel,
            load_tf_weights_in_bert,
        )

    # 尝试检查 tensorflow 是否可用,如果不可用则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_tf_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,导入 TF 版本的 BERT 相关模型和类
        from .modeling_tf_bert import (
            TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
            TFBertEmbeddings,
            TFBertForMaskedLM,
            TFBertForMultipleChoice,
            TFBertForNextSentencePrediction,
            TFBertForPreTraining,
            TFBertForQuestionAnswering,
            TFBertForSequenceClassification,
            TFBertForTokenClassification,
            TFBertLMHeadModel,
            TFBertMainLayer,
            TFBertModel,
            TFBertPreTrainedModel,
        )

    # 尝试检查 tensorflow_text 是否可用,如果不可用则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_tensorflow_text_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,导入 TF 版本的 BERT 分词器
        from .tokenization_bert_tf import TFBertTokenizer

    # 尝试检查 flax 是否可用,如果不可用则抛出异常 OptionalDependencyNotAvailable
    try:
        if not is_flax_available():
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        pass
    else:
        # 如果可用,导入 Flax 版本的 BERT 相关模型和类
        from .modeling_flax_bert import (
            FlaxBertForCausalLM,
            FlaxBertForMaskedLM,
            FlaxBertForMultipleChoice,
            FlaxBertForNextSentencePrediction,
            FlaxBertForPreTraining,
            FlaxBertForQuestionAnswering,
            FlaxBertForSequenceClassification,
            FlaxBertForTokenClassification,
            FlaxBertModel,
            FlaxBertPreTrainedModel,
        )

# 如果不在类型检查模式下
else:
    # 导入 sys 模块
    import sys

    # 将当前模块设置为一个 LazyModule 对象,并导入相关结构和规范
    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
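
# Usage sketch (hypothetical session, assuming the relevant backends are installed): the lazy
# module keeps importing the package cheap, and heavy submodules such as modeling_bert are
# only imported the first time one of their attributes is accessed.
#
#     >>> from transformers.models import bert
#     >>> bert.BertConfig   # first access triggers the import of configuration_bert
#     >>> bert.BertModel    # first access triggers the import of modeling_bert (and torch)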

.\models\bertweet\tokenization_bertweet.py

# 导入标准库和第三方库
import html  # 用于 HTML 编码和解码
import os    # 提供与操作系统交互的功能
import re    # 用于正则表达式操作
from shutil import copyfile  # 用于复制文件
from typing import List, Optional, Tuple  # 引入类型提示相关的库

import regex  # 引入 regex 库,支持更强大的正则表达式功能

# 导入 Tokenizer 的基类 PreTrainedTokenizer 和日志模块
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging

# 获取当前模块的日志记录器
logger = logging.get_logger(__name__)

# 定义词汇文件和合并文件的名称映射
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.txt",
    "merges_file": "bpe.codes",
}

# 预训练模型的词汇文件映射
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/vocab.txt",
    },
    "merges_file": {
        "vinai/bertweet-base": "https://huggingface.co/vinai/bertweet-base/resolve/main/bpe.codes",
    },
}

# 预训练模型的位置编码大小映射
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "vinai/bertweet-base": 128,
}

def get_pairs(word):
    """
    返回单词中的符号对集合。

    单词被表示为符号元组(符号是长度可变的字符串)。
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char

    pairs = set(pairs)
    return pairs
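
# A quick worked example of get_pairs: it returns the set of adjacent symbol pairs.
#
#     >>> get_pairs(("l", "o", "w", "er</w>")) == {("l", "o"), ("o", "w"), ("w", "er</w>")}
#     True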


class BertweetTokenizer(PreTrainedTokenizer):
    """
    构造一个 BERTweet 分词器,使用字节对编码。

    此分词器继承自 PreTrainedTokenizer,该类包含大多数主要方法。用户应参考这个超类以获取更多关于这些方法的信息。
    """
    # Class attributes wiring this tokenizer to its vocabulary/merges file names, the
    # pretrained vocabulary file URLs and the pretrained checkpoints' maximum input sizes

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    
    # 初始化函数,用于设置模型配置参数
    def __init__(
        self,
        vocab_file,  # 词汇表文件的路径
        merges_file,  # 合并文件的路径
        normalization=False,  # 是否进行标准化预处理,默认为False
        bos_token="<s>",  # 预训练期间用于序列开始的特殊符号,默认为"<s>"
        eos_token="</s>",  # 序列结束的特殊符号,默认为"</s>"
        sep_token="</s>",  # 用于多个序列构建时的分隔符,默认为"</s>"
        cls_token="<s>",  # 序列分类时使用的特殊符号,构建时是序列的第一个符号,默认为"<s>"
        unk_token="<unk>",  # 未知符号,词汇表中没有时的替代符号,默认为"<unk>"
        pad_token="<pad>",  # 填充符号,用于处理不同长度序列时的填充,默认为"<pad>"
        mask_token="<mask>",  # 掩码符号,用于掩码语言建模训练中的标记,默认为"<mask>"
        **kwargs,  # 其他可选参数
    ):
        try:
            from emoji import demojize  # 尝试导入 demojize 函数从 emoji 模块
            self.demojizer = demojize  # 如果成功导入,将 demojize 函数赋值给 self.demojizer
        except ImportError:
            logger.warning(
                "emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3"
                " install emoji==0.6.0"
            )
            self.demojizer = None  # 如果导入失败,记录警告信息,并将 self.demojizer 设为 None

        self.vocab_file = vocab_file  # 初始化词汇表文件路径
        self.merges_file = merges_file  # 初始化合并文件路径

        self.encoder = {}  # 初始化编码器字典
        self.encoder[str(bos_token)] = 0  # 将特殊标记 bos_token 编码为 0
        self.encoder[str(pad_token)] = 1  # 将特殊标记 pad_token 编码为 1
        self.encoder[str(eos_token)] = 2  # 将特殊标记 eos_token 编码为 2
        self.encoder[str(unk_token)] = 3  # 将特殊标记 unk_token 编码为 3

        self.add_from_file(vocab_file)  # 调用 add_from_file 方法,从 vocab_file 添加更多词汇到编码器

        self.decoder = {v: k for k, v in self.encoder.items()}  # 创建解码器,将编码器的键值对颠倒

        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]  # 读取并处理合并文件的内容
        merges = [tuple(merge.split()[:-1]) for merge in merges]  # 将每行合并内容转换为元组列表
        self.bpe_ranks = dict(zip(merges, range(len(merges))))  # 创建 BPE 合并的排名字典
        self.cache = {}  # 初始化缓存字典

        self.normalization = normalization  # 设置文本规范化选项
        self.tweetPreprocessor = TweetTokenizer()  # 初始化 TweetTokenizer 作为 tweetPreprocessor
        self.special_puncts = {"’": "'", "…": "..."}  # 定义特殊标点符号映射

        super().__init__(  # 调用父类的初始化方法,传递相应参数和关键字参数
            normalization=normalization,
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
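
    # A hedged usage sketch (requires downloading the "vinai/bertweet-base" files; the exact
    # subword output depends on the learned BPE merges, so no output is asserted here):
    #
    #     >>> from transformers import BertweetTokenizer
    #     >>> tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
    #     >>> tokenizer.tokenize("@remy I can't wait :-)")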
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        # If the token list already has special tokens, delegate to the superclass method
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # If there are no sequence pairs (token_ids_1 is None), add special tokens around token_ids_0
        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        
        # For sequence pairs, add special tokens around both token_ids_0 and token_ids_1
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
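        # Worked instance of the branch above: with three real tokens and no second sequence,
        # the mask marks only the surrounding <s> and </s> positions:
        #
        #     >>> [1] + [0] * 3 + [1]
        #     [1, 0, 0, 0, 1]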

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """

        # Define special tokens for separation and classification
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        # If there are no sequence pairs, return a list of zeros of length equal to cls + token_ids_0 + sep
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        
        # For sequence pairs, return a list of zeros of length equal to cls + token_ids_0 + sep + sep + token_ids_1 + sep
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
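        # Worked instance: for token_ids_0 of length 2 and token_ids_1 of length 1 the result is
        # [0] * (1 + 2 + 1 + 1 + 1 + 1), i.e. seven zeros; only the total length is encoded,
        # never segment membership.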

    @property
    def vocab_size(self):
        # Return the size of the vocabulary, which is the length of the encoder dictionary
        return len(self.encoder)

    def get_vocab(self):
        # Return the combined dictionary of encoder and added_tokens_encoder
        return dict(self.encoder, **self.added_tokens_encoder)
    def bpe(self, token):
        # 如果 token 已经在缓存中,直接返回缓存中的结果
        if token in self.cache:
            return self.cache[token]
        
        # 将 token 转换为元组形式
        word = tuple(token)
        # 在 token 的末尾添加 "</w>",表示单词结束
        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
        # 获取单词中的所有字符对,并进行 BPE 算法处理
        pairs = get_pairs(word)

        # 如果没有字符对,直接返回原始 token
        if not pairs:
            return token

        # 循环处理字符对,直到无法再合并为止
        while True:
            # 找到优先级最高的字符对
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            # 如果该字符对不在预定义的 BPE 优先级中,停止处理
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            # 遍历单词中的字符
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # 如果找不到字符对的第一个字符,直接将剩余部分添加到新单词中
                    new_word.extend(word[i:])
                    break
                else:
                    # 将当前位置到字符对第一个字符位置之间的部分添加到新单词中
                    new_word.extend(word[i:j])
                    i = j

                # 如果当前位置的字符与字符对的第一个字符相同,并且下一个字符与字符对的第二个字符相同,则合并为一个新的字符
                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    # 否则,将当前位置的字符添加到新单词中,并移动到下一个位置
                    new_word.append(word[i])
                    i += 1
            # 将新单词转换为元组形式,并更新 word 变量为新单词
            new_word = tuple(new_word)
            word = new_word
            # 如果新单词长度为1,停止循环
            if len(word) == 1:
                break
            else:
                # 否则,继续获取新的字符对
                pairs = get_pairs(word)
        
        # 将处理后的单词以 "@@ " 连接起来,并去掉末尾的特殊标记 "</w>"
        word = "@@ ".join(word)
        word = word[:-4]
        # 将处理后的结果缓存起来,并返回
        self.cache[token] = word
        return word
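
    # Note on the output format: the pieces are joined with "@@ " and the trailing "</w>" is cut
    # off, so every piece except the last keeps a "@@" continuation marker. A word might come back
    # as something like "un@@ related" (hypothetical pieces; the real split depends on bpe.codes).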

    def _tokenize(self, text):
        """Tokenize a string."""
        # 如果启用了 Tweet 规范化,则在进行 BPE 处理之前先对文本进行规范化
        if self.normalization:
            text = self.normalizeTweet(text)

        split_tokens = []
        # 使用正则表达式将文本分割成单词列表
        words = re.findall(r"\S+\n?", text)
        for token in words:
            # 对每个单词进行 BPE 处理,并将处理结果按空格分割后添加到 split_tokens 列表中
            split_tokens.extend(list(self.bpe(token).split(" ")))
        return split_tokens

    def normalizeTweet(self, tweet):
        """
        Normalize a raw Tweet
        """
        # 替换 Tweet 中的特殊标点符号
        for punct in self.special_puncts:
            tweet = tweet.replace(punct, self.special_puncts[punct])

        # 使用 Tweet 预处理器对 Tweet 进行分词
        tokens = self.tweetPreprocessor.tokenize(tweet)
        # 对每个 token 进行规范化处理,并用空格连接起来
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        # 进行特定的单词规范化处理,替换常见的缩写和缩略语
        normTweet = (
            normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
        )

        return " ".join(normTweet.split())
    # 将给定的 token 标准化为小写形式
    def normalizeToken(self, token):
        lowercased_token = token.lower()
        # 如果 token 以 "@" 开头,则返回 "@USER"
        if token.startswith("@"):
            return "@USER"
        # 如果 token 的小写形式以 "http" 或 "www" 开头,则返回 "HTTPURL"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        # 如果 token 的长度为 1
        elif len(token) == 1:
            # 如果 token 是特殊标点符号中的一种,则返回其对应的值
            if token in self.special_puncts:
                return self.special_puncts[token]
            # 如果存在表情解析器,则用表情解析器处理 token,否则返回原 token
            if self.demojizer is not None:
                return self.demojizer(token)
            else:
                return token
        # 对于其他情况,直接返回 token
        else:
            return token
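        # Concrete behaviour of the rules above (deterministic given this code):
        #   normalizeToken("@remy")            -> "@USER"
        #   normalizeToken("https://nltk.org") -> "HTTPURL"
        #   normalizeToken("…")                -> "..."   (single-character special punct)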

    # 根据 token 转换为对应的 id,使用给定的词汇表
    def _convert_token_to_id(self, token):
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    # 根据 id 转换为对应的 token,使用给定的词汇表
    def _convert_id_to_token(self, index):
        return self.decoder.get(index, self.unk_token)

    # 将一系列 tokens 转换为单个字符串
    def convert_tokens_to_string(self, tokens):
        out_string = " ".join(tokens).replace("@@ ", "").strip()
        return out_string
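
    # Worked example: BPE pieces are glued back together by dropping the "@@ " markers.
    #
    #     >>> " ".join(["un@@", "related", "tweet"]).replace("@@ ", "").strip()
    #     'unrelated tweet'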

    # 保存词汇表到指定目录
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        # 如果保存目录不存在,记录错误并返回
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        # 构造词汇表文件路径和合并文件路径
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        out_merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )
        
        # 如果当前词汇表文件路径与目标路径不同且当前路径下存在词汇表文件,则复制词汇表文件到目标路径
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        # 如果当前路径下不存在词汇表文件,则将当前模型的序列化词汇表模型写入目标路径
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        # 如果当前合并文件路径与目标路径不同,则复制合并文件到目标路径
        if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
            copyfile(self.merges_file, out_merge_file)

        return out_vocab_file, out_merge_file
    def add_from_file(self, f):
        """
        从文本文件中加载一个预先存在的字典,并将其符号添加到当前实例中。
        """
        # 如果输入参数 f 是字符串类型,则尝试打开该文件
        if isinstance(f, str):
            try:
                with open(f, "r", encoding="utf-8") as fd:
                    # 递归调用 add_from_file 方法,加载文件内容
                    self.add_from_file(fd)
            except FileNotFoundError as fnfe:
                # 如果文件不存在,则抛出 FileNotFound 异常
                raise fnfe
            except UnicodeError:
                # 如果在文件中检测到不正确的编码,则抛出异常
                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
            # 返回,结束当前函数调用
            return

        # 读取文件中的所有行
        lines = f.readlines()
        # 遍历每一行内容
        for lineTmp in lines:
            # 去除行首尾空白符
            line = lineTmp.strip()
            # 查找行中最后一个空格的位置
            idx = line.rfind(" ")
            # 如果找不到空格,则抛出数值错误异常
            if idx == -1:
                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
            # 提取空格之前的部分作为单词
            word = line[:idx]
            # 将单词作为键,将当前编码器长度作为值存入编码器字典中
            self.encoder[word] = len(self.encoder)
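
        # The expected file format is one "<token> <count>" entry per line, for example
        # (hypothetical counts; only the token part is used, and IDs are assigned in file
        # order after the four special tokens seeded in __init__):
        #
        #     the 1061396
        #     of 593677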
# Natural Language Toolkit: Twitter Tokenizer
#
# Copyright (C) 2001-2020 NLTK Project
# Author: Christopher Potts <cgpotts@stanford.edu>
#         Ewan Klein <ewan@inf.ed.ac.uk> (modifications)
#         Pierpaolo Pantone <> (modifications)
# URL: http://nltk.org/
# For license information, see LICENSE.TXT
#

"""
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this:

1. The tuple regex_strings defines a list of regular expression strings.

2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re.

3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of
   the class Tokenizer.

4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it
   is set to False, then the tokenizer will lowercase everything except for emoticons.

"""


######################################################################
#
# import regex  # https://github.com/nltk/nltk/issues/2409
# import html
#
######################################################################
# The following strings are components in the regular expression
# that is used for tokenizing. It's important that phone_number
# appears first in the final regex (since it can contain whitespace).
# It also could matter that tags comes after emoticons, due to the
# possibility of having text like
#
#     <:| and some text >:)
#
# Most importantly, the final element should always be last, since it
# does a last ditch whitespace-based tokenization of whatever is left.

# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ?

# This particular element is used in a couple ways, so we define it
# with a name:
# docstyle-ignore
EMOTICONS = r"""
    (?:
      [<>]?                           # optional opening angle bracket
      [:;=8]                          # eyes
      [\-o\*\']?                      # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\]      # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\]      # mouth
      [\-o\*\']?                      # optional nose
      [:;=8]                          # eyes
      [<>]?                           # optional closing angle bracket
      |
      <3                               # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
# docstyle-ignore
URLS = r"""            # Capture 1: entire matched URL
  (?:
  https?:                     # URL protocol and colon
    (?:
      /{1,3}                     # 1-3 slashes
      |                         #   or
      [a-z0-9%]                     # Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |                         #   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:                         # One or more:
    [^\s()<>{}\[\]]+                 # Run of non-space, non-()<>{}[]
    |                         #   or

    \(
      [^\s()<>{}\[\]]+
    \)
  )+
  (?:                         # End with:
    \(
      [^\s()<>{}\[\]]+
    \)
    |                         #   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]
  )
"""

# The above pattern defines URLs using a regex for tokenization purposes,
# covering various formats and components typically found in URLs.
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # 匹配具有平衡括号的一级深度的表达式:(...(...)...)
    |
    \([^\s]+?\)                # 匹配非递归的平衡括号表达式:(...)
  )+                          # 上述两种模式可以出现一次或多次,即匹配多个括号嵌套或单个括号
  (?:                          # 结尾处可以是以下模式之一:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # 匹配具有平衡括号的一级深度的表达式:(...(...)...)
    |
    \([^\s]+?\)                # 匹配非递归的平衡括号表达式:(...)
    |                          # 或者
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]    # 不是空格或特定的标点字符
  )
  |                          # 或者,用于匹配裸域名:
  (?:
    (?<!@)                    # 前面不是 @,避免在电子邮件地址中匹配例如 "foo@_gmail.com_"
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)                    # 后面不是 @,避免在电子邮件地址中匹配例如 "foo.na" 在 "foo.na@example.com" 中
  )


这段代码是一个正则表达式模式,用于匹配具有特定形式的括号结构和裸域名。
# 定义正则表达式模式以识别不同类型的标记
# 包括 URL、电话号码、ASCII 表情、HTML 标签、ASCII 箭头、Twitter 用户名、Twitter 主题标签、电子邮件地址等
REGEXPS = (
    URLS,  # 匹配 URL
    r"""
    (?:
      (?:            # (国际)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (区号)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # 交换机
      [ *\-.\)]*
      \d{4}          # 基站
    )""",  # 匹配电话号码
    EMOTICONS,  # 匹配 ASCII 表情
    r"""<[^>\s]+>""",  # 匹配 HTML 标签
    r"""[\-]+>|<[\-]+""",  # 匹配 ASCII 箭头
    r"""(?:@[\w_]+)""",  # 匹配 Twitter 用户名
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",  # 匹配 Twitter 主题标签
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",  # 匹配电子邮件地址
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # 带有撇号或破折号的单词
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # 数字,包括分数、小数点
    |
    (?:[\w_]+)                     # 没有撇号或破折号的单词
    |
    (?:\.(?:\s*\.){1,})            # 省略号
    |
    (?:\S)                         # 其他非空白字符
    """,  # 匹配剩余的词类
)

######################################################################
# 这是核心的分词正则表达式:

# 将 REGEXPS 中的所有模式组合成一个大的正则表达式
WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE)

# HANG_RE 用于识别连续字符的模式
HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}")

# EMOTICON_RE 用于识别表情符号的模式
EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE)

# ENT_RE 用于将 HTML 实体转换为 Unicode 字符的模式
ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);")
# The helper `_replace_html_entities` (its implementation is not reproduced in this excerpt)
# uses ENT_RE to convert HTML character entities into the corresponding Unicode characters,
# as in its original doctest:
#
#     >>> from nltk.tokenize.casual import _replace_html_entities
#     >>> _replace_html_entities(b"Price: &pound;100")
#     'Price: \\xa3100'
#     >>> print(_replace_html_entities(b"Price: &pound;100"))
#     Price: £100

class TweetTokenizer:
    r"""
    Examples:

    ```
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using *strip_handles* and *reduce_len parameters*:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```"""

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        # Initialize the TweetTokenizer with options to preserve case, reduce elongated words, and strip handles.
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        Tokenize a given text into a list of words.

        Args:
            text: str

        Returns:
            list(str): A list of tokens extracted from the text.
        """
        # Fix HTML character entities before tokenization
        text = _replace_html_entities(text)
        # Remove Twitter handles if strip_handles is enabled
        if self.strip_handles:
            text = remove_handles(text)
        # Reduce elongated words to their base form if reduce_len is enabled
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Replace problematic sequences of characters for safe tokenization
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Tokenize the text using a regular expression for word boundaries
        words = WORD_RE.findall(safe_text)
        # Adjust word case unless it is part of an emoticon to preserve emoticon capitalization
        if not self.preserve_case:
            words = [x if EMOTICON_RE.search(x) else x.lower() for x in words]
        return words


######################################################################
# Normalization Functions
######################################################################

def reduce_lengthening(text):
    """
    Reduce repeated character sequences of length 3 or greater to sequences of length 3.

    Args:
        text: str

    Returns:
        str: Text with reduced elongations.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.

    Args:
        text: str

    Returns:
        str: Text with removed handles replaced by spaces.
    """
    pattern = regex.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure correct tokenization around removed handles
    return pattern.sub(" ", text)


######################################################################
# Tokenization Function
######################################################################

def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Tokenize a text string using casual tokenization rules.

    Args:
        text: str
        preserve_case: bool, optional (default=True)
            Whether to preserve case in tokens.
        reduce_len: bool, optional (default=False)
            Whether to reduce elongated words.
        strip_handles: bool, optional (default=False)
            Whether to remove Twitter handles.

    Returns:
        list(str): A list of tokens extracted from the text based on specified rules.
    """
    # Convenience wrapper: build a TweetTokenizer configured with the requested case handling,
    # elongation reduction and handle stripping, then tokenize the text with it
    return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
        text
    )
###############################################################################
