# Import the required modules and libraries
import json  # handle JSON-formatted data
from typing import List, Optional, Tuple  # type hints

from tokenizers import normalizers  # normalizers from the tokenizers library
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer  # pre-tokenizer classes

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # fast tokenizer base class
from ...utils import logging  # logging utilities
from .tokenization_roformer import RoFormerTokenizer  # slow RoFormer tokenizer
from .tokenization_utils import JiebaPreTokenizer  # jieba-based pre-tokenizer

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Names of the vocabulary and tokenizer files used by RoFormer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Map from pretrained model identifiers to the download URLs of their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
        "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
        "junnyu/roformer_chinese_char_small": (
            "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_base": (
            "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_discriminator": (
            "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_generator": (
            "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
        ),
    }
}

# Maximum positional-embedding sizes of the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "junnyu/roformer_chinese_small": 1536,
    "junnyu/roformer_chinese_base": 1536,
    "junnyu/roformer_chinese_char_small": 512,
    "junnyu/roformer_chinese_char_base": 512,
    "junnyu/roformer_small_discriminator": 128,
    "junnyu/roformer_small_generator": 128,
}

# Initialization configuration of the pretrained models (whether to lowercase the input)
PRETRAINED_INIT_CONFIGURATION = {
    "junnyu/roformer_chinese_small": {"do_lower_case": True},
    "junnyu/roformer_chinese_base": {"do_lower_case": True},
    "junnyu/roformer_chinese_char_small": {"do_lower_case": True},
    "junnyu/roformer_chinese_char_base": {"do_lower_case": True},
    "junnyu/roformer_small_discriminator": {"do_lower_case": True},
    "junnyu/roformer_small_generator": {"do_lower_case": True},
}


class RoFormerTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).
# `RoFormerTokenizerFast`几乎与`BertTokenizerFast`相同,实现端到端的分词:
# 标点符号分割和WordPiece。它们在处理中文时有些差异。
# 此分词器继承自`PreTrainedTokenizerFast`,其中包含大部分主要方法。用户应该
# 参考这个超类以获取有关这些方法的更多信息。
# 示例:
#
# ```
# >>> from transformers import RoFormerTokenizerFast
#
# >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
# >>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
# ```
vocab_files_names = VOCAB_FILES_NAMES # 获取词汇文件的名称列表
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP # 获取预训练词汇文件的映射
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES # 获取预训练位置嵌入的最大模型输入尺寸
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION # 获取预训练初始化配置
slow_tokenizer_class = RoFormerTokenizer # 慢速分词器类为RoFormerTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent constructor to set up the basic tokenizer parameters
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Load the normalizer state of the backend tokenizer from JSON
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the normalizer's lowercase / strip_accents settings differ from the current ones, update them
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
# Replace the backend tokenizer's normalizer with the updated one
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# Make sure the custom PreTokenizer is set correctly
vocab = self.backend_tokenizer.get_vocab()
self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
self.do_lower_case = do_lower_case
def __getstate__(self):
state = self.__dict__.copy()
# Replace the pre_tokenizer with BertPreTokenizer() so the state can be pickled
state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
return state
def __setstate__(self, d):
self.__dict__ = d
# Get the vocabulary of the current tokenizer
vocab = self.__dict__["_tokenizer"].get_vocab()
# Restore the custom JiebaPreTokenizer as the pre_tokenizer
self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`,*optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens."""
# Initialize output with CLS token ID, token_ids_0, and SEP token ID
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# If token_ids_1 is provided, concatenate token_ids_1 and SEP token ID
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
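To make the special-token layout concrete, here is a minimal, hedged usage sketch (it assumes network access to the `junnyu/roformer_chinese_base` checkpoint; the exact ids depend on the vocabulary):

```python
from transformers import RoFormerTokenizerFast

tok = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
ids_a = tok.convert_tokens_to_ids(tok.tokenize("今天天气非常好。"))
ids_b = tok.convert_tokens_to_ids(tok.tokenize("出去走走吧。"))

single = tok.build_inputs_with_special_tokens(ids_a)       # [CLS] A [SEP]
pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)  # [CLS] A [SEP] B [SEP]
assert pair[0] == tok.cls_token_id and pair[-1] == tok.sep_token_id
```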
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`,*optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s)."""
# Define SEP and CLS tokens as lists
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If token_ids_1 is None, return a list of zeros corresponding to token_ids_0 + CLS + SEP
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return a concatenated list of zeros for token_ids_0 + CLS + SEP and ones for token_ids_1 + SEP
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
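Continuing the sketch from above, the token type ids are 0 over `[CLS] A [SEP]` and 1 over `B [SEP]`:

```python
type_ids = tok.create_token_type_ids_from_sequences(ids_a, ids_b)
assert len(type_ids) == len(pair)
assert set(type_ids[: len(ids_a) + 2]) == {0} and set(type_ids[len(ids_a) + 2 :]) == {1}
```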
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to a directory.
Args:
save_directory (str):
Directory to save the vocabulary files.
filename_prefix (str,*optional*):
Prefix for the vocabulary files.
Returns:
`Tuple[str]`: Tuple of file paths where the vocabulary was saved."""
# Save the model vocabulary using the tokenizer's save method
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def save_pretrained(
self,
save_directory,
legacy_format=None,
filename_prefix=None,
push_to_hub=False,
**kwargs,
):
"""
Save the pretrained model and its tokenizer.
Args:
save_directory (str):
Directory to save the pretrained model.
legacy_format (str,*optional*):
Legacy format compatibility.
filename_prefix (str,*optional*):
Prefix for the saved files.
push_to_hub (bool):
Whether to push the saved model to the Hugging Face model hub.
**kwargs:
Additional arguments passed to the superclass method.
Returns:
`Any`: Output of the superclass's `save_pretrained` method."""
# Set the pre_tokenizer to BertPreTokenizer before saving
self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
# Call the superclass's save_pretrained method with the specified arguments
return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
.\models\roformer\tokenization_utils.py
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utils for RoFormer."""

from typing import List

from tokenizers import NormalizedString, PreTokenizedString, normalizers


class JiebaPreTokenizer:
    def __init__(self, vocab) -> None:
        self.vocab = vocab
        # BERT-style normalizer used to clean up text and handle Chinese characters;
        # accents are kept and the text is not lowercased
        self.normalizers = normalizers.BertNormalizer(
            clean_text=False,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
        )
        try:
            import rjieba
        except ImportError:
            # If rjieba cannot be imported, raise an ImportError with installation instructions
            raise ImportError(
                "You need to install rjieba to use RoFormerTokenizer. "
                "See https://pypi.org/project/rjieba/ for installation."
            )
        # On success, keep a reference to rjieba
        self.jieba = rjieba

    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        splits = []
        # Tokenize the normalized string with rjieba; hmm=False trades accuracy for speed
        for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False):
            # If the token is in the vocabulary, keep the corresponding NormalizedString slice
            if token in self.vocab:
                splits.append(normalized_string[start:end])
            else:
                # Otherwise normalize the token and split it into whitespace-separated pieces
                token_list = self.normalizers.normalize_str(token).split()
                for token in token_list:
                    if token:
                        end = start + len(token)
                        splits.append(normalized_string[start:end])
                        start = end
        # Return the list of NormalizedString pieces
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        # Split the PreTokenizedString with the jieba_split method above
        pretok.split(self.jieba_split)
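As a quick illustration of how this pre-tokenizer plugs into a `tokenizers` pipeline, here is a hedged sketch using a toy vocabulary (a real RoFormer `vocab.txt` would normally be loaded, and `rjieba` must be installed):

```python
from tokenizers import Tokenizer, models
from tokenizers.pre_tokenizers import PreTokenizer

from transformers.models.roformer.tokenization_utils import JiebaPreTokenizer

# Toy WordPiece vocabulary, for illustration only
vocab = {"[UNK]": 0, "今天": 1, "天气": 2, "非常": 3, "好": 4, "。": 5}
backend = Tokenizer(models.WordPiece(vocab, unk_token="[UNK]"))
backend.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))

# Words found by jieba that are in the vocabulary stay whole, e.g. ['今天', '天气', '非常', '好', '。']
print(backend.encode("今天天气非常好。").tokens)
```

Note that custom pre-tokenizers cannot be serialized, which is why `__getstate__`/`save_pretrained` above temporarily swap in a `BertPreTokenizer`.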
.\models\rwkv\configuration_rwkv.py

# coding=utf-8
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RWKV configuration"""

# Import the PretrainedConfig base class and the logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Map from RWKV pretrained model identifiers to their configuration files
RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "RWKV/rwkv-4-169m-pile": "https://huggingface.co/RWKV/rwkv-4-169m-pile/resolve/main/config.json",
    "RWKV/rwkv-4-430m-pile": "https://huggingface.co/RWKV/rwkv-4-430m-pile/resolve/main/config.json",
    "RWKV/rwkv-4-1b5-pile": "https://huggingface.co/RWKV/rwkv-4-1b5-pile/resolve/main/config.json",
    "RWKV/rwkv-4-3b-pile": "https://huggingface.co/RWKV/rwkv-4-3b-pile/resolve/main/config.json",
    "RWKV/rwkv-4-7b-pile": "https://huggingface.co/RWKV/rwkv-4-7b-pile/resolve/main/config.json",
    "RWKV/rwkv-4-14b-pile": "https://huggingface.co/RWKV/rwkv-4-14b-pile/resolve/main/config.json",
    "RWKV/rwkv-raven-1b5": "https://huggingface.co/RWKV/rwkv-raven-1b5/resolve/main/config.json",
    "RWKV/rwkv-raven-3b": "https://huggingface.co/RWKV/rwkv-raven-3b/resolve/main/config.json",
    "RWKV/rwkv-raven-7b": "https://huggingface.co/RWKV/rwkv-raven-7b/resolve/main/config.json",
    "RWKV/rwkv-raven-14b": "https://huggingface.co/RWKV/rwkv-raven-14b/resolve/main/config.json",
}


# Configuration class that stores the configuration of a RWKV model
class RwkvConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the RWKV-4
[RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""# 定义 RWKV 模型类型
model_type ="rwkv"# 映射模型属性,将 "max_position_embeddings" 映射到类中的 "context_length"
attribute_map ={"max_position_embeddings":"context_length"}# RWKV 模型的配置类,包含了模型的各种参数设置def__init__(
self,
vocab_size=50277,  # vocabulary size, defaults to 50277
context_length=1024,  # maximum sequence length the model can handle, defaults to 1024
hidden_size=4096,  # dimensionality of the embeddings and hidden states
num_hidden_layers=32,  # number of hidden layers in the model, defaults to 32
attention_hidden_size=None,  # dimensionality of the attention hidden states, defaults to hidden_size
intermediate_size=None,  # dimensionality of the inner feed-forward layers, defaults to 4 * hidden_size
layer_norm_epsilon=1e-5,  # epsilon used by the layer-normalization layers, defaults to 1e-5
bos_token_id=0,  # id of the beginning-of-sentence token in the vocabulary, defaults to 0
eos_token_id=0,  # id of the end-of-sentence token in the vocabulary, defaults to 0
rescale_every=6,  # at inference, divide hidden states and the matching output weights by 2 every `rescale_every` layers
tie_word_embeddings=False,  # whether to tie the word embeddings with the input token embeddings, defaults to False
use_cache=True,  # whether the model should return the last state, defaults to True
**kwargs,  # any additional keyword arguments
):
# Store the model parameters: vocabulary size, context length, hidden size, number of layers,
# and attention hidden size (defaults to the hidden size when not specified)
self.vocab_size = vocab_size
self.context_length = context_length
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
# If the intermediate size is not specified, default to four times the hidden size
self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
self.layer_norm_epsilon = layer_norm_epsilon
self.rescale_every = rescale_every
self.use_cache = use_cache
# Set the ids of the model's special tokens (beginning- and end-of-sentence)
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
# Call the parent constructor, forwarding arguments such as whether to tie the word embeddings
# and the ids of the beginning- and end-of-sentence tokens
super().__init__(
tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
)
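For reference, a minimal sketch of instantiating this configuration (the small dimensions below are arbitrary, chosen only so the example runs quickly):

```python
from transformers import RwkvConfig, RwkvModel

config = RwkvConfig(vocab_size=1000, context_length=256, hidden_size=64, num_hidden_layers=2)
model = RwkvModel(config)
# context_length is exposed as max_position_embeddings through the attribute_map above
print(config.max_position_embeddings)  # 256
```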
.\models\rwkv\convert_rwkv_checkpoint_to_hf.py
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format."""

import argparse  # command-line argument parsing
import gc  # garbage collection
import json  # JSON handling
import os  # operating-system interaction
import re  # regular expressions

import torch  # PyTorch

from huggingface_hub import hf_hub_download  # download files from the HF Hub
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint

# Map from model size to the number of hidden layers
NUM_HIDDEN_LAYERS_MAPPING = {
    "169M": 12,
    "430M": 24,
    "1B5": 24,
    "3B": 32,
    "7B": 32,
    "14B": 40,
}

# Map from model size to the hidden size
HIDEN_SIZE_MAPPING = {
    "169M": 768,
    "430M": 1024,
    "1B5": 2048,
    "3B": 2560,
    "7B": 4096,
    "14B": 5120,
}


def convert_state_dict(state_dict):
    state_dict_keys = list(state_dict.keys())
    for name in state_dict_keys:
        weight = state_dict.pop(name)
        # Rename the parameters to match the Hugging Face model format
        # emb -> embedding
        if name.startswith("emb."):
            name = name.replace("emb.", "embeddings.")
        # ln_0 -> pre_ln (only present at block 0)
        if name.startswith("blocks.0.ln0"):
            name = name.replace("blocks.0.ln0", "blocks.0.pre_ln")
        # att -> attention
        name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", name)
        # ffn -> feed_forward
        name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", name)
        # time_mix_k -> time_mix_key
        if name.endswith(".time_mix_k"):
            name = name.replace(".time_mix_k", ".time_mix_key")
        # time_mix_v -> time_mix_value
        if name.endswith(".time_mix_v"):
            name = name.replace(".time_mix_v", ".time_mix_value")
        # time_mix_r -> time_mix_receptance
        if name.endswith(".time_mix_r"):
            name = name.replace(".time_mix_r", ".time_mix_receptance")
        # Prefix everything except the head to mark it as part of the RWKV backbone
        if name != "head.weight":
            name = "rwkv." + name
        state_dict[name] = weight

    return state_dict
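A toy demonstration of the renaming rules above (the weights are dummies; only the keys matter):

```python
import torch

toy = {
    "emb.weight": torch.zeros(1),
    "blocks.0.ln0.weight": torch.zeros(1),
    "blocks.3.att.time_mix_k": torch.zeros(1),
    "blocks.3.ffn.key.weight": torch.zeros(1),
    "head.weight": torch.zeros(1),
}
print(sorted(convert_state_dict(toy).keys()))
# ['head.weight', 'rwkv.blocks.0.pre_ln.weight', 'rwkv.blocks.3.attention.time_mix_key',
#  'rwkv.blocks.3.feed_forward.key.weight', 'rwkv.embeddings.weight']
```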
def convert_rmkv_checkpoint_to_hf_format(
    repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None
):
    # 1. If possible, build the tokenizer.
    if tokenizer_file is None:
        # No tokenizer file was given: fall back to the default GPT-NeoX tokenizer
        print("No `--tokenizer_file` provided, we will use the default tokenizer.")
        vocab_size = 50277
        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    else:
        # Load the tokenizer with PreTrainedTokenizerFast from the provided file
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
        # The vocabulary size is the length of the tokenizer
        vocab_size = len(tokenizer)
    # Save the tokenizer to the output directory
    tokenizer.save_pretrained(output_dir)

    # 2. Build the config
    # List of possible model sizes
    possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys())
    if size is None:
        # Try to infer the size from the checkpoint file name
        for candidate in possible_sizes:
            if candidate in checkpoint_file:
                size = candidate
                break
        if size is None:
            raise ValueError("Could not infer the size, please provide it with the `--size` argument.")
    if size not in possible_sizes:
        raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.")
    # Create the RwkvConfig that defines the model architecture
    config = RwkvConfig(
        vocab_size=vocab_size,
        num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
        hidden_size=HIDEN_SIZE_MAPPING[size],
    )
    # Save the config to the output directory
    config.save_pretrained(output_dir)

    # 3. Download the model file from the HF Hub and convert the state_dict
    model_file = hf_hub_download(repo_id, checkpoint_file)
    state_dict = torch.load(model_file, map_location="cpu")
    state_dict = convert_state_dict(state_dict)

    # 4. Split into shards and save them
    shards, index = shard_checkpoint(state_dict)
    for shard_file, shard in shards.items():
        torch.save(shard, os.path.join(output_dir, shard_file))
    if index is not None:
        # If there is an index, save it to the output directory as well
        save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)

    # 5. Clean up the shards (sometimes the files PyTorch saves take as much space as the full state_dict)
    print(
        "Cleaning up shards. This may error with an OOM error, if this is the case don't worry you still have converted the model."
    )
    shard_files = list(shards.keys())
    # Free memory before reloading the shards
    del state_dict
    del shards
    gc.collect()

    # Reload each shard and save it again, making sure the tensors live on CPU
    for shard_file in shard_files:
        state_dict = torch.load(os.path.join(output_dir, shard_file))
        torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file))

    del state_dict
    gc.collect()

    # Optionally push the converted model and tokenizer to the HF Hub
    if push_to_hub:
        if model_name is None:
            raise ValueError("Please provide a `model_name` to push the model to the Hub.")
        model = AutoModelForCausalLM.from_pretrained(output_dir)
        model.push_to_hub(model_name, max_shard_size="2GB")
        tokenizer.push_to_hub(model_name)


# Entry point when the script is run directly
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required arguments
    parser.add_argument(
        "--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint."
    )
    parser.add_argument(
        "--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo."
    )
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True, help="Where to save the converted model."
    )
    parser.add_argument(
        "--tokenizer_file",
        default=None,
        type=str,
        help="Path to the tokenizer file to use (if not provided, only the model is converted).",
    )
    parser.add_argument(
        "--size",
        default=None,
        type=str,
        help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Push to the Hub the converted model.",
    )
    parser.add_argument(
        "--model_name",
        default=None,
        type=str,
        help="Name of the pushed model on the Hub, including the username / organization.",
    )

    # Parse the command-line arguments
    args = parser.parse_args()

    # Run the conversion with the parsed arguments
    convert_rmkv_checkpoint_to_hf_format(
        args.repo_id,
        args.checkpoint_file,
        args.output_dir,
        size=args.size,
        tokenizer_file=args.tokenizer_file,
        push_to_hub=args.push_to_hub,
        model_name=args.model_name,
    )
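An untested sketch of calling the converter directly instead of through the CLI; the repo id and checkpoint file name below are placeholders and must be replaced with a real BlinkDL checkpoint:

```python
from convert_rwkv_checkpoint_to_hf import convert_rmkv_checkpoint_to_hf_format

convert_rmkv_checkpoint_to_hf_format(
    repo_id="BlinkDL/rwkv-4-pile-169m",      # placeholder repo id
    checkpoint_file="RWKV-4-Pile-169M.pth",  # placeholder checkpoint name
    output_dir="./rwkv-4-169m-hf",
    size="169M",
)
```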
.\models\rwkv\modeling_rwkv.py
# coding=utf-8
# Copyright 2023 Bo Peng and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RWKV model."""

import math
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
# Import the pretrained-model base class
from ...modeling_utils import PreTrainedModel
# Import docstring helpers and other utilities
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_bitsandbytes_available,
is_ninja_available,
is_torch_cuda_available,
logging,
)
# Import the RWKV configuration class
from .configuration_rwkv import RwkvConfig

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Checkpoint and config references used in the documentation
_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile"
_CONFIG_FOR_DOC = "RwkvConfig"

# List of pretrained RWKV model archives
RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "RWKV/rwkv-4-169m-pile",
    "RWKV/rwkv-4-430m-pile",
    "RWKV/rwkv-4-1b5-pile",
    "RWKV/rwkv-4-3b-pile",
    "RWKV/rwkv-4-7b-pile",
    "RWKV/rwkv-4-14b-pile",
    "RWKV/rwkv-raven-1b5",
    "RWKV/rwkv-raven-3b",
    "RWKV/rwkv-raven-7b",
    "RWKV/rwkv-raven-14b",
    # See all RWKV models at https://huggingface.co/models?filter=rwkv
]

# The RWKV CUDA kernel is loaded lazily; None until load_wkv_cuda_kernel is called
rwkv_cuda_kernel = None


def load_wkv_cuda_kernel(context_length):
# Load the CUDA kernel through torch.utils.cpp_extension
from torch.utils.cpp_extension import load as load_kernel
global rwkv_cuda_kernel
# 获取 CUDA 核心文件夹路径
kernel_folder = Path(__file__).resolve().parent.parent.parent /"kernels"/"rwkv"
cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]]
# If a kernel is already loaded for the same context length, there is nothing to do
if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length:
return
# Log that the RWKV CUDA kernel is being loaded
logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")
# CUDA compilation flags
flags = [
"-res-usage",
"--maxrregcount 60",
"--use_fast_math",
"-O3",
"-Xptxas -O3",
"--extra-device-vectorization",
f"-DTmax={context_length}",
]
# Load the CUDA kernel
rwkv_cuda_kernel = load_kernel(
name=f"wkv_{context_length}",
sources=cuda_kernel_files,
verbose=(logging.get_verbosity()== logging.DEBUG),
extra_cuda_cflags=flags,)
rwkv_cuda_kernel.max_seq_length = context_length
class RwkvLinearAttention(torch.autograd.Function):
@staticmethod
def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False):
# Read the batch size, sequence length and hidden size from the key tensor
batch_size, seq_len, hidden_size = key.size()
# The CUDA kernel cannot handle sequences longer than the compiled maximum
if seq_len > rwkv_cuda_kernel.max_seq_length:
raise ValueError(
f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
f"{rwkv_cuda_kernel.max_seq_length} with this model."
)
# batch_size * hidden_size must be a round multiple of min(hidden_size, 32)
if batch_size * hidden_size % min(hidden_size, 32) != 0:
raise ValueError(
f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
f"multiple of {min(hidden_size, 32)}."
)
# Remember the input dtype of the key tensor
ctx.input_dtype = key.dtype
# All tensors must live on CUDA devices to call the CUDA kernel
if (
time_decay.device.type != "cuda"
or time_first.device.type != "cuda"
or key.device.type != "cuda"
or value.device.type != "cuda"
):
raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.")
# Take the negative exponential of the time decay, as a contiguous float tensor
time_decay = -torch.exp(time_decay.float().contiguous())
# If the key is float16, convert time_first, key and value to float32
if key.dtype == torch.float16:
time_first = time_first.float()
key = key.float()
value = value.float()
# Make sure time_first, key and value are contiguous in memory
time_first = time_first.contiguous()
key = key.contiguous()
value = value.contiguous()
# Allocate the output tensor with the same layout as key; the CUDA kernel fills it in
output = torch.empty_like(key, memory_format=torch.contiguous_format)
if return_state or state is not None:
# If no state is provided, create an all-zero state and initialise its last slice to -1e38
if state is None:
state = torch.zeros(
batch_size,
hidden_size,3,
dtype=torch.float32,
device=key.device,
memory_format=torch.contiguous_format,
)
state[:, :, 2] -= 1e38
else:
# Otherwise stack the provided state tensors along a new dimension, contiguously
state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous()
# Pick the forward function that also handles the state, depending on the key dtype
if key.dtype == torch.bfloat16:
forward_func = rwkv_cuda_kernel.forward_with_state_bf16
else:
forward_func = rwkv_cuda_kernel.forward_with_state
# Run the CUDA kernel forward pass, passing the state along
forward_func(time_decay, time_first, key, value, output, state)
else:
# Otherwise pick the stateless forward function matching the key dtype
forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward
# Run the CUDA kernel forward pass without a state
forward_func(time_decay, time_first, key, value, output)
# Save the inputs and output for the backward pass
ctx.save_for_backward(time_decay, time_first, key, value, output)
# If a state was used, split it back into its three components
if state is not None:
state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)]
# Return the output cast back to the input dtype, together with the state
return output.to(ctx.input_dtype), state
@staticmethod
def backward(ctx, g_output, g_state=None):
# "g" stands for gradient
# Recover the input dtype
input_dtype = ctx.input_dtype
# Restore the tensors saved during the forward pass
time_decay, time_first, key, value, output = ctx.saved_tensors
# The CUDA kernel will fill the gradient tensors below.
# Allocate gradient tensors matching the input dtype
g_time_decay = torch.empty_like(
time_decay,
memory_format=torch.contiguous_format,
dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32,
)
g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format)
g_key = torch.empty_like(key, memory_format=torch.contiguous_format)
g_value = torch.empty_like(value, memory_format=torch.contiguous_format)
# If the inputs were float16, convert the incoming gradient to float32
if input_dtype == torch.float16:
g_output = g_output.float()
# Pick the CUDA backward function matching the dtype
backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward
backward_func(
time_decay,
time_first,
key,
value,
output,
g_output.contiguous(),  # contiguous view of the incoming gradient
g_time_decay,
g_time_first,
g_key,
g_value,
)
# Cast the computed gradients back to the input dtype and return them
return (
g_time_decay.to(input_dtype),
g_time_first.to(input_dtype),
g_key.to(input_dtype),
g_value.to(input_dtype),
None,
None,
)


# CPU implementation of the linear key-value attention. When not executed under torch.no_grad, it can be slower
# and use more memory than the custom CUDA kernel.
def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False):
_, seq_length, _ = key.size()  # sequence length of the key tensor
output = torch.zeros_like(key)  # output tensor with the same shape as the key
if state is None:
# No state provided: initialise the numerator, denominator and running-maximum states
num_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
den_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38
else:
num_state, den_state, max_state = state  # otherwise reuse the provided state tensors
# For numerical stability
time_decay = -torch.exp(time_decay)
# Iterate over the sequence
for current_index in range(seq_length):
current_key = key[:, current_index].float()  # key at the current time step
current_value = value[:, current_index]  # value at the current time step
# Compute the linear key-value attention at time step t
max_for_output = torch.maximum(max_state, current_key + time_first)
e1 = torch.exp(max_state - max_for_output)
e2 = torch.exp(current_key + time_first - max_for_output)
numerator = e1 * num_state + e2 * current_value
denominator = e1 * den_state + e2
output[:, current_index] = (numerator / denominator).to(output.dtype)
# Update the state for the next iteration
max_for_state = torch.maximum(max_state + time_decay, current_key)
e1 = torch.exp(max_state + time_decay - max_for_state)
e2 = torch.exp(current_key - max_for_state)
num_state = e1 * num_state + e2 * current_value
den_state = e1 * den_state + e2
max_state = max_for_state
# If a state was requested or provided, return the updated state tensors as well
if return_state or state is not None:
state = [num_state, den_state, max_state]
return output, state
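A minimal usage sketch of the CPU WKV recurrence above, assuming `rwkv_linear_attention_cpu` is in scope; the tensors are random and only the shapes of the returned values are meaningful here:

```python
import torch

# Toy shapes: batch=1, seq_len=3, hidden=2
time_decay = torch.zeros(2)  # per-channel decay parameter (negated and exponentiated inside)
time_first = torch.zeros(2)  # per-channel "bonus" applied to the current token
key = torch.randn(1, 3, 2)
value = torch.randn(1, 3, 2)

out, state = rwkv_linear_attention_cpu(time_decay, time_first, key, value, return_state=True)
print(out.shape)                 # torch.Size([1, 3, 2])
print([s.shape for s in state])  # three (1, 2) tensors: numerator, denominator, running max
```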
# Dispatcher for the linear key-value attention: picks the CPU or CUDA implementation
def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False):
# Check whether any tensor is not on a CUDA device, and whether the key has a single token
no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value])
one_token = key.size(1) == 1
# Fall back to the CPU implementation when no CUDA kernel is loaded, some tensor is not on CUDA,
# or there is only one token to process
if rwkv_cuda_kernel is None or no_cuda or one_token:
return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state)
else:
# Otherwise use the custom CUDA implementation
return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state)


# RWKV time-mixing ("self-attention") layer
class RwkvSelfAttention(nn.Module):
def __init__(self, config, layer_id=0):
# Call the parent constructor
super().__init__()
# Keep the configuration on the module
self.config = config
# Check whether a CUDA kernel is already loaded for the configured context length
kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length
# If ninja and a CUDA device are available and no kernel is loaded yet, try to compile the custom kernel
if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded:
try:
load_wkv_cuda_kernel(config.context_length)
except Exception:
logger.info("Could not load the custom CUDA kernel for RWKV attention.")
# Keep the layer id on the module
self.layer_id = layer_id
# Hidden size from the config
hidden_size = config.hidden_size
# Attention hidden size; defaults to the hidden size when not specified
attention_hidden_size = (
config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size
)
self.attention_hidden_size = attention_hidden_size
# Time-decay parameter of the attention mechanism
self.time_decay = nn.Parameter(torch.empty(attention_hidden_size))
# Time-first ("bonus") parameter of the attention mechanism
self.time_first = nn.Parameter(torch.empty(attention_hidden_size))
# Time-mix parameters for key, value and receptance
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
# Time-shift layer: zero padding along the time dimension only
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
# Linear projections from the hidden size to the attention hidden size, without bias
self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
# Output projection back to the hidden size, without bias
self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)

# TODO: maybe jit, otherwise move inside forward
def extract_key_value(self, hidden, state=None):
# Mix the current hidden states with those of the previous time step to build key, value and receptance
if hidden.size(1) == 1 and state is not None:
# Single time step with a state: take the previous value for this layer from the state
shifted = state[1][:, :, self.layer_id]
else:
# Otherwise shift the hidden states one step back in time with the padding layer
shifted = self.time_shift(hidden)
# If a state is provided, mix the previous step's value into the first position
if state is not None:
shifted[:, 0] = state[1][:, :, self.layer_id]
# Mix current and shifted hidden states with the time-mix parameters
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
# Project key, value and receptance to the attention hidden size; the receptance goes through a sigmoid
key = self.key(key)
value = self.value(value)
receptance = torch.sigmoid(self.receptance(receptance))
# If a state is provided, store the last hidden state of the current step in it
if state is not None:
state[1][:, :, self.layer_id] = hidden[:, -1]
# Return receptance, key, value and the (possibly updated) state
return receptance, key, value, state
# Forward pass over the input hidden states, optionally using the cached state
def forward(self, hidden, state=None, use_cache=False):
# Extract receptance, key and value from the hidden states and update the state
receptance, key, value, state = self.extract_key_value(hidden, state=state)
# If a state exists, slice out this layer's attention state (numerator, denominator, max)
layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None
# Run the RWKV linear attention with the time-decay and time-first parameters
rwkv, layer_state = rwkv_linear_attention(
self.time_decay,
self.time_first,
key,
value,
state=layer_state,
return_state=use_cache,
)
# If a layer state was returned, write it back into the global state
if layer_state is not None:
state[2][:, :, self.layer_id] = layer_state[0]
state[3][:, :, self.layer_id] = layer_state[1]
state[4][:, :, self.layer_id] = layer_state[2]
# Return the output projection of receptance * rwkv, together with the state
return self.output(receptance * rwkv), state
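During generation the state passed between calls is a list of five tensors, each of shape `(batch_size, hidden_size, num_hidden_layers)`: slot 0 holds the previous hidden state for the feed-forward layers, slot 1 the previous hidden state for the attention layers, and slots 2-4 the WKV numerator, denominator and running maximum. A hedged sketch of building such an empty state by hand is shown below; this helper is hypothetical (the library builds the state internally), and the `-1e38` initialisation mirrors the CPU kernel above:

```python
import torch

def make_empty_state(config, batch_size, device="cpu", dtype=torch.float32):
    # Hypothetical helper, not part of the library API
    shape = (batch_size, config.hidden_size, config.num_hidden_layers)
    state = [torch.zeros(shape, dtype=dtype if i <= 1 else torch.float32, device=device) for i in range(5)]
    state[4] -= 1e38  # running maximum starts very negative, as in rwkv_linear_attention_cpu
    return state
```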
# Feed-forward (channel-mixing) module of RWKV
class RwkvFeedForward(nn.Module):
# Constructor taking the configuration and the layer id
def __init__(self, config, layer_id=0):
super().__init__()
# Keep the configuration and the layer id on the module
self.config = config
self.layer_id = layer_id
# Hidden and intermediate sizes from the config
hidden_size = config.hidden_size
intermediate_size = (
config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size
)
# Time-shift layer: zero padding along the time dimension
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
# Trainable time-mix parameters for key and receptance
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
# Linear layers producing the key, receptance and value
self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
self.value = nn.Linear(intermediate_size, hidden_size, bias=False)

# Forward pass over the hidden states and optional state
def forward(self, hidden, state=None):
# Single time step with a state: take the previous value for this layer from the state
if hidden.size(1) == 1 and state is not None:
shifted = state[0][:, :, self.layer_id]
else:
# Otherwise shift the hidden states back one step in time, patching in the state if provided
shifted = self.time_shift(hidden)
if state is not None:
shifted[:, 0] = state[0][:, :, self.layer_id]
# Mix current and shifted hidden states with the time-mix parameters to get key and receptance
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
# Squared ReLU on the key projection
key = torch.square(torch.relu(self.key(key)))
# Project the activated key to the value
value = self.value(key)
# Sigmoid on the receptance projection
receptance = torch.sigmoid(self.receptance(receptance))
# If a state is provided, store the last hidden state of the current step in it
if state is not None:
state[0][:, :, self.layer_id] = hidden[:, -1]
# Return the receptance-gated value and the state
return receptance * value, state
# A single RWKV block: attention (time mixing) followed by feed forward (channel mixing)
class RwkvBlock(nn.Module):
# Constructor taking the configuration and the layer id
def __init__(self, config, layer_id):
super().__init__()
# Keep the configuration and the layer id on the module
self.config = config
self.layer_id = layer_id

# Block 0 applies an extra LayerNorm to the embeddings before anything else
if layer_id == 0:
self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
# LayerNorms applied before the attention and the feed-forward sub-layers
self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
# Attention and feed-forward sub-modules
self.attention = RwkvSelfAttention(config, layer_id)
self.feed_forward = RwkvFeedForward(config, layer_id)

# Forward pass over the hidden states, with optional state, caching and attention outputs
def forward(self, hidden, state=None, use_cache=False, output_attentions=False):
# Block 0 normalizes the embeddings first
if self.layer_id == 0:
hidden = self.pre_ln(hidden)
# Attention sub-layer with residual connection
attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache)
hidden = hidden + attention
# Feed-forward sub-layer with residual connection
feed_forward, state = self.feed_forward(self.ln2(hidden), state=state)
hidden = hidden + feed_forward
# Collect the outputs: hidden states, state, and optionally the attention output
outputs = (hidden, state)
if output_attentions:
outputs += (attention,)
else:
outputs += (None,)
return outputs
# Abstract base class for RWKV models, built on PreTrainedModel
class RwkvPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading
pretrained models.
"""

# Configuration class for this model family
config_class = RwkvConfig
# Prefix of the base model
base_model_prefix = "rwkv"
# Modules that must not be split across devices
_no_split_modules = ["RwkvBlock"]
# Modules that must be kept in FP32
_keep_in_fp32_modules = ["time_decay", "time_first"]
# Gradient checkpointing is supported
supports_gradient_checkpointing = True

def _init_weights(self, module):
"""Initialize the weights."""
# RwkvSelfAttention modules
if isinstance(module, RwkvSelfAttention):
# Layer id and total number of hidden layers
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
attention_hidden_size = module.attention_hidden_size
# Ratio from 0 to 1 over the depth of the model
ratio_0_to_1 = layer_id / (num_hidden_layers - 1)  # 0 to 1
# Ratio from 1 to ~0 over the depth of the model
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

# Per-channel time weight used to initialise the time-mix parameters
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
# Decay speed per attention channel, scaled with the depth of the layer
decay_speed = [
-5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
for h in range(attention_hidden_size)
]
decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
# Zigzag pattern used for the time-first ("bonus") parameter
zigzag = (
torch.tensor(
[(i + 1) % 3 - 1 for i in range(attention_hidden_size)],
dtype=module.time_first.dtype,
device=module.time_first.device,
)
* 0.5
)
# Set the time-decay, time-first and time-mix parameters without tracking gradients
with torch.no_grad():
module.time_decay.data = decay_speed
module.time_first.data = torch.ones_like(module.time_first) * math.log(0.3) + zigzag
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
# RwkvFeedForward modules
elif isinstance(module, RwkvFeedForward):
# Layer id and total number of hidden layers
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
# Ratio from 1 to ~0 over the depth of the model
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

# Per-channel time weight used to initialise the time-mix parameters
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
# Set the time-mix parameters without tracking gradients
with torch.no_grad():
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)


# Data class wrapping the outputs of the RWKV model
@dataclass
class RwkvOutput(ModelOutput):
"""
Class for the RWKV model outputs.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""# 定义 RWKV 模型的输出属性
last_hidden_state: torch.FloatTensor =None# 最后一层模型的隐藏状态
state: Optional[List[torch.FloatTensor]]=None# 模型在最后时间步的状态
hidden_states: Optional[Tuple[torch.FloatTensor,...]]=None# 每层模型的隐藏状态
attentions: Optional[Tuple[torch.FloatTensor,...]]=None# 每层注意力权重"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""# 损失值,用于语言建模任务中的下一个标记预测,当提供了`labels`时返回
loss: Optional[torch.FloatTensor]=None# 语言建模头部的预测分数,即在应用SoftMax之前每个词汇标记的分数,形状为`(batch_size, sequence_length, config.vocab_size)`
logits: torch.FloatTensor =None# 模型在最后一个时间步的状态,可以在下一个`input_ids`的前向方法中使用,避免提供旧的`input_ids`
state: Optional[List[torch.FloatTensor]]=None# 模型每一层的隐藏状态的元组,包括(如果存在)嵌入层的输出,形状为`(batch_size, sequence_length, hidden_size)`
hidden_states: Optional[Tuple[torch.FloatTensor,...]]=None# 注意力权重的元组,用于自注意力头部中的加权平均计算,形状为`(batch_size, num_heads, sequence_length, sequence_length)`
attentions: Optional[Tuple[torch.FloatTensor,...]]=None# RWKV_START_DOCSTRING 定义了一个多行字符串,用于描述某个模型类的文档字符串。# 文档字符串解释了该模型继承自 PreTrainedModel,列出了该库对所有模型实现的通用方法(如下载或保存模型、调整输入嵌入、剪枝头部等)。# 这个模型也是 PyTorch 的 torch.nn.Module 的子类,可以像普通的 PyTorch 模块一样使用,所有与一般使用和行为相关的事项请参考 PyTorch 文档。
RWKV_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            This is currently not used by `RwkvModel`, but will be supported in the future.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# RwkvModel: the bare RWKV transformer, built on RwkvPreTrainedModel
@add_start_docstrings(
"The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
RWKV_START_DOCSTRING,
)
class RwkvModel(RwkvPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Embedding layer with the configured vocabulary size and hidden size
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
# Stack of RwkvBlock layers, one per hidden layer in the config
self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
# Final LayerNorm applied to the hidden states
self.ln_out = nn.LayerNorm(config.hidden_size)

# Whether the layer weights are currently rescaled for inference
self.layers_are_rescaled = False
# Whether gradient checkpointing is enabled
self.gradient_checkpointing = False

# Run the remaining initialization (weight init and final processing)
self.post_init()

# Return the input embedding layer
def get_input_embeddings(self):
return self.embeddings

# Set a new input embedding layer
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvOutput,
config_class=_CONFIG_FOR_DOC,
)
# The forward method takes the inputs documented above and runs the model's forward pass
def forward(
self,
input_ids: Optional[torch.LongTensor]=None,
attention_mask: Optional[torch.LongTensor]=None,# noqa
inputs_embeds: Optional[torch.FloatTensor]=None,
state: Optional[List[torch.FloatTensor]]=None,
use_cache: Optional[bool]=None,
output_attentions: Optional[bool]=None,
output_hidden_states: Optional[bool]=None,
return_dict: Optional[bool] = None,

def _rescale_layers(self):
# Layers should be rescaled for inference only.
if self.layers_are_rescaled == (not self.training):
return
# Only rescale when a rescaling interval is configured
if self.config.rescale_every > 0:
# Rescale without tracking gradients
with torch.no_grad():
# Iterate over the blocks of the model
for block_id, block in enumerate(self.blocks):
if self.training:
# During training, scale the weights up
block.attention.output.weight.mul_(2**int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
else:
# During inference, handle quantized weights through their quantization statistics
if hasattr(block.attention.output.weight, "SCB"):
block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
elif hasattr(block.attention.output.weight, "quant_state"):
# 4-bit weights: dequantize, rescale, and re-quantize
self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
else:
# Default case: divide the weights directly
block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))
# Record whether the layers are currently rescaled
self.layers_are_rescaled = not self.training

def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
r"""
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
"""# Check if bitsandbytes library is availableifnot is_bitsandbytes_available():raise ImportError("Please install bitsandbytes to use this method.")import bitsandbytes as bnb
# Dequantize 4-bit weights
dequant_weights = bnb.functional.dequantize_4bit(target_layer.weight.data, target_layer.weight.quant_state)# Rescale weights
dequant_weights.div_(2**int(block_id // self.config.rescale_every))# Re-quantize the weights# Move weights to CPU and back to device to handle quantization
quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)setattr(target_layer,"weight", quant_weight)@add_start_docstrings("""
The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
RWKV_START_DOCSTRING,
)
class RwkvForCausalLM(RwkvPreTrainedModel):
_tied_weights_keys = ["head.weight"]

def __init__(self, config):
super().__init__(config)
self.rwkv = RwkvModel(config)  # the RWKV backbone
self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # language-modeling output layer

# Initialize weights and apply final processing
self.post_init()

def get_output_embeddings(self):
return self.head  # return the output embedding layer

def set_output_embeddings(self, new_embeddings):
self.head = new_embeddings  # set a new output embedding layer

def generate(self, *args, **kwargs):
# Thin wrapper to raise exceptions when trying to generate with methods that manipulate `past_key_values`.
# RWKV is one of the few models that don't have it (it has `state` instead, which has different properties and
# usage).
try:
gen_output = super().generate(*args, **kwargs)
except AttributeError as exc:
# Expected exception: "AttributeError: '(object name)' object has no attribute 'past_key_values'"
if "past_key_values" in str(exc):
raise AttributeError(
"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`. RWKV "
"doesn't have that attribute, try another generation strategy instead. For the available "
"generation strategies, check this doc: https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
)
else:
raise exc
return gen_output

def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs):
# Only keep the last token of input_ids when a state is passed along
if state is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
# If `inputs_embeds` are passed, only use them in the first generation step
if inputs_embeds is not None and state is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Attach the state to the model inputs
model_inputs["state"] = state
return model_inputs
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor]=None,
attention_mask: Optional[torch.LongTensor]=None,# noqa
inputs_embeds: Optional[torch.FloatTensor]=None,
state: Optional[List[torch.FloatTensor]]=None,
labels: Optional[torch.LongTensor]=None,
use_cache: Optional[bool]=None,
output_attentions: Optional[bool]=None,
output_hidden_states: Optional[bool]=None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, RwkvCausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""# 如果 return_dict 为 None,则使用模型配置中的默认值
return_dict = return_dict if return_dict isnotNoneelse self.config.use_return_dict
# 调用 rwkv 方法进行前向传播
rwkv_outputs = self.rwkv(
input_ids,
inputs_embeds=inputs_embeds,
state=state,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the RWKV outputs
hidden_states = rwkv_outputs[0]
# Project the hidden states to the vocabulary with the language-modeling head
logits = self.head(hidden_states)

# Compute the loss when labels are provided
loss = None
if labels is not None:
# Move the labels to the same device as the logits (to enable model parallelism)
labels = labels.to(logits.device)
# Shift the logits one position so they align with the labels
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Cross-entropy loss over the flattened tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

# If return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + rwkv_outputs[1:]
return ((loss,) + output) if loss is not None else output

# Otherwise return a RwkvCausalLMOutput
return RwkvCausalLMOutput(
loss=loss,
logits=logits,
state=rwkv_outputs.state,
hidden_states=rwkv_outputs.hidden_states,
attentions=rwkv_outputs.attentions,
)
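A hedged end-to-end generation sketch with this head; `RWKV/rwkv-4-169m-pile` is the checkpoint referenced by `_CHECKPOINT_FOR_DOC` above, and the run requires network access to download it:

```python
from transformers import AutoTokenizer, RwkvForCausalLM

tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-169m-pile")
model = RwkvForCausalLM.from_pretrained("RWKV/rwkv-4-169m-pile")

inputs = tokenizer("In a shocking finding, scientists discovered", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```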
.\models\sam\configuration_sam.py

# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" SAM model configuration"""

# Import the PretrainedConfig base class and the logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Map from SAM pretrained model identifiers to their configuration files
SAM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/sam-vit-huge": "https://huggingface.co/facebook/sam-vit-huge/resolve/main/config.json",
    "facebook/sam-vit-large": "https://huggingface.co/facebook/sam-vit-large/resolve/main/config.json",
    "facebook/sam-vit-base": "https://huggingface.co/facebook/sam-vit-base/resolve/main/config.json",
}


class SamPromptEncoderConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration defaults will
    yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
    """

def __init__(
self,
hidden_size=256,
image_size=1024,
patch_size=16,
mask_input_channels=16,
num_point_embeddings=4,
hidden_act="gelu",
layer_norm_eps=1e-6,
**kwargs,
):
# Call the parent constructor, forwarding all keyword arguments
super().__init__(**kwargs)
# Hidden size
self.hidden_size = hidden_size
# Image size
self.image_size = image_size
# Patch size
self.patch_size = patch_size
# Image embedding size: image size divided by patch size
self.image_embedding_size = image_size // patch_size
# Number of mask input channels
self.mask_input_channels = mask_input_channels
# Number of point embeddings
self.num_point_embeddings = num_point_embeddings
# Hidden activation function
self.hidden_act = hidden_act
# Epsilon used by the layer normalization layers
self.layer_norm_eps = layer_norm_eps
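A small sketch of instantiating this configuration with its defaults; the derived `image_embedding_size` is `image_size // patch_size`:

```python
from transformers.models.sam.configuration_sam import SamPromptEncoderConfig

prompt_config = SamPromptEncoderConfig()
print(prompt_config.hidden_size, prompt_config.image_embedding_size)  # 256 64
```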
# `SamMaskDecoderConfig` stores the configuration of a `SamMaskDecoder`.
# It inherits from `PretrainedConfig` and is used to control the model outputs.
# Instantiating it with the defaults yields a configuration similar to that of the
# `facebook/sam-vit-huge` architecture.
class SamMaskDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
"""# 初始化方法,用于设置配置参数def__init__(
self,
hidden_size=256,
hidden_act="relu",
mlp_dim=2048,
num_hidden_layers=2,
num_attention_heads=8,
attention_downsample_rate=2,
num_multimask_outputs=3,
iou_head_depth=3,
iou_head_hidden_dim=256,
layer_norm_eps=1e-6,
**kwargs,
):
# Call the parent class initializer
super().__init__(**kwargs)
# Set each configuration parameter
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_dim = mlp_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.attention_downsample_rate = attention_downsample_rate
self.num_multimask_outputs = num_multimask_outputs
self.iou_head_depth = iou_head_depth
self.iou_head_hidden_dim = iou_head_hidden_dim
self.layer_norm_eps = layer_norm_eps
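# A hedged usage sketch (illustrative only, assuming `SamMaskDecoderConfig` is importable
# from `transformers`): every argument is optional, so overriding a single field keeps the
# remaining SAM-vit-h style defaults shown above:
#
#     >>> from transformers import SamMaskDecoderConfig
#     >>> decoder_cfg = SamMaskDecoderConfig(num_multimask_outputs=3)
#     >>> (decoder_cfg.hidden_size, decoder_cfg.iou_head_depth)
#     (256, 3)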
class SamVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the SAM ViT-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
output_channels (`int`, *optional*, defaults to 256):
Dimensionality of the output channels in the Patch Encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input image.
image_size (`int`, *optional*, defaults to 1024):
Expected resolution of the input image.
patch_size (`int`, *optional*, defaults to 16):
Size of the patches extracted from the input image.
hidden_act (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities (0.0 means no dropout is used).
initializer_range (`float`, *optional*, defaults to 1e-10):
The standard deviation of the truncated normal distribution used to initialize all weight matrices.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the query, key, value projections.
mlp_ratio (`float`, *optional*, defaults to 4.0):
Ratio of the MLP hidden dimensionality to the embedding dimensionality.
use_abs_pos (`bool`, *optional*, defaults to `True`):
Whether to use absolute position embeddings.
use_rel_pos (`bool`, *optional*, defaults to `True`):
Whether to use relative position embeddings.
window_size (`int`, *optional*, defaults to 14):
Window size for the relative positions.
global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
The indexes of the global attention layers.
num_pos_feats (`int`, *optional*, defaults to 128):
Dimensionality of the position embeddings.
mlp_dim (`int`, *optional*):
Dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio * hidden_size`.
"""

# Initialization function that sets the various parameters of the Transformer model
def __init__(
self,
hidden_size=768,  # hidden size, defaults to 768
output_channels=256,  # number of output channels, defaults to 256
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
num_channels=3,  # number of input image channels, defaults to 3 (RGB)
image_size=1024,  # input image size, defaults to 1024x1024 pixels
patch_size=16,  # patch size, defaults to 16x16 pixels
hidden_act="gelu",  # hidden activation function, defaults to GELU
layer_norm_eps=1e-06,  # layer normalization epsilon, defaults to 1e-06
attention_dropout=0.0,  # attention dropout ratio, defaults to 0.0 (no dropout)
initializer_range=1e-10,  # weight initialization range, defaults to 1e-10
qkv_bias=True,  # whether to use a bias in the QKV projections, defaults to True
mlp_ratio=4.0,  # MLP expansion ratio, defaults to 4.0
use_abs_pos=True,  # whether to use absolute position embeddings, defaults to True
use_rel_pos=True,  # whether to use relative position embeddings, defaults to True
window_size=14,  # local attention window size, defaults to 14
global_attn_indexes=[2, 5, 8, 11],  # indexes of the global attention layers, defaults to [2, 5, 8, 11]
num_pos_feats=128,  # number of position features, defaults to 128
mlp_dim=None,  # MLP dimensionality; if None, defaults to hidden_size * mlp_ratio, otherwise the given value is used
**kwargs,  # any other unspecified keyword arguments
):
super().__init__(**kwargs)  # call the parent class initializer
self.hidden_size = hidden_size  # set the hidden size
self.output_channels = output_channels  # set the number of output channels
self.num_hidden_layers = num_hidden_layers  # set the number of hidden layers
self.num_attention_heads = num_attention_heads  # set the number of attention heads
self.num_channels = num_channels  # set the number of input image channels
self.image_size = image_size  # set the input image size
self.patch_size = patch_size  # set the patch size
self.hidden_act = hidden_act  # set the hidden activation function
self.layer_norm_eps = layer_norm_eps  # set the layer normalization epsilon
self.attention_dropout = attention_dropout  # set the attention dropout ratio
self.initializer_range = initializer_range  # set the weight initialization range
self.qkv_bias = qkv_bias  # set whether to use a bias in the QKV projections
self.mlp_ratio = mlp_ratio  # set the MLP expansion ratio
self.use_abs_pos = use_abs_pos  # set whether to use absolute position embeddings
self.use_rel_pos = use_rel_pos  # set whether to use relative position embeddings
self.window_size = window_size  # set the local attention window size
self.global_attn_indexes = global_attn_indexes  # set the indexes of the global attention layers
self.num_pos_feats = num_pos_feats  # set the number of position features
self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim  # if mlp_dim is not given, compute it as hidden_size * mlp_ratio
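# A small illustrative sketch (assumes `SamVisionConfig` is importable from `transformers`):
# with the defaults above, `mlp_dim` is derived as `int(hidden_size * mlp_ratio)` whenever it
# is not given explicitly, and an explicit value takes precedence:
#
#     >>> from transformers import SamVisionConfig
#     >>> vision_cfg = SamVisionConfig()
#     >>> vision_cfg.mlp_dim  # int(768 * 4.0)
#     3072
#     >>> SamVisionConfig(mlp_dim=2048).mlp_dim
#     2048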
# Define the `SamConfig` class, which stores the configuration of a `SamModel` and inherits from `PretrainedConfig`.
class SamConfig(PretrainedConfig):
# Docstring describing what `SamConfig` does, how to use it, and how to instantiate the related SAM model parameters.
r"""
[`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamVisionConfig`].
prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import (
...     SamVisionConfig,
...     SamPromptEncoderConfig,
...     SamMaskDecoderConfig,
...     SamModel,
... )

>>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
>>> configuration = SamConfig()

>>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
>>> model = SamModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

>>> # Initializing SAM vision, prompt-encoder, and mask-decoder configurations
>>> vision_config = SamVisionConfig()
>>> prompt_encoder_config = SamPromptEncoderConfig()
>>> mask_decoder_config = SamMaskDecoderConfig()

>>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
```"""
# Class attribute `model_type`, which specifies the model type as "sam".
model_type = "sam"

# Constructor `__init__`, used to initialize an instance of the `SamConfig` class.
def __init__(
self,
vision_config=None,
prompt_encoder_config=None,
mask_decoder_config=None,
initializer_range=0.02,
**kwargs,
):
# Call the parent constructor, passing through all keyword arguments
super().__init__(**kwargs)

# If vision_config is not None, use it; otherwise fall back to an empty dict
vision_config = vision_config if vision_config is not None else {}
# If prompt_encoder_config is not None, use it; otherwise fall back to an empty dict
prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
# If mask_decoder_config is not None, use it; otherwise fall back to an empty dict
mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}

# If vision_config is an instance of SamVisionConfig, convert it to a dict
if isinstance(vision_config, SamVisionConfig):
    vision_config = vision_config.to_dict()
# If prompt_encoder_config is an instance of SamPromptEncoderConfig, convert it to a dict
if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
    prompt_encoder_config = prompt_encoder_config.to_dict()
# If mask_decoder_config is an instance of SamMaskDecoderConfig, convert it to a dict
if isinstance(mask_decoder_config, SamMaskDecoderConfig):
    mask_decoder_config = mask_decoder_config.to_dict()

# Build a SamVisionConfig object from the vision_config dict
self.vision_config = SamVisionConfig(**vision_config)
# Build a SamPromptEncoderConfig object from the prompt_encoder_config dict
self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
# Build a SamMaskDecoderConfig object from the mask_decoder_config dict
self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
# Set the initializer_range instance attribute
self.initializer_range = initializer_range
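# A hedged composition sketch (assumes the classes are importable from `transformers`):
# sub-configs may be passed either as plain dicts or as config instances; instances are
# converted via `.to_dict()` and then re-expanded into the typed sub-config objects above:
#
#     >>> from transformers import SamConfig, SamVisionConfig
#     >>> config = SamConfig(vision_config={"hidden_size": 384})
#     >>> config.vision_config.hidden_size
#     384
#     >>> config = SamConfig(vision_config=SamVisionConfig())
#     >>> config.vision_config.mlp_dim
#     3072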