.\models\encodec\__init__.py
# Copyright notice and license: this code is copyrighted by the HuggingFace team
#
# Licensed under the Apache License, Version 2.0; you may not use this file except in compliance with the License
# You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, the software is distributed on an "AS IS" basis,
# without warranties or conditions of any kind, either express or implied
# See the License for the specific language governing permissions and limitations
from typing import TYPE_CHECKING
# Import the required helpers from the utils module
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_torch_available,
)
# Define the module's import structure
_import_structure = {
"configuration_encodec": [
"ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP",
"EncodecConfig",
],
"feature_extraction_encodec": ["EncodecFeatureExtractor"],
}
# Check whether torch is available; raise OptionalDependencyNotAvailable if it is not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If torch is available, add the modeling module to the import structure
_import_structure["modeling_encodec"] = [
"ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST",
"EncodecModel",
"EncodecPreTrainedModel",
]
# Under type checking, import the symbols directly
if TYPE_CHECKING:
from .configuration_encodec import (
ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP,
EncodecConfig,
)
from .feature_extraction_encodec import EncodecFeatureExtractor
# Check torch availability again; skip the modeling imports if it is missing
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If torch is available, import the modeling symbols
from .modeling_encodec import (
ENCODEC_PRETRAINED_MODEL_ARCHIVE_LIST,
EncodecModel,
EncodecPreTrainedModel,
)
else:
# Outside type checking, import sys and replace this module with a _LazyModule instance
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
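The `_LazyModule` swap means `import transformers.models.encodec` stays cheap: submodules like `modeling_encodec` are only imported when one of their symbols is first accessed. A minimal sketch of the same deferral pattern, using a simplified `LazyModule` rather than the real `_LazyModule` (which additionally handles `module_spec`, `__dir__`, and failure messages):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Defers importing submodules until one of their symbols is accessed."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported symbol to the submodule that defines it.
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items()
            for symbol in symbols
        }

    def __getattr__(self, attr):
        submodule = self._symbol_to_module.get(attr)
        if submodule is None:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}")
        # The heavy import happens only now, on first access.
        module = importlib.import_module(f"{self.__name__}.{submodule}")
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache, so __getattr__ is skipped next time
        return value
```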
.\models\encoder_decoder\configuration_encoder_decoder.py
# Set the file encoding to UTF-8
# Copyright 2020 The HuggingFace Inc. team.
# Copyright 2018 NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software is distributed on an "AS IS" basis,
# without warranties or conditions of any kind, either express or implied. See the License for details.
# Import the required modules
from ...configuration_utils import PretrainedConfig
from ...utils import logging
# Get a logger for the current module
logger = logging.get_logger(__name__)
# Define the EncoderDecoderConfig class, inheriting from PretrainedConfig
class EncoderDecoderConfig(PretrainedConfig):
r"""
[`EncoderDecoderConfig`]是用于存储[`EncoderDecoderModel`]配置的配置类。它用于根据指定的参数实例化编码器和解码器模型。
配置对象继承自[`PretrainedConfig`],可用于控制模型输出。有关更多信息,请阅读[`PretrainedConfig`]的文档。
Args:
kwargs (*可选参数*):
关键字参数的字典。特别是:
- **encoder** ([`PretrainedConfig`],*可选*) -- 定义编码器配置的配置对象实例。
- **decoder** ([`PretrainedConfig`],*可选*) -- 定义解码器配置的配置对象实例。
Examples:
```
>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
>>> # 初始化一个Bert google-bert/bert-base-uncased风格的配置
>>> config_encoder = BertConfig()
>>> config_decoder = BertConfig()
>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
>>> # 初始化一个Bert2Bert模型(带有随机权重),从google-bert/bert-base-uncased风格的配置开始
>>> model = EncoderDecoderModel(config=config)
>>> # 访问模型配置
>>> config_encoder = model.config.encoder
>>> config_decoder = model.config.decoder
>>> # 将解码器配置设置为因果语言模型
>>> config_decoder.is_decoder = True
>>> config_decoder.add_cross_attention = True
>>> # 保存模型,包括其配置
>>> model.save_pretrained("my-model")
>>> # 从预训练文件夹加载模型和配置
>>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained("my-model")
>>> model = EncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
```"""
# The model type is "encoder-decoder"
model_type = "encoder-decoder"
# This config is a composition of sub-configs
is_composition = True
# Constructor; forwards keyword arguments to the parent class
def __init__(self, **kwargs):
# Call the parent constructor
super().__init__(**kwargs)
# Assert that the kwargs contain both "encoder" and "decoder", otherwise raise
assert (
"encoder" in kwargs and "decoder" in kwargs
), "Config has to be initialized with encoder and decoder config"
# Pop the "encoder" config dict from the kwargs
encoder_config = kwargs.pop("encoder")
# Pop the encoder's model type out of its config dict
encoder_model_type = encoder_config.pop("model_type")
# Pop the "decoder" config dict and its model type
decoder_config = kwargs.pop("decoder")
decoder_model_type = decoder_config.pop("model_type")
# Import AutoConfig lazily to avoid a circular import
from ..auto.configuration_auto import AutoConfig
# Rebuild the encoder config object through AutoConfig
self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config)
# Rebuild the decoder config object through AutoConfig
self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config)
# Flag this configuration as an encoder-decoder architecture
self.is_encoder_decoder = True
# Class method: build an encoder-decoder config from pretrained encoder and decoder configs
@classmethod
def from_encoder_decoder_configs(
cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
) -> PretrainedConfig:
r"""
Instantiate a [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model configuration and
decoder model configuration.
Returns:
[`EncoderDecoderConfig`]: An instance of a configuration object
"""
# Log that `is_decoder=True` and `add_cross_attention=True` are being set on the decoder config
logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
# Mark the decoder config as a decoder and enable cross-attention
decoder_config.is_decoder = True
decoder_config.add_cross_attention = True
# Construct and return an encoder-decoder config from the serialized sub-configs
return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
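Note that `from_encoder_decoder_configs` serializes both sub-configs with `to_dict()`, and `__init__` rebuilds them through `AutoConfig.for_model` using the popped `model_type`. A short sketch of that round trip (model classes as in the docstring example above):

```python
from transformers import BertConfig, EncoderDecoderConfig, GPT2Config

config = EncoderDecoderConfig.from_encoder_decoder_configs(BertConfig(), GPT2Config())
assert config.is_encoder_decoder
assert config.decoder.is_decoder and config.decoder.add_cross_attention

# Re-serializing and reconstructing yields equivalent sub-configs,
# because to_dict() stores model_type and for_model() dispatches on it.
rebuilt = EncoderDecoderConfig(
    encoder=config.encoder.to_dict(), decoder=config.decoder.to_dict()
)
assert rebuilt.encoder.model_type == "bert" and rebuilt.decoder.model_type == "gpt2"
```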
.\models\encoder_decoder\modeling_encoder_decoder.py
# Set the encoding to UTF-8 so the script handles all characters correctly
# Copyright and license notice: this code is released under the Apache License, Version 2.0
# The full license text is available at the URL given in the license header
# You may not use this file except in compliance with the License
# Import the required modules and libraries
import gc # Python's garbage-collection module, used to release memory manually
import inspect # Introspection helpers, e.g. for reading function or class signatures
import os # Operating-system interaction helpers
import tempfile # Creation of temporary files and directories
import warnings # Warning handling
from typing import Optional, Tuple, Union # Type hints for parameters and return values
import torch # The PyTorch deep-learning library
from torch import nn # PyTorch's neural-network module
from torch.nn import CrossEntropyLoss # Cross-entropy loss function
# Import the related modules and functions from the Transformers library
from ...configuration_utils import PretrainedConfig # Pretrained configuration base class
from ...modeling_outputs import BaseModelOutput, Seq2SeqLMOutput # Model output classes
from ...modeling_utils import PreTrainedModel # Pretrained model base class
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings # Helpers and logging
from ..auto.configuration_auto import AutoConfig # Auto configuration
from ..auto.modeling_auto import AutoModel, AutoModelForCausalLM # Auto model classes
from .configuration_encoder_decoder import EncoderDecoderConfig # Encoder-decoder configuration class
# Get the logger for this module
logger = logging.get_logger(__name__)
# Configuration name used in the docstrings
_CONFIG_FOR_DOC = "EncoderDecoderConfig"
# Deprecation warning describing the change in loss computation since v4.12.0
DEPRECATION_WARNING = (
"Version v4.12.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
" encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
" fine-tuning a model trained with versions anterior to 4.12.0. The decoder_input_ids are now created based on the"
" labels, no need to pass them yourself anymore."
)
# Start of the Encoder-Decoder model docstring, as a raw string
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
[`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
"""
# Function that shifts the input token ids one position to the right
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
"""
Shift input ids one token to the right.
将输入的 token ids 向右移动一位。
"""
# 创建一个与 input_ids 相同形状的零张量 shifted_input_ids
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
# 将 input_ids 的除第一列外的数据复制到 shifted_input_ids 的第二列开始
shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
# 如果 decoder_start_token_id 为 None,则抛出 ValueError
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
# 将 shifted_input_ids 的第一列设置为 decoder_start_token_id
shifted_input_ids[:, 0] = decoder_start_token_id
# 如果 pad_token_id 为 None,则抛出 ValueError
if pad_token_id is None:
raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
# 将 shifted_input_ids 中可能的 -100 值替换为 pad_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
# 返回向右移动后的 input ids
return shifted_input_ids
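A quick worked example of the shift (values chosen arbitrarily): the start token is prepended, the last label is dropped, and any `-100` ignore-index that survives the shift is replaced by the pad id.

```python
import torch

labels = torch.tensor([[5, 6, -100]])
shifted = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=101)
print(shifted)  # tensor([[101,   5,   6]]) -- the trailing -100 falls off the end

labels = torch.tensor([[5, -100, -100]])
shifted = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=101)
print(shifted)  # tensor([[101,   5,   0]]) -- a shifted -100 becomes the pad id
```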
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
# Define the EncoderDecoderModel class, inheriting from PreTrainedModel
class EncoderDecoderModel(PreTrainedModel):
r"""
[`EncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
of the base model classes of the library as encoder and another one as decoder when created with the
:meth*~transformers.AutoModel.from_pretrained* class method for the encoder and
:meth*~transformers.AutoModelForCausalLM.from_pretrained* class method for the decoder.
EncoderDecoderModel 是一个通用的模型类,当使用 :meth:`~transformers.AutoModel.from_pretrained` 方法为编码器和
:meth:`~transformers.AutoModelForCausalLM.from_pretrained` 方法为解码器创建时,它将被实例化为一个转换器架构。
"""
# 类变量,指定配置类为 EncoderDecoderConfig
config_class = EncoderDecoderConfig
# 类变量,指定基础模型前缀为 "encoder_decoder"
base_model_prefix = "encoder_decoder"
# 类变量,主输入名称为 "input_ids"
main_input_name = "input_ids"
# 类变量,支持梯度检查点
supports_gradient_checkpointing = True
# 初始化方法
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[PreTrainedModel] = None,
decoder: Optional[PreTrainedModel] = None,
):
"""
Initialize the EncoderDecoderModel.
初始化 EncoderDecoderModel。
"""
# 如果需要,将编码器和解码器的权重绑定在一起
def tie_weights(self):
"""
Tie encoder & decoder if needed.
如果需要,将编码器和解码器的权重绑定在一起。
"""
if self.config.tie_encoder_decoder:
# 获取解码器基础模型的前缀
decoder_base_model_prefix = self.decoder.base_model_prefix
# 调用 _tie_encoder_decoder_weights 方法,将编码器和解码器的权重绑定在一起
self._tie_encoder_decoder_weights(
self.encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
)
# Getter for the encoder model
def get_encoder(self):
"""
Get the encoder model.
"""
return self.encoder
# Getter for the decoder model
def get_decoder(self):
"""
Get the decoder model.
"""
return self.decoder
# Getter for the input embeddings, delegated to the encoder
def get_input_embeddings(self):
"""
Get the input embeddings.
"""
return self.encoder.get_input_embeddings()
# Getter for the output embeddings, delegated to the decoder
def get_output_embeddings(self):
"""
Get the output embeddings.
"""
return self.decoder.get_output_embeddings()
# Setter for the output embeddings, delegated to the decoder
def set_output_embeddings(self, new_embeddings):
"""
Set the output embeddings.
"""
return self.decoder.set_output_embeddings(new_embeddings)
@classmethod
def from_encoder_decoder_pretrained(
cls,
encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
):
"""
Instantiate an encoder-decoder model from pretrained model configurations.
从预训练模型配置实例化一个编码器-解码器模型。
"""
pass
@add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
# The model's forward pass, which produces the model outputs
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.FloatTensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.BoolTensor] = None,
encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
):
# The forward pass takes the inputs above (token ids, attention masks, etc.)
# and returns the model outputs, such as generated logits and attention distributions
# Prepare the decoder input token ids from the labels
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
# Prepare the inputs needed for generation, building the input dict for the generate loop
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
):
# Delegate to the decoder to prepare its generation inputs
decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
# Pick up the decoder attention mask if the decoder produced one
decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
# Build the input dict: encoder/decoder attention masks, decoder input ids, encoder outputs, etc.
input_dict = {
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_input_ids": decoder_inputs["input_ids"],
"encoder_outputs": encoder_outputs,
"past_key_values": decoder_inputs["past_key_values"],
"use_cache": use_cache,
}
return input_dict
# Resizing the token embeddings through this wrapper is deliberately unsupported
def resize_token_embeddings(self, *args, **kwargs):
raise NotImplementedError(
"Resizing the embedding layers via the EncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
# Reorder the cached key/values during beam search
def _reorder_cache(self, past_key_values, beam_idx):
# Delegate the cache reordering to the decoder
return self.decoder._reorder_cache(past_key_values, beam_idx)
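Putting the pieces together, the documented entry point is `from_encoder_decoder_pretrained` (truncated above), after which the generation-specific token ids must be set on the config. A minimal sketch; checkpoint names follow the Transformers docs:

```python
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
# The decoder's cross-attention layers come out freshly initialized here.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "google-bert/bert-base-uncased", "google-bert/bert-base-uncased"
)
# generate() needs these two ids; they are not set automatically.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

inputs = tokenizer("The tower is 324 metres tall.", return_tensors="pt")
generated = model.generate(inputs.input_ids, max_new_tokens=10)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```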
.\models\encoder_decoder\modeling_flax_encoder_decoder.py
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Classes to support Flax Encoder-Decoder architectures"""
import os
from typing import Optional, Tuple, Union
import flax.linen as nn # Flax's Linen module, used to define network structures
import jax # JAX, for autodiff and accelerated numerical computation
import jax.numpy as jnp # JAX's NumPy interface for multi-dimensional arrays
from flax.core.frozen_dict import FrozenDict, freeze, unfreeze # Flax frozen-dict helpers
from flax.traverse_util import flatten_dict, unflatten_dict # Flax dict flattening/unflattening utilities
from jax import lax # JAX's lax module, providing low-level numerical primitives
from jax.random import PRNGKey # JAX random-number generation
from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput # Output classes
from ...modeling_flax_utils import FlaxPreTrainedModel # Flax pretrained-model base class
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings # Helpers and logging
from ..auto.configuration_auto import AutoConfig # Auto configuration class
from ..auto.modeling_flax_auto import FlaxAutoModel, FlaxAutoModelForCausalLM # Auto model loading classes
from .configuration_encoder_decoder import EncoderDecoderConfig # Encoder-decoder configuration class
logger = logging.get_logger(__name__) # Logger for this module
_CONFIG_FOR_DOC = "EncoderDecoderConfig"
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
[`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`]
function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a Flax Linen
[flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
regular Flax Module and refer to the Flax documentation for all matters related to general usage and behavior.
"""
# Sketch of an EncoderDecoder class inheriting from FlaxPreTrainedModel
class EncoderDecoder(FlaxPreTrainedModel):
# Constructor: initialize the model from the given config
def __init__(self, config: EncoderDecoderConfig):
# Call the parent constructor with the config
super().__init__(config)
# The forward method runs inference: it takes the inputs and returns the outputs
def forward(
self,
# input token ids
input_ids: jnp.ndarray,
# attention mask
attention_mask: jnp.ndarray,
# token type ids
token_type_ids: jnp.ndarray = None,
# position ids
position_ids: jnp.ndarray = None,
# input embeddings
inputs_embeds: jnp.ndarray = None,
# whether to return attention weights
output_attentions: bool = False,
# whether to return all hidden states
output_hidden_states: bool = False,
# whether to return a dict-style output
return_dict: bool = False,
) -> Union[FlaxBaseModelOutput, Tuple[jnp.ndarray]]:
"""
The forward method runs model inference on a set of inputs and returns the outputs.
Parameters:
input_ids (jax.numpy.ndarray): Input token ids.
attention_mask (jax.numpy.ndarray): Attention mask indicating which positions are padding.
token_type_ids (jax.numpy.ndarray, optional): Token type ids, defaults to None.
position_ids (jax.numpy.ndarray, optional): Position ids, defaults to None.
inputs_embeds (jax.numpy.ndarray, optional): Input embeddings, defaults to None.
output_attentions (bool, optional): Whether to return attention weights, defaults to False.
output_hidden_states (bool, optional): Whether to return all hidden states, defaults to False.
return_dict (bool, optional): Whether to return a dict-style output, defaults to False.
Returns:
Union[FlaxBaseModelOutput, Tuple[jax.numpy.ndarray]]: The model outputs, in one of several formats.
"""
# The actual forward pass is not implemented in this sketch
raise NotImplementedError
# Class method for loading weights from a pretrained model
@classmethod
def from_pretrained(
cls,
# model path or identifier
pretrained_model_name_or_path: str,
# model configuration
config: Optional[EncoderDecoderConfig] = None,
# computation dtype, defaults to float32
dtype: Optional[jax.numpy.dtype] = jnp.float32,
# whether to use local files only
local_files_only: bool = False,
# optional authentication token
use_auth_token: Optional[Union[bool, str]] = None,
# any additional keyword arguments
**kwargs,
) -> "FlaxPreTrainedModel":
"""
Load model weights and configuration from a pretrained model.
Parameters:
pretrained_model_name_or_path (str): Path or identifier of the pretrained model.
config (Optional[EncoderDecoderConfig]): Model configuration, optional.
dtype (Optional[jax.numpy.dtype]): Computation dtype, defaults to jax.numpy.float32.
local_files_only (bool): Whether to use local files only, defaults to False.
use_auth_token (Optional[Union[bool, str]]): Optional authentication token, defaults to None.
**kwargs: Additional keyword arguments.
Returns:
FlaxPreTrainedModel: The loaded pretrained model.
"""
# If no config was provided, create an empty configuration object
if config is None:
config = EncoderDecoderConfig()
# Resolve the model URL or local path; note that hf_cache_or_filename and
# download_model_from_path are illustrative placeholders, not actual Transformers helpers
resolved_model_path = hf_cache_or_filename(pretrained_model_name_or_path, kwargs)
# Fetch the model file from the URL or local path
model_file = download_model_from_path(resolved_model_path, local_files_only=local_files_only)
# Load only the model configuration here, not the weights
model_config = cls.config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
# If a dtype was given, set the model's computation dtype accordingly
if dtype is not None:
model_config.dtype = dtype
# Build the model instance from the config
model = cls(config=model_config, **kwargs)
# If the weights file exists locally, load the weights
if os.path.isfile(model_file):
# load_flax_weights_in_model is likewise a placeholder for the real weight-loading logic
model_params = load_flax_weights_in_model(model, model_file)
# Return the model instance with its weights loaded
return model
# Convert the model parameters to half precision (float16)
def to_fp16(self):
"""
Convert the model parameters to half precision (float16).
Returns:
EncoderDecoder: The converted fp16 model instance.
"""
# Not implemented in this sketch
raise NotImplementedError
# Convert the model parameters to bfloat16
def to_bf16(self):
"""
Convert the model parameters to bfloat16.
Returns:
EncoderDecoder: The converted bf16 model instance.
"""
# Not implemented in this sketch
raise NotImplementedError
"""
ENCODER_DECODER_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be
created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id`
and prepending them with the `decoder_start_token_id`.
decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.encoder.max_position_embeddings - 1]`.
decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
range `[0, config.decoder.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
"""
ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r"""
Args:
input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.encoder.max_position_embeddings - 1]`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple.
"""
ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r"""
"""
# Define a Flax encoder-decoder module class
class FlaxEncoderDecoderModule(nn.Module):
# Class attributes: the EncoderDecoderConfig and a dtype defaulting to jnp.float32
config: EncoderDecoderConfig
dtype: jnp.dtype = jnp.float32
# Module setup
def setup(self):
# Get the encoder and decoder configs
encoder_config = self.config.encoder
decoder_config = self.config.decoder
# FLAX_MODEL_MAPPING and FLAX_MODEL_FOR_CAUSAL_LM_MAPPING come from the modeling_flax_auto module;
# encoder_module is the module class looked up in FLAX_MODEL_MAPPING by the encoder_config type
encoder_module = FLAX_MODEL_MAPPING[encoder_config.__class__].module_class
# decoder_module is the module class looked up in FLAX_MODEL_FOR_CAUSAL_LM_MAPPING by the decoder_config type
decoder_module = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING[decoder_config.__class__].module_class
# Instantiate the encoder and decoder from their module classes
self.encoder = encoder_module(encoder_config, dtype=self.dtype)
self.decoder = decoder_module(decoder_config, dtype=self.dtype)
# If the encoder's hidden size differs from the decoder's, and the decoder declares no
# cross_attention_hidden_size, define a Dense layer enc_to_dec_proj that projects the
# encoder outputs to the decoder's hidden size
if (
self.encoder.config.hidden_size != self.decoder.config.hidden_size
and self.decoder.config.cross_attention_hidden_size is None
):
self.enc_to_dec_proj = nn.Dense(
self.decoder.config.hidden_size,
kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range),
dtype=self.dtype,
)
else:
self.enc_to_dec_proj = None
# Accessor for the encoder module
def _get_encoder_module(self):
return self.encoder
# Accessor for the projection module
def _get_projection_module(self):
return self.enc_to_dec_proj
# Accessor for the decoder module
def _get_decoder_module(self):
return self.decoder
# Calling the module runs the encode-decode pass
def __call__(
self,
input_ids,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
position_ids,
decoder_position_ids,
output_attentions: bool = False,
output_hidden_states: bool = False,
return_dict: bool = True,
deterministic: bool = True,
):
# Run the encoder with the encoder-side inputs
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# Get the encoder hidden states
encoder_hidden_states = encoder_outputs[0]
# Optionally project the encoder hidden states to the decoder's hidden size
if self.enc_to_dec_proj is not None:
encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states)
# Run the decoder with the decoder-side inputs and the encoder hidden states
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=deterministic,
)
# If return_dict is False, return the concatenated decoder and encoder outputs
if not return_dict:
return decoder_outputs + encoder_outputs
# Otherwise return a FlaxSeq2SeqLMOutput carrying the decoder outputs and encoder information
return FlaxSeq2SeqLMOutput(
logits=decoder_outputs.logits,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
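The projection condition in `setup` only fires when the two hidden sizes differ and the decoder declares no `cross_attention_hidden_size` of its own. A config-level sketch of when `enc_to_dec_proj` would appear (hidden sizes chosen arbitrarily):

```python
from transformers import BertConfig, EncoderDecoderConfig, GPT2Config

# BERT-base uses hidden_size=768; this GPT-2 variant uses n_embd=1024,
# so the module would create enc_to_dec_proj to bridge 768 -> 1024.
mismatched = EncoderDecoderConfig.from_encoder_decoder_configs(
    BertConfig(hidden_size=768), GPT2Config(n_embd=1024)
)
assert mismatched.encoder.hidden_size != mismatched.decoder.hidden_size
assert mismatched.decoder.cross_attention_hidden_size is None  # the default

# With equal hidden sizes, no projection is needed and enc_to_dec_proj stays None.
matched = EncoderDecoderConfig.from_encoder_decoder_configs(
    BertConfig(hidden_size=768), GPT2Config(n_embd=768)
)
assert matched.encoder.hidden_size == matched.decoder.hidden_size
```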
# Add the shared docstring to FlaxEncoderDecoderModel via the decorator
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
# Define FlaxEncoderDecoderModel, inheriting from FlaxPreTrainedModel
class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
"""
[`FlaxEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with
the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one as
decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the
encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder.
"""
# The configuration class is EncoderDecoderConfig
config_class = EncoderDecoderConfig
# Prefix of the base model
base_model_prefix = "encoder_decoder"
# The module class is FlaxEncoderDecoderModule
module_class = FlaxEncoderDecoderModule
# Constructor
def __init__(
self,
config: EncoderDecoderConfig, # the configuration object
input_shape: Optional[Tuple] = None, # optional input-shape tuple
seed: int = 0, # random seed, defaults to 0
dtype: jnp.dtype = jnp.float32, # dtype, defaults to jnp.float32
_do_init: bool = True, # whether to initialize, defaults to True
**kwargs, # additional keyword arguments
):
# If no input shape was given, default to ((1, 1), (1, 1))
if input_shape is None:
input_shape = ((1, 1), (1, 1))
# A FlaxEncoderDecoderModel cannot be created uninitialized, so _do_init must be True
if not _do_init:
raise ValueError(
"`FlaxEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`."
)
# If the decoder config specifies a cross_attention_hidden_size
if config.decoder.cross_attention_hidden_size is not None:
# It must be equal to the encoder's hidden_size
if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size:
raise ValueError(
"If `cross_attention_hidden_size` is specified in the decoder's configuration, it has to be equal"
f" to the encoder's `hidden_size`. Got {config.decoder.cross_attention_hidden_size} for"
f" `config.decoder.cross_attention_hidden_size` and {config.encoder.hidden_size} for"
" `config.encoder.hidden_size`."
)
# Build the module object from the config and remaining keyword arguments
module = self.module_class(config=config, dtype=dtype, **kwargs)
# Call the parent FlaxPreTrainedModel constructor
super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
encoder_input_shape, decoder_input_shape = input_shape # unpack the input-shape tuple
# Initialize the encoder input tensors
input_ids = jnp.zeros(encoder_input_shape, dtype="i4") # all-zero integer tensor
attention_mask = jnp.ones_like(input_ids) # all-ones attention mask with the same shape as input_ids
# Initialize the decoder input tensors
decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4") # all-zero integer tensor
decoder_attention_mask = jnp.ones_like(decoder_input_ids) # all-ones attention mask matching decoder_input_ids
batch_size, sequence_length = input_ids.shape # batch size and sequence length of the encoder inputs
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) # broadcast position ids over the batch
decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape # batch size and sequence length of the decoder inputs
if not decoder_batch_size == batch_size: # the encoder and decoder batch sizes must match
raise ValueError(
f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder"
f" and {decoder_batch_size} for decoder."
)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length)
) # broadcast decoder position ids over the batch
params_rng, dropout_rng = jax.random.split(rng) # split the rng into parameter-init and dropout keys
rngs = {"params": params_rng, "dropout": dropout_rng} # rng dict
random_params = self.module.init( # initialize random parameters with the module's init method
rngs,
input_ids,
attention_mask,
decoder_input_ids,
decoder_attention_mask,
position_ids,
decoder_position_ids,
)["params"] # take the initialized parameters
if params is not None: # if predefined parameters were given
random_params = flatten_dict(unfreeze(random_params)) # unfreeze and flatten the random parameters
params = flatten_dict(unfreeze(params)) # unfreeze and flatten the predefined parameters
for missing_key in self._missing_keys: # for every missing key
params[missing_key] = random_params[missing_key] # fill the gap from the random parameters
self._missing_keys = set() # clear the missing-key set
return freeze(unflatten_dict(params)) # refreeze, unflatten, and return the merged parameters
else:
return random_params # otherwise return the randomly initialized parameters
# Initialize the cache used for auto-regressive decoding
def init_cache(self, batch_size, max_length, encoder_outputs):
r"""
Args:
batch_size (`int`):
batch_size used for fast auto-regressive decoding; defines the batch size of the initialized cache.
max_length (`int`):
maximum possible length for auto-regressive decoding; defines the sequence length of the initialized cache.
encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
`encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
`last_hidden_state`, of shape `(batch_size, sequence_length, hidden_size)`, is the output of the encoder's
last layer and is used in the cross-attention of the decoder.
"""
# Initialize dummy decoder inputs used to retrieve the cache
decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
decoder_attention_mask = jnp.ones_like(decoder_input_ids)
decoder_position_ids = jnp.broadcast_to(
jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
)
def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
# Get the decoder module
decoder_module = module._get_decoder_module()
return decoder_module(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
position_ids=decoder_position_ids,
**kwargs,
)
# Initialize the cache by running only the decoder
init_variables = self.module.init(
jax.random.PRNGKey(0),
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
decoder_position_ids=decoder_position_ids,
encoder_hidden_states=encoder_outputs[0],
init_cache=True,
method=_decoder_forward,
)
# Unfreeze and return the initialized cache
return unfreeze(init_variables["cache"])
@add_start_docstrings(ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC)
# Encode method: run the encoder over the inputs
def encode(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
r"""
Returns:
Example:
```
>>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
>>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
>>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-cased", "openai-community/gpt2")
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
>>> text = "My friends are cool but they eat too many carbs."
>>> input_ids = tokenizer.encode(text, return_tensors="np")
>>> encoder_outputs = model.encode(input_ids)
```"""
# Fall back to the config default when output_attentions is not given
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
# Fall back to the config default when output_hidden_states is not given
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
# Fall back to the config default when return_dict is not given
return_dict = return_dict if return_dict is not None else self.config.return_dict
# If no attention mask was provided, use an all-ones mask of the same shape as the inputs
if attention_mask is None:
attention_mask = jnp.ones_like(input_ids)
# If no position ids were provided, broadcast them from the input length
if position_ids is None:
batch_size, sequence_length = input_ids.shape
position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
# Handle any pseudo-random number generators (PRNGs) that are needed
rngs = {}
if dropout_rng is not None:
rngs["dropout"] = dropout_rng
# Define the encoder forward function
def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
# Get the encoder module
encode_module = module._get_encoder_module()
# Run the encoder module
return encode_module(input_ids, attention_mask, position_ids, **kwargs)
# Apply the module's forward pass with the given parameters and options
outputs = self.module.apply(
{"params": params or self.params},
input_ids=jnp.array(input_ids, dtype="i4"),
attention_mask=jnp.array(attention_mask, dtype="i4"),
position_ids=jnp.array(position_ids, dtype="i4"),
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
deterministic=not train,
rngs=rngs,
method=_encoder_forward,
)
# If a dict was requested, wrap the outputs in a FlaxBaseModelOutput
if return_dict:
outputs = FlaxBaseModelOutput(
last_hidden_state=outputs.last_hidden_state,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# Return the model outputs
return outputs
# Attach the decode-inputs docstring
@add_start_docstrings(ENCODER_DECODER_DECODE_INPUTS_DOCSTRING)
# Replace the return docstring: output type FlaxCausalLMOutputWithCrossAttentions, config class _CONFIG_FOR_DOC
@replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
# Decode method: turn the encoder outputs and decoder inputs into model outputs
def decode(
self,
decoder_input_ids,
encoder_outputs,
encoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
past_key_values: dict = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
# The decorators below attach the shared forward docstring and return docs to __call__
@add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def __call__(
self,
input_ids: jnp.ndarray,
attention_mask: Optional[jnp.ndarray] = None,
decoder_input_ids: Optional[jnp.ndarray] = None,
decoder_attention_mask: Optional[jnp.ndarray] = None,
position_ids: Optional[jnp.ndarray] = None,
decoder_position_ids: Optional[jnp.ndarray] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
train: bool = False,
params: dict = None,
dropout_rng: PRNGKey = None,
):
pass # The concrete implementation is omitted here; defined inside the class, it can access the other members
# Prepare the inputs for generation: initialize the cache, the attention masks, and the position ids
def prepare_inputs_for_generation(
self,
decoder_input_ids,
max_length,
attention_mask: Optional[jax.Array] = None,
decoder_attention_mask: Optional[jax.Array] = None,
encoder_outputs=None,
**kwargs,
):
# Initialize the cache that stores the decoder's past key/value pairs
batch_size, seq_length = decoder_input_ids.shape
past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
# Create an extended attention mask; generation fills it so attention only sees positions already produced
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
# Write the provided decoder attention mask into the extended mask
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
# Without a decoder attention mask, fall back to default position ids
decoder_position_ids = jnp.broadcast_to(
jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
)
return {
"past_key_values": past_key_values,
"encoder_outputs": encoder_outputs,
"encoder_attention_mask": attention_mask,
"decoder_attention_mask": extended_attention_mask,
"decoder_position_ids": decoder_position_ids,
}
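The `lax.dynamic_update_slice` call overwrites the leading positions of the all-ones mask with the user-provided mask, leaving trailing positions at 1 so tokens generated later remain attended to. A tiny numeric illustration (shapes chosen arbitrarily):

```python
import jax.numpy as jnp
from jax import lax

extended = jnp.ones((1, 5), dtype="i4")        # mask for max_length=5
provided = jnp.array([[1, 1, 0]], dtype="i4")  # user mask for 3 prompt tokens
updated = lax.dynamic_update_slice(extended, provided, (0, 0))
print(updated)  # [[1 1 0 1 1]]: prompt mask copied in, future slots stay 1

# Position ids come from the cumulative sum of the mask, minus one.
print(provided.cumsum(axis=-1) - 1)  # [[0 1 1]]
```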
# Update the model kwargs between generation steps
def update_inputs_for_generation(self, model_outputs, model_kwargs):
# Carry the past key/values from the model outputs into the kwargs
model_kwargs["past_key_values"] = model_outputs.past_key_values
# Advance the decoder position ids: keep only the last position and add one
model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
# Return the updated kwargs
return model_kwargs
# Create an instance from pretrained encoder and decoder checkpoints
@classmethod
def from_encoder_decoder_pretrained(
cls,
encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None,
*model_args,
**kwargs,
.\models\encoder_decoder\modeling_tf_encoder_decoder.py
# Set the file encoding to UTF-8
# Copyright notice: this code is owned by the HuggingFace Inc. team, under the Apache License, Version 2.0
# You may only use this file in compliance with the License
# A copy of the License is available at http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, the code is provided "AS IS",
# without warranties or conditions of any kind, either express or implied
# See the License for details
""" Classes to support TF Encoder-Decoder architectures"""
from __future__ import annotations # allow using a class's own name in its annotations
import inspect # introspection helpers
import re # regular expressions
import warnings # warning handling
from typing import Optional, Tuple, Union # type hints
import numpy as np # NumPy numerical library
import tensorflow as tf # TensorFlow deep-learning library
from ...configuration_utils import PretrainedConfig # pretrained configuration class
from ...modeling_tf_outputs import TFBaseModelOutput, TFSeq2SeqLMOutput # TensorFlow model output classes
from ...modeling_tf_utils import ( # TensorFlow model utilities
TFCausalLanguageModelingLoss,
TFModelInputType,
TFPreTrainedModel,
get_initializer,
keras,
unpack_inputs,
)
from ...tf_utils import shape_list # helper returning a tensor's shape as a list
from ...utils import ( # utility functions
ModelOutput,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from ..auto.configuration_auto import AutoConfig # auto configuration class
from ..auto.modeling_tf_auto import TFAutoModel, TFAutoModelForCausalLM # auto TensorFlow model classes
from .configuration_encoder_decoder import EncoderDecoderConfig # encoder-decoder configuration class
logger = logging.get_logger(__name__) # logger for this module
_CONFIG_FOR_DOC = "EncoderDecoderConfig" # 用于文档的配置名称
DEPRECATION_WARNING = (
"Version v4.17.0 introduces a better way to train encoder-decoder models by computing the loss inside the"
" encoder-decoder framework rather than in the decoder itself. You may observe training discrepancies if"
" fine-tuning a model trained with versions anterior to 4.17.0. The decoder_input_ids are now created based on the"
" labels, no need to pass them yourself anymore."
) # deprecation warning text
ENCODER_DECODER_START_DOCSTRING = r"""
This class can be used to initialize a sequence-to-sequence model with any pretrained autoencoding model as the
encoder and any pretrained autoregressive model as the decoder. The encoder is loaded via
[`~TFAutoModel.from_pretrained`] function and the decoder is loaded via [`~TFAutoModelForCausalLM.from_pretrained`]
function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream
generative task, like summarization.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation
tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi
Zhou, Wei Li, Peter J. Liu.
After such an Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models
(see the examples for more information).
This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
behavior.
Parameters:
config ([`EncoderDecoderConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
“”"
ENCODER_DECODER_INPUTS_DOCSTRING = r"“”
“”"
定义一个函数,用于将输入的token_ids向右移动,模拟decoder端的输入
def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
# 如果pad_token_id未设置,则抛出数值错误
if pad_token_id is None:
raise ValueError(“Make sure to set the pad_token_id attribute of the model’s configuration.”)
# 将pad_token_id转换为与input_ids相同的数据类型
pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
# 如果decoder_start_token_id未设置,则抛出数值错误
if decoder_start_token_id is None:
raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
# 将decoder_start_token_id转换为与input_ids相同的数据类型
decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
# 创建一个形状为(input_ids的行数, 1)的张量,每个元素均为decoder_start_token_id
start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
# 将start_tokens与input_ids的前几列合并,构成向右移动后的输入token_ids
shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
# 将shifted_input_ids中可能的-100值替换为pad_token_id
shifted_input_ids = tf.where(
shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids
)
# 断言shifted_input_ids中的值均大于等于0
assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
# 确保断言操作的调用,通过在结果中包装一个身份无操作
with tf.control_dependencies([assert_gte0]):
shifted_input_ids = tf.identity(shifted_input_ids)
return shifted_input_ids
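The TF variant behaves like its PyTorch counterpart above. A quick check with arbitrary values, run eagerly:

```python
import tensorflow as tf

labels = tf.constant([[5, 6, -100]])
shifted = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=101)
print(shifted.numpy())  # [[101   5   6]] -- start token prepended, last label dropped

labels = tf.constant([[5, -100, -100]])
shifted = shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=101)
print(shifted.numpy())  # [[101   5   0]] -- a shifted -100 becomes the pad id
```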
@add_start_docstrings(ENCODER_DECODER_START_DOCSTRING)
# TFEncoderDecoderModel class, inheriting from TFPreTrainedModel and TFCausalLanguageModelingLoss
class TFEncoderDecoderModel(TFPreTrainedModel, TFCausalLanguageModelingLoss):
r"""
[`TFEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with one
of the base model classes of the library as encoder and another one as decoder, created with the
[`~TFAutoModel.from_pretrained`] class method for the encoder and the [`~TFAutoModelForCausalLM.from_pretrained`]
class method for the decoder.
"""
# Class attribute: the configuration class is EncoderDecoderConfig
config_class = EncoderDecoderConfig
# The base model prefix is "encoder_decoder"
base_model_prefix = "encoder_decoder"
# The prefix used when loading weights is "tf_encoder_decoder_model"
load_weight_prefix = "tf_encoder_decoder_model"
# Constructor
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[TFPreTrainedModel] = None,
decoder: Optional[TFPreTrainedModel] = None,
):
super().__init__(config)
self.encoder = encoder
self.decoder = decoder
# Getter for the encoder
def get_encoder(self):
return self.encoder
# Getter for the decoder
def get_decoder(self):
return self.decoder
# Input embeddings getter, delegated to the encoder's get_input_embeddings
def get_input_embeddings(self):
return self.encoder.get_input_embeddings()
# Output embeddings getter, delegated to the decoder's get_output_embeddings
def get_output_embeddings(self):
return self.decoder.get_output_embeddings()
# Output embeddings setter, delegated to the decoder's set_output_embeddings
def set_output_embeddings(self, new_embeddings):
return self.decoder.set_output_embeddings(new_embeddings)
# Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
# (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
# However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
# here that the config model_type is the same as the name of the MainLayer. I don't know of anywhere that's
# not the case, and I wasn't sure how else to go from the config to the correct MainLayer name!
def tf_to_pt_weight_rename(self, tf_weight):
# This override is only needed in the case where we're crossloading weights from PT. However, since weights are
# often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
# Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
# or not.
encoder_model_type = self.config.encoder.model_type
if "encoder" in tf_weight and "decoder" not in tf_weight:
# If the weight name contains "encoder" but not "decoder", drop the encoder.model_type segment
return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),)
else:
# Otherwise return the original TF weight name unchanged
return (tf_weight,)
@classmethod
def from_encoder_decoder_pretrained(
cls,
encoder_pretrained_model_name_or_path: str = None,
decoder_pretrained_model_name_or_path: str = None,
*model_args,
**kwargs,
# Class method that creates a model from pretrained encoder and decoder names or paths
@unpack_inputs
@add_start_docstrings_to_model_forward(ENCODER_DECODER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
def call(
self,
input_ids: TFModelInputType | None = None,
attention_mask: np.ndarray | tf.Tensor | None = None,
decoder_input_ids: np.ndarray | tf.Tensor | None = None,
decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
encoder_outputs: np.ndarray | tf.Tensor | None = None,
past_key_values: Tuple[Tuple[tf.Tensor]] | None = None,
inputs_embeds: np.ndarray | tf.Tensor | None = None,
decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
labels: np.ndarray | tf.Tensor | None = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
training: bool = False,
**kwargs,
# The decorators on call() unpack the inputs and wire up the input/return docstrings
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
# Prepares the generation inputs: input ids, past key values, attention mask, cache flag, encoder outputs, etc.
):
# Prepare the decoder's generation inputs
decoder_inputs = self.decoder.prepare_inputs_for_generation(input_ids, past_key_values=past_key_values)
# Pick up the decoder attention mask if present
decoder_attention_mask = decoder_inputs["attention_mask"] if "attention_mask" in decoder_inputs else None
# Get the past key values
past_key_values = decoder_inputs.get("past_key_values")
# Some decoders expose the cache under "past" instead
if past_key_values is None:
past_key_values = decoder_inputs.get("past") # e.g. on TF GPT2
# Build the input dict passed on to Keras.layer.__call__
input_dict = {
"input_ids": None, # needs to be passed to make Keras.layer.__call__ happy
"attention_mask": attention_mask,
"decoder_attention_mask": decoder_attention_mask,
"decoder_input_ids": decoder_inputs["input_ids"],
# TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete
"encoder_outputs": TFBaseModelOutput(last_hidden_state=encoder_outputs[0]),
"past_key_values": past_key_values,
"use_cache": use_cache,
}
return input_dict
def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
# Build the decoder input ids from the labels by shifting them right
return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
def resize_token_embeddings(self, *args, **kwargs):
# Resizing the embedding layers directly through TFEncoderDecoderModel is not supported
raise NotImplementedError(
"Resizing the embedding layers via the TFEncoderDecoderModel directly is not supported. Please use the"
" respective methods of the wrapped objects (model.encoder.resize_token_embeddings(...) or"
" model.decoder.resize_token_embeddings(...))"
)
def _reorder_cache(self, past, beam_idx):
# Delegate the cache reordering to the decoder
return self.decoder._reorder_cache(past, beam_idx)
def build(self, input_shape=None):
# Return immediately if already built
if self.built:
return
self.built = True
# If an enc_to_dec_proj attribute exists, build the encoder-to-decoder projection
if getattr(self, "enc_to_dec_proj", None) is not None:
with tf.name_scope(self.enc_to_dec_proj.name):
self.enc_to_dec_proj.build([None, None, self.encoder.config.hidden_size])
# If an encoder attribute exists, build the encoder
if getattr(self, "encoder", None) is not None:
with tf.name_scope(self.encoder.name):
self.encoder.build(None)
# If a decoder attribute exists, build the decoder
if getattr(self, "decoder", None) is not None:
with tf.name_scope(self.decoder.name):
self.decoder.build(None)
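As with the PyTorch class, the usual entry point is `from_encoder_decoder_pretrained` (signature truncated above). A minimal sketch, with checkpoint names as used in the Transformers docs; the decoder's cross-attention weights come out freshly initialized:

```python
from transformers import BertTokenizer, TFEncoderDecoderModel

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
    "google-bert/bert-base-cased", "openai-community/gpt2"
)
# generate() needs these ids; BERT's [CLS] works as the start token here.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

inputs = tokenizer("My friends are cool but they eat too many carbs.", return_tensors="tf")
generated = model.generate(inputs.input_ids, max_new_tokens=10)
```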
.\models\encoder_decoder\__init__.py
# Copyright notice and license information
# Import the type-checking flag
from typing import TYPE_CHECKING
# Import the needed exceptions and helpers from the utils module
from ...utils import (
OptionalDependencyNotAvailable,
_LazyModule,
is_flax_available,
is_tf_available,
is_torch_available,
)
# Define the module's import structure, containing EncoderDecoderConfig from the configuration module
_import_structure = {"configuration_encoder_decoder": ["EncoderDecoderConfig"]}
# Check whether PyTorch is available; raise if not
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If PyTorch is available, register the EncoderDecoderModel import
_import_structure["modeling_encoder_decoder"] = ["EncoderDecoderModel"]
# Check whether TensorFlow is available; raise if not
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If TensorFlow is available, register the TFEncoderDecoderModel import
_import_structure["modeling_tf_encoder_decoder"] = ["TFEncoderDecoderModel"]
# Check whether Flax is available; raise if not
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
# If Flax is available, register the FlaxEncoderDecoderModel import
_import_structure["modeling_flax_encoder_decoder"] = ["FlaxEncoderDecoderModel"]
# Under type checking
if TYPE_CHECKING:
# Import the EncoderDecoderConfig type from this package
from .configuration_encoder_decoder import EncoderDecoderConfig
# If PyTorch is available, import the EncoderDecoderModel type from modeling_encoder_decoder
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_encoder_decoder import EncoderDecoderModel
# If TensorFlow is available, import the TFEncoderDecoderModel type from modeling_tf_encoder_decoder
try:
if not is_tf_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_tf_encoder_decoder import TFEncoderDecoderModel
# If Flax is available, import the FlaxEncoderDecoderModel type from modeling_flax_encoder_decoder
try:
if not is_flax_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_flax_encoder_decoder import FlaxEncoderDecoderModel
# Outside type checking, make this module lazily loaded
else:
import sys
# Dynamically replace this module with a _LazyModule instance
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
.\models\ernie\configuration_ernie.py
# Import the required modules and classes
from collections import OrderedDict # OrderedDict, for ordered dictionaries
from typing import Mapping # Mapping, for type hints
# Import the required configuration classes and modules from the Transformers library
from ...configuration_utils import PretrainedConfig # pretrained configuration class
from ...onnx import OnnxConfig # ONNX configuration class
from ...utils import logging # logging module
# Get the logger for this module
logger = logging.get_logger(__name__)
# Map from each pretrained ERNIE model name to the URL of its config file
ERNIE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"nghuyong/ernie-1.0-base-zh": "https://huggingface.co/nghuyong/ernie-1.0-base-zh/resolve/main/config.json",
"nghuyong/ernie-2.0-base-en": "https://huggingface.co/nghuyong/ernie-2.0-base-en/resolve/main/config.json",
"nghuyong/ernie-2.0-large-en": "https://huggingface.co/nghuyong/ernie-2.0-large-en/resolve/main/config.json",
"nghuyong/ernie-3.0-base-zh": "https://huggingface.co/nghuyong/ernie-3.0-base-zh/resolve/main/config.json",
"nghuyong/ernie-3.0-medium-zh": "https://huggingface.co/nghuyong/ernie-3.0-medium-zh/resolve/main/config.json",
"nghuyong/ernie-3.0-mini-zh": "https://huggingface.co/nghuyong/ernie-3.0-mini-zh/resolve/main/config.json",
"nghuyong/ernie-3.0-micro-zh": "https://huggingface.co/nghuyong/ernie-3.0-micro-zh/resolve/main/config.json",
"nghuyong/ernie-3.0-nano-zh": "https://huggingface.co/nghuyong/ernie-3.0-nano-zh/resolve/main/config.json",
"nghuyong/ernie-gram-zh": "https://huggingface.co/nghuyong/ernie-gram-zh/resolve/main/config.json",
"nghuyong/ernie-health-zh": "https://huggingface.co/nghuyong/ernie-health-zh/resolve/main/config.json",
}
# Define the ErnieConfig class, inheriting from PretrainedConfig
class ErnieConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`ErnieModel`] or a [`TFErnieModel`]. It is used to
instantiate a ERNIE model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the ERNIE
[nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""
# Example (mirrors the upstream docstring example):
# ```
# >>> from transformers import ErnieConfig, ErnieModel
# >>> # Initializing an ERNIE nghuyong/ernie-3.0-base-zh style configuration
# >>> configuration = ErnieConfig()
# ```
# The model type identifier is "ernie"
model_type = "ernie"
# Constructor; the defaults match the ernie-3.0-base-zh checkpoint
def __init__(
self,
vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
task_type_vocab_size=3,
use_task_id=False,
initializer_range=0.02,
layer_norm_eps=1e-12,
pad_token_id=0,
position_embedding_type="absolute",
use_cache=True,
classifier_dropout=None,
**kwargs,
):
# Initialize the base class (handles pad_token_id and generic kwargs)
super().__init__(pad_token_id=pad_token_id, **kwargs)
# Store the model hyper-parameters
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.task_type_vocab_size = task_type_vocab_size
self.use_task_id = use_task_id
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.position_embedding_type = position_embedding_type
self.use_cache = use_cache
self.classifier_dropout = classifier_dropout
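# Annotation example (hedged sketch, not part of the original source): instantiating
# the configuration above and reading back a few of the fields it sets.
# ```
# >>> from transformers import ErnieConfig
# >>> config = ErnieConfig(num_hidden_layers=6, use_task_id=True)
# >>> config.hidden_size, config.task_type_vocab_size
# (768, 3)
# ```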
class ErnieOnnxConfig(OnnxConfig):
# ONNX export configuration for ERNIE, derived from OnnxConfig
@property
def inputs(self) -> Mapping[str, Mapping[int, str]]:
# Maps each input name to its dynamic axes (axis index -> axis name)
if self.task == "multiple-choice":
# Multiple-choice inputs carry an extra "choice" axis
dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
else:
dynamic_axis = {0: "batch", 1: "sequence"}
return OrderedDict(
[
("input_ids", dynamic_axis),  # token ids
("attention_mask", dynamic_axis),  # padding mask
("token_type_ids", dynamic_axis),  # segment ids
("task_type_ids", dynamic_axis),  # ERNIE-specific task ids
]
)
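# Annotation example (hedged sketch, not in the original source): inspecting the
# dynamic axes declared above; the task name is an illustrative choice.
# ```
# >>> from transformers import ErnieConfig
# >>> from transformers.models.ernie.configuration_ernie import ErnieOnnxConfig
# >>> onnx_config = ErnieOnnxConfig(ErnieConfig(), task="multiple-choice")
# >>> onnx_config.inputs["input_ids"]
# {0: 'batch', 1: 'choice', 2: 'sequence'}
# ```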
.\models\ernie\modeling_ernie.py
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch ERNIE model."""
import math
import warnings
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
# Imports from the HuggingFace transformers library
from ...activations import ACT2FN
from ...modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
MaskedLMOutput,
MultipleChoiceModelOutput,
NextSentencePredictorOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
logging,
replace_return_docstrings,
)
from .configuration_ernie import ErnieConfig
# Logger for this module
logger = logging.get_logger(__name__)
# Checkpoint and configuration references used in the generated docstrings
_CHECKPOINT_FOR_DOC = "nghuyong/ernie-1.0-base-zh"
_CONFIG_FOR_DOC = "ErnieConfig"
# List of pretrained model checkpoints
ERNIE_PRETRAINED_MODEL_ARCHIVE_LIST = [
"nghuyong/ernie-1.0-base-zh",
"nghuyong/ernie-2.0-base-en",
"nghuyong/ernie-2.0-large-en",
"nghuyong/ernie-3.0-base-zh",
"nghuyong/ernie-3.0-medium-zh",
"nghuyong/ernie-3.0-mini-zh",
"nghuyong/ernie-3.0-micro-zh",
"nghuyong/ernie-3.0-nano-zh",
"nghuyong/ernie-gram-zh",
"nghuyong/ernie-health-zh",
# See all ERNIE models at https://huggingface.co/models?filter=ernie
]
# Embedding layer combining word, position and token-type embeddings
class ErnieEmbeddings(nn.Module):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self, config):
super().__init__()
# Word embeddings: vocab_size x hidden_size, with the pad token as padding_idx
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
# Absolute position embeddings: max_position_embeddings x hidden_size
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# Token-type (segment) embeddings: type_vocab_size x hidden_size
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
# ERNIE-specific: optional task-type embeddings when use_task_id is set.
# Note: forward() below reads self.use_task_id, so the flag is stored here
# (this assignment was dropped in the excerpt and is restored).
self.use_task_id = config.use_task_id
if config.use_task_id:
self.task_type_embeddings = nn.Embedding(config.task_type_vocab_size, config.hidden_size)
# self.LayerNorm is not snake-cased to stay consistent with TensorFlow variable names,
# so TensorFlow checkpoints can still be loaded
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# Position embedding type, "absolute" unless the config says otherwise
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
# Non-persistent buffer holding position ids [0, max_position_embeddings)
self.register_buffer(
"position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
)
# Non-persistent buffer of all-zero token type ids with the same shape
self.register_buffer(
"token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
)
# Forward pass that builds the summed embeddings
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
token_type_ids: Optional[torch.LongTensor] = None,
task_type_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
past_key_values_length: int = 0,
) -> torch.Tensor:
# Derive the input shape from input_ids, or from inputs_embeds minus its hidden dimension
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
seq_length = input_shape[1]
# Default position ids: a slice of the registered buffer, offset by the cache length
if position_ids is None:
position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
# Default token_type_ids: the registered all-zero buffer, expanded to the batch;
# using the buffer keeps the model traceable (see issue #5664)
if token_type_ids is None:
if hasattr(self, "token_type_ids"):
buffered_token_type_ids = self.token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
else:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
# Look up word embeddings unless precomputed embeddings were passed in
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
# Sum word and token-type embeddings
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
# Add absolute position embeddings when configured
if self.position_embedding_type == "absolute":
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
# ERNIE-specific: add task-type embeddings (task id 0 by default)
if self.use_task_id:
if task_type_ids is None:
task_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
task_type_embeddings = self.task_type_embeddings(task_type_ids)
embeddings += task_type_embeddings
# LayerNorm, then dropout
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
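# Annotation example (toy sketch, not in the original source): the shape flow of
# the summation above, with illustrative sizes.
# ```
# >>> import torch
# >>> words = torch.randn(2, 5, 768)      # inputs_embeds
# >>> segments = torch.randn(2, 5, 768)   # token_type_embeddings
# >>> positions = torch.randn(1, 5, 768)  # position_embeddings, broadcast over batch
# >>> (words + segments + positions).shape
# torch.Size([2, 5, 768])
# ```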
# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Ernie
class ErnieSelfAttention(nn.Module):
def __init__(self, config, position_embedding_type=None):
super().__init__()
# The hidden size must be divisible by the number of attention heads (unless an embedding_size is set)
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
# Linear projections for query, key and value
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
# Dropout on the attention probabilities
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.position_embedding_type = position_embedding_type or getattr(
config, "position_embedding_type", "absolute"
)
# For relative position encodings, create the distance embedding table
if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
self.max_position_embeddings = config.max_position_embeddings
self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
# Whether this module runs as part of a decoder
self.is_decoder = config.is_decoder
# Reshape (batch, seq, hidden) into (batch, heads, seq, head_size) for attention
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(new_x_shape)
return x.permute(0, 2, 1, 3)
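# Annotation example (toy sketch, not in the original source): what
# transpose_for_scores does with hidden_size=768 and 12 heads.
# ```
# >>> import torch
# >>> x = torch.randn(2, 5, 768)                  # (batch, seq, hidden)
# >>> x = x.view(2, 5, 12, 64).permute(0, 2, 1, 3)
# >>> x.shape                                     # (batch, heads, seq, head_size)
# torch.Size([2, 12, 5, 64])
# ```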
# Forward pass definition
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Ernie
class ErnieSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Dense projection back to hidden_size
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# LayerNorm over the residual sum
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Ernie
# Attention block combining self-attention and its output projection
class ErnieAttention(nn.Module):
# Takes the model config and an optional position embedding type
def __init__(self, config, position_embedding_type=None):
super().__init__()
# Self-attention sub-layer
self.self = ErnieSelfAttention(config, position_embedding_type=position_embedding_type)
# Output projection + residual/LayerNorm sub-layer
self.output = ErnieSelfOutput(config)
# Indices of attention heads that have been pruned
self.pruned_heads = set()
# Prune the given attention heads from this layer
def prune_heads(self, heads):
if len(heads) == 0:
return
# Resolve which heads can still be pruned and the matching weight indices
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune the query/key/value projections and the output projection
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update the bookkeeping and remember which heads are gone
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
# Forward pass returning (attention_output, *optional extras)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# Run self-attention
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
# Feed the attention output plus the residual into the output sub-layer
attention_output = self.output(self_outputs[0], hidden_states)
# Prepend the attention output to any extras (attention probs, cached key/values)
outputs = (attention_output,) + self_outputs[1:]  # add attention weights if requested
return outputs
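# Annotation example (hedged sketch, not in the original source): pruning through
# the public PreTrainedModel.prune_heads API, which calls the method above.
# ```
# >>> from transformers import ErnieConfig, ErnieModel
# >>> model = ErnieModel(ErnieConfig(num_hidden_layers=2))
# >>> model.prune_heads({0: [0, 1]})  # drop heads 0 and 1 of layer 0
# >>> model.encoder.layer[0].attention.self.num_attention_heads
# 10
# ```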
# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Ernie
# Feed-forward expansion layer
class ErnieIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
# Expand hidden_size to intermediate_size
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
# Resolve the activation: a name looked up in ACT2FN, or a callable from the config
if isinstance(config.hidden_act, str):
self.intermediate_act_fn = ACT2FN[config.hidden_act]
else:
self.intermediate_act_fn = config.hidden_act
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Linear projection followed by the activation
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Ernie
# Feed-forward projection back to hidden_size, with residual and LayerNorm
class ErnieOutput(nn.Module):
def __init__(self, config):
super().__init__()
# Project intermediate_size back down to hidden_size
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
# Project, apply dropout, then add the residual and normalize
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Ernie
class ErnieLayer(nn.Module):
def __init__(self, config):
super().__init__()
# Chunk size for the chunked feed-forward pass
self.chunk_size_feed_forward = config.chunk_size_feed_forward
# The sequence axis that chunking slices over
self.seq_len_dim = 1
self.attention = ErnieAttention(config)
self.is_decoder = config.is_decoder
self.add_cross_attention = config.add_cross_attention
# Cross-attention is only valid in a decoder
if self.add_cross_attention:
if not self.is_decoder:
raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
# Cross-attention always uses absolute position embeddings
self.crossattention = ErnieAttention(config, position_embedding_type="absolute")
self.intermediate = ErnieIntermediate(config)
self.output = ErnieOutput(config)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
output_attentions: Optional[bool] = False,
) -> Tuple[torch.Tensor]:
# The cached key/values for uni-directional self-attention sit at positions 1 and 2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
)
attention_output = self_attention_outputs[0]
# For a decoder, the last output is the self-attention cache tuple
if self.is_decoder:
outputs = self_attention_outputs[1:-1]
present_key_value = self_attention_outputs[-1]
else:
outputs = self_attention_outputs[1:]  # include self-attention weights if requested
cross_attn_present_key_value = None
if self.is_decoder and encoder_hidden_states is not None:
if not hasattr(self, "crossattention"):
raise ValueError(
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
" by setting `config.add_cross_attention=True`"
)
# The cached cross-attention key/values sit at positions 3 and 4
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
cross_attention_outputs = self.crossattention(
attention_output,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
cross_attn_past_key_value,
output_attentions,
)
attention_output = cross_attention_outputs[0]
outputs = outputs + cross_attention_outputs[1:-1]  # include cross-attention weights if requested
# Append the cross-attention cache to positions 3 and 4 of the present tuple
cross_attn_present_key_value = cross_attention_outputs[-1]
present_key_value = present_key_value + cross_attn_present_key_value
# Run the feed-forward block in chunks along the sequence axis
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
# For a decoder, return the attention key/values as the last output
if self.is_decoder:
outputs = outputs + (present_key_value,)
return outputs
def feed_forward_chunk(self, attention_output):
# One feed-forward step over a chunk of the sequence
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
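# Annotation example (toy sketch, not in the original source): apply_chunking_to_forward
# runs the feed-forward block on slices of the sequence axis and concatenates the
# results, trading time for peak memory; the output is unchanged.
# ```
# >>> import torch
# >>> from transformers.pytorch_utils import apply_chunking_to_forward
# >>> ff = torch.nn.Linear(8, 8)
# >>> hidden = torch.randn(2, 16, 8)
# >>> torch.allclose(ff(hidden), apply_chunking_to_forward(ff, 4, 1, hidden))
# True
# ```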
# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Ernie
class ErnieEncoder(nn.Module):
# Takes a config object
def __init__(self, config):
super().__init__()
self.config = config
# A stack of num_hidden_layers ErnieLayer modules
self.layer = nn.ModuleList([ErnieLayer(config) for _ in range(config.num_hidden_layers)])
# Gradient checkpointing is disabled by default
self.gradient_checkpointing = False
# Forward pass over the full layer stack
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = False,
output_hidden_states: Optional[bool] = False,
return_dict: Optional[bool] = True,
) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
# Accumulators for the optional outputs (None when not requested)
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
# Gradient checkpointing is incompatible with use_cache; warn and disable caching
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
next_decoder_cache = () if use_cache else None
# Iterate over the layer stack
for i, layer_module in enumerate(self.layer):
# Record the hidden states entering this layer
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Per-layer head mask and cached key/values, if provided
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
# Run the layer, through the checkpointing wrapper when training with checkpointing
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
layer_module.__call__,
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
)
# The layer's first output becomes the next hidden states
hidden_states = layer_outputs[0]
# Collect the cache and attention weights as requested
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
# Record the final hidden states
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
# Tuple-style output: keep only the non-None members, in a fixed order
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
# Dict-style output object with the cache and cross-attentions
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
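# Annotation example (hedged sketch, not in the original source): requesting the
# optional tuples assembled above through the top-level model.
# ```
# >>> import torch
# >>> from transformers import ErnieConfig, ErnieModel
# >>> model = ErnieModel(ErnieConfig(num_hidden_layers=2)).eval()
# >>> out = model(torch.randint(0, 100, (1, 8)), output_hidden_states=True, output_attentions=True)
# >>> len(out.hidden_states), len(out.attentions)  # embeddings + 2 layers; 2 layers
# (3, 2)
# ```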
# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Ernie
class ErniePooler(nn.Module):
def __init__(self, config):
super().__init__()
# Dense layer from hidden_size to hidden_size, with a tanh activation
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.activation = nn.Tanh()
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# "Pool" by taking the hidden state of the first token ([CLS])
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output
# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->Ernie
class ErniePredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
# Resolve the activation: a name looked up in ACT2FN, or a callable from the config
if isinstance(config.hidden_act, str):
self.transform_act_fn = ACT2FN[config.hidden_act]
else:
self.transform_act_fn = config.hidden_act
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
# Dense projection, activation, then LayerNorm
hidden_states = self.dense(hidden_states)
hidden_states = self.transform_act_fn(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->Ernie
class ErnieLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
# Transform the hidden states before the vocabulary projection
self.transform = ErniePredictionHeadTransform(config)
# The output weights are tied to the input embeddings, but each token gets its own output bias
self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
# Link decoder.bias to self.bias so the bias is resized correctly with the token embeddings
self.decoder.bias = self.bias
def forward(self, hidden_states):
# Transform, then project to vocabulary logits
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states
# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->Ernie
class ErnieOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
# MLM prediction head
self.predictions = ErnieLMPredictionHead(config)
def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
# Project the sequence output to vocabulary logits
prediction_scores = self.predictions(sequence_output)
return prediction_scores
# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->Ernie
class ErnieOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
# Binary classifier for next-sentence prediction (NSP)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
# Score the sequence-pair relationship from the pooled output
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->Ernie
class ErniePreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
# MLM head plus the NSP classifier
self.predictions = ErnieLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
# Vocabulary logits from the sequence output, NSP logits from the pooled output
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
class ErniePreTrainedModel(PreTrainedModel):
"""
处理权重初始化和预训练模型下载加载的抽象类。
"""
config_class = ErnieConfig
base_model_prefix = "ernie"
supports_gradient_checkpointing = True
def _init_weights(self, module):
"""初始化权重"""
if isinstance(module, nn.Linear):
# 使用正态分布初始化线性层的权重,标准差为 config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
# 如果存在偏置,则将其初始化为零
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
# 使用正态分布初始化嵌入层的权重,标准差为 config.initializer_range
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
# 如果有 padding_idx,则将对应的权重初始化为零
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
# 将 LayerNorm 层的偏置初始化为零,权重初始化为 1.0
module.bias.data.zero_()
module.weight.data.fill_(1.0)
@dataclass
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->Ernie
class ErnieForPreTrainingOutput(ModelOutput):
"""
Output type of [`ErnieForPreTraining`].
Args:
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the next sequence prediction (classification) head (scores of True/False
continuation before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Hidden states of the model at the output of each layer plus the initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Attention weights after the attention softmax, used to compute the weighted average in the
self-attention heads.
"""
loss: Optional[torch.FloatTensor] = None
prediction_logits: torch.FloatTensor = None
seq_relationship_logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`ErnieConfig`]): Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
"""
"""
"""
@add_start_docstrings(
"The bare Ernie Model transformer outputting raw hidden-states without any specific head on top.",
ERNIE_START_DOCSTRING,
)
"""
# The bare ERNIE model; can act as an encoder or, with is_decoder, as a decoder
class ErnieModel(ErniePreTrainedModel):
"""
The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
cross-attention is added between the self-attention layers, following the architecture described in [Attention is
all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` and
`add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
"""
# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Ernie
def __init__(self, config, add_pooling_layer=True):
super().__init__(config)
self.config = config
self.embeddings = ErnieEmbeddings(config)  # input embedding layer
self.encoder = ErnieEncoder(config)  # transformer encoder stack
self.pooler = ErniePooler(config) if add_pooling_layer else None  # optional [CLS] pooler
# Initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.bert.modeling_bert.BertModel.get_input_embeddings
def get_input_embeddings(self):
return self.embeddings.word_embeddings  # the word embedding matrix
# Copied from transformers.models.bert.modeling_bert.BertModel.set_input_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value  # replace the word embedding matrix
# Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
# Prune attention heads of the model
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
# For each layer, delegate to the attention module's prune_heads
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
# Declare the forward method, with docstring and code-sample decorators applied
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=BaseModelOutputWithPoolingAndCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
@add_start_docstrings(
"""
Ernie Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
sentence prediction (classification)` head.
""",
ERNIE_START_DOCSTRING,
)
class ErnieForPreTraining(ErniePreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie, bert->ernie
def __init__(self, config):
super().__init__(config)
self.ernie = ErnieModel(config)
self.cls = ErniePreTrainingHeads(config)
# Initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.get_output_embeddings
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Copied from transformers.models.bert.modeling_bert.BertForPreTraining.set_output_embeddings
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=ErnieForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
next_sentence_label: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
"""
前向传播方法,接受多个输入参数,执行 Ernie 模型的预测任务。
Args:
input_ids (Optional[torch.Tensor], optional): 输入 token IDs. Defaults to None.
attention_mask (Optional[torch.Tensor], optional): 注意力掩码,指示哪些元素是填充项. Defaults to None.
token_type_ids (Optional[torch.Tensor], optional): token 类型 IDs,用于区分句子 A 和句子 B. Defaults to None.
task_type_ids (Optional[torch.Tensor], optional): 任务类型 IDs,用于特定任务的区分. Defaults to None.
position_ids (Optional[torch.Tensor], optional): 位置 IDs,指示每个 token 的位置. Defaults to None.
head_mask (Optional[torch.Tensor], optional): 头部掩码,用于指定哪些注意力头应该被屏蔽. Defaults to None.
inputs_embeds (Optional[torch.Tensor], optional): 直接输入的嵌入表示. Defaults to None.
labels (Optional[torch.Tensor], optional): 模型的标签,用于 MLM 损失计算. Defaults to None.
next_sentence_label (Optional[torch.Tensor], optional): 下一个句子预测的标签. Defaults to None.
output_attentions (Optional[bool], optional): 是否输出注意力权重. Defaults to None.
output_hidden_states (Optional[bool], optional): 是否输出隐藏状态. Defaults to None.
return_dict (Optional[bool], optional): 是否返回字典格式的输出. Defaults to None.
Returns:
ErnieForPreTrainingOutput or torch.Tensor: 根据 return_dict 决定返回 ErnieForPreTrainingOutput 对象或直接的张量输出.
"""
# 实现具体的前向传播逻辑,包括输入处理、模型计算和输出处理
pass
@add_start_docstrings(
"""Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING
)
class ErnieForCausalLM(ErniePreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM, Bert->Ernie, bert->ernie
def __init__(self, config):
super().__init__(config)
if not config.is_decoder:
logger.warning("If you want to use `ErnieForCausalLM` as a standalone, add `is_decoder=True.`")
# ERNIE backbone without the pooler, plus the MLM-only head
self.ernie = ErnieModel(config, add_pooling_layer=False)
self.cls = ErnieOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.get_output_embeddings
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.set_output_embeddings
def set_output_embeddings(self, new_embeddings):
# Swap in a new vocabulary projection
self.cls.predictions.decoder = new_embeddings
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=CausalLMOutputWithCrossAttentions,
config_class=_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.forward
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.Tensor]] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
):
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.prepare_inputs_for_generation
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=True, **model_kwargs
):
input_shape = input_ids.shape
# Default to an all-ones attention mask
if attention_mask is None:
attention_mask = input_ids.new_ones(input_shape)
# With a cache, only the not-yet-processed suffix of input_ids is kept
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input id
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default behavior: keep only the final id
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
return {
"input_ids": input_ids,
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
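# Annotation example (toy sketch, not in the original source): the suffix slicing
# above once a cache of length past_length exists.
# ```
# >>> import torch
# >>> input_ids = torch.tensor([[11, 12, 13, 14]])
# >>> past_length = 3
# >>> input_ids[:, past_length:]  # only the not-yet-cached token is fed back in
# tensor([[14]])
# ```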
# Copied from transformers.models.bert.modeling_bert.BertLMHeadModel._reorder_cache
def _reorder_cache(self, past_key_values, beam_idx):
reordered_past = ()
# Reorder each layer's cached states to follow the new beam order
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
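# Annotation example (toy sketch, not in the original source): index_select keeps
# cached states aligned with the reordered beam hypotheses.
# ```
# >>> import torch
# >>> past_state = torch.tensor([[0., 0.], [1., 1.], [2., 2.]])  # 3 beams
# >>> beam_idx = torch.tensor([2, 2, 0])
# >>> past_state.index_select(0, beam_idx)
# tensor([[2., 2.],
#         [2., 2.],
#         [0., 0.]])
# ```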
@add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING)
class ErnieForMaskedLM(ErniePreTrainedModel):
_tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"]
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->Ernie, bert->ernie
def __init__(self, config):
super().__init__(config)
# Masked LM needs bi-directional self-attention, so warn if the config marks a decoder
if config.is_decoder:
logger.warning(
"If you want to use `ErnieForMaskedLM` make sure `config.is_decoder=False` for "
"bi-directional self-attention."
)
# ERNIE backbone without the pooler, plus the MLM-only head
self.ernie = ErnieModel(config, add_pooling_layer=False)
self.cls = ErnieOnlyMLMHead(config)
# Initialize weights and apply final processing
self.post_init()
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.get_output_embeddings
def get_output_embeddings(self):
return self.cls.predictions.decoder
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.set_output_embeddings
def set_output_embeddings(self, new_embeddings):
self.cls.predictions.decoder = new_embeddings
# The forward method is wrapped with add_start_docstrings_to_model_forward and
# add_code_sample_docstrings (the decorators are omitted in this excerpt)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
"""
# Fall back to the config default for return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the ERNIE backbone
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Project the sequence output to vocabulary logits
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
masked_lm_loss = None
# With labels, compute the masked-LM cross-entropy (label -100 marks ignored positions)
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
# Tuple-style output
if not return_dict:
output = (prediction_scores,) + outputs[2:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
# Dict-style output
return MaskedLMOutput(
loss=masked_lm_loss,
logits=prediction_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
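# Annotation example (hedged sketch, not part of the original source; downloads
# weights for the checkpoint named in _CHECKPOINT_FOR_DOC above):
# ```
# >>> import torch
# >>> from transformers import AutoTokenizer, ErnieForMaskedLM
# >>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
# >>> model = ErnieForMaskedLM.from_pretrained("nghuyong/ernie-1.0-base-zh")
# >>> inputs = tokenizer("巴黎是[MASK]国的首都。", return_tensors="pt")
# >>> with torch.no_grad():
# ...     logits = model(**inputs).logits
# >>> mask_pos = (inputs.input_ids == tokenizer.mask_token_id).nonzero()[0, 1]
# >>> tokenizer.decode(logits[0, mask_pos].argmax())  # the most likely filler token
# ```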
# Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.prepare_inputs_for_generation
def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
input_shape = input_ids.shape
effective_batch_size = input_shape[0]
# Generation appends a dummy token, so a PAD token must be defined
if self.config.pad_token_id is None:
raise ValueError("The PAD token should be defined for generation")
# Extend the attention mask with a zero column for the dummy token
attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
# Append one PAD token per sequence
dummy_token = torch.full(
(effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
)
input_ids = torch.cat([input_ids, dummy_token], dim=1)
# Return the extended input_ids and attention_mask
return {"input_ids": input_ids, "attention_mask": attention_mask}
# ERNIE model with a next-sentence prediction (classification) head on top
@add_start_docstrings(
"""Ernie Model with a `next sentence prediction (classification)` head on top.""",
ERNIE_START_DOCSTRING,
)
class ErnieForNextSentencePrediction(ErniePreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForNextSentencePrediction.__init__ with Bert->Ernie, bert->ernie
def __init__(self, config):
super().__init__(config)
# ERNIE backbone plus the NSP-only head
self.ernie = ErnieModel(config)
self.cls = ErnieOnlyNSPHead(config)
# Initialize weights and apply final processing
self.post_init()
# Forward pass, with docstring decorators applied
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
**kwargs,
) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
(see `input_ids` docstring). Indices should be in `[0, 1]`:
- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
Returns:
A [`NextSentencePredictorOutput`] when `return_dict=True`, otherwise a tuple whose first elements are the loss (when `labels` is given) and the NSP logits.
Example:
```
>>> from transformers import AutoTokenizer, ErnieForNextSentencePrediction
>>> import torch
>>> tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
>>> model = ErnieForNextSentencePrediction.from_pretrained("nghuyong/ernie-1.0-base-zh")
>>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
>>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
>>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
>>> outputs = model(**encoding, labels=torch.LongTensor([1]))
>>> logits = outputs.logits
>>> assert logits[0, 0] < logits[0, 1] # next sentence was random
```
"""
if "next_sentence_label" in kwargs:
# 如果传入了过时的参数 `next_sentence_label`,发出警告并使用 `labels` 替代
warnings.warn(
"The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
" `labels` instead.",
FutureWarning,
)
labels = kwargs.pop("next_sentence_label")
# 确定是否返回字典格式的输出
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# 调用 ERNIE 模型进行前向传播
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# 从 ERNIE 模型输出中提取池化后的输出
pooled_output = outputs[1]
# 使用分类器对池化输出进行预测下一个句子关系的分数
seq_relationship_scores = self.cls(pooled_output)
next_sentence_loss = None
if labels is not None:
# 如果提供了标签,计算下一个句子预测的损失
loss_fct = CrossEntropyLoss()
next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
if not return_dict:
# 如果不返回字典,则按照旧版格式构造输出
output = (seq_relationship_scores,) + outputs[2:]
return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
# 返回包含损失、分数、隐藏状态和注意力权重的 NextSentencePredictorOutput 对象
return NextSentencePredictorOutput(
loss=next_sentence_loss,
logits=seq_relationship_scores,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
@add_start_docstrings(
"""
Ernie Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
output) e.g. for GLUE tasks.
""",
ERNIE_START_DOCSTRING,
)
class ErnieForSequenceClassification(ErniePreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification.__init__ with Bert->Ernie, bert->ernie
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.ernie = ErnieModel(config)
# Classifier dropout falls back to the hidden dropout when unset
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# Linear classification head on top of the pooled output
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
# Fall back to the config default for return_dict
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the ERNIE backbone
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Classify from the pooled representation
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
loss = None
if labels is not None:
# Infer problem_type from num_labels and the label dtype when not set explicitly
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
# Pick the loss to match the problem type
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
# Tuple-style output
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Dict-style output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
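# Annotation example (toy sketch, not in the original source): the three loss
# branches above, with illustrative shapes for num_labels=3.
# ```
# >>> import torch
# >>> from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
# >>> logits = torch.randn(4, 3)
# >>> loss = CrossEntropyLoss()(logits, torch.tensor([0, 2, 1, 1]))            # single-label: class ids
# >>> loss = BCEWithLogitsLoss()(logits, torch.randint(0, 2, (4, 3)).float())  # multi-label: multi-hot targets
# >>> loss = MSELoss()(torch.randn(4, 1).squeeze(), torch.randn(4))            # regression: num_labels == 1
# ```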
"""
Ernie Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
softmax) e.g. for RocStories/SWAG tasks.
"""
# 继承自 ErniePreTrainedModel 的 ErnieForMultipleChoice 类,用于多项选择任务的 Ernie 模型
class ErnieForMultipleChoice(ErniePreTrainedModel):
# 从 transformers.models.bert.modeling_bert.BertForMultipleChoice.__init__ 复制而来,将其中的 Bert 替换为 Ernie
def __init__(self, config):
super().__init__(config)
# 初始化 ErnieModel
self.ernie = ErnieModel(config)
# 分类器的 dropout 率,默认使用 config 中的 hidden_dropout_prob
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
# 线性分类器,将隐藏状态大小(hidden_size)映射到1,用于多项选择任务
self.classifier = nn.Linear(config.hidden_size, 1)
# 初始化权重并应用最终处理
self.post_init()
# 添加输入文档字符串和示例代码文档字符串到模型前向传播方法
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=MultipleChoiceModelOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
`input_ids` above)
"""
# Resolve return_dict and derive num_choices from input_ids or inputs_embeds
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
# Flatten (batch, num_choices, ...) inputs to (batch * num_choices, ...)
input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
inputs_embeds = (
inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
if inputs_embeds is not None
else None
)
# Run the ERNIE backbone on the flattened batch
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Score each choice from its pooled representation
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
# Fold the per-choice scores back to (batch, num_choices)
reshaped_logits = logits.view(-1, num_choices)
loss = None
# With labels, compute the cross-entropy over choices
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
# Tuple-style output
if not return_dict:
output = (reshaped_logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Dict-style output with loss, logits, hidden states and attention weights
return MultipleChoiceModelOutput(
loss=loss,
logits=reshaped_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
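# Annotation example (toy sketch, not in the original source): the flatten /
# unflatten around the encoder call above.
# ```
# >>> import torch
# >>> batch, num_choices, seq = 2, 4, 7
# >>> input_ids = torch.zeros(batch, num_choices, seq, dtype=torch.long)
# >>> input_ids.view(-1, input_ids.size(-1)).shape                       # fed to the encoder
# torch.Size([8, 7])
# >>> torch.randn(batch * num_choices, 1).view(-1, num_choices).shape    # reshaped logits
# torch.Size([2, 4])
# ```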
"""
Ernie Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
Named-Entity-Recognition (NER) tasks.
"""
# 导入所需的库
@add_start_docstrings(
"""
添加一个头部的令牌分类器(在隐藏状态输出的顶部添加一个线性层),例如用于命名实体识别(NER)任务。
""",
ERNIE_START_DOCSTRING,
)
# 定义 ErnieForTokenClassification 类,继承自 ErniePreTrainedModel
class ErnieForTokenClassification(ErniePreTrainedModel):
# 从 transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ 复制而来,将 Bert 替换为 Ernie
def __init__(self, config):
# 调用父类的构造函数
super().__init__(config)
# 设置标签数目
self.num_labels = config.num_labels
# 创建 Ernie 模型,不添加池化层
self.ernie = ErnieModel(config, add_pooling_layer=False)
# 根据配置设置分类器的 dropout,如果未指定则使用隐藏层 dropout
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
# 创建 Dropout 层
self.dropout = nn.Dropout(classifier_dropout)
# 创建线性层作为分类器
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
# 初始化权重并进行后续处理
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
# Forward pass
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
"""
# Fall back to the config's use_return_dict if return_dict is not provided
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Run the forward pass through the ERNIE model
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the model outputs
sequence_output = outputs[0]
# Apply dropout to the sequence output
sequence_output = self.dropout(sequence_output)
# Run the classifier over the sequence output to get per-token logits
logits = self.classifier(sequence_output)
# Initialize the loss to None
loss = None
# Compute the cross-entropy loss if labels are provided
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
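# Shape note: view(-1, self.num_labels) flattens the logits to
# (batch_size * seq_len, num_labels) and labels.view(-1) to (batch_size * seq_len,),
# so each token position is scored independently.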
# If not returning a dict, build the output tuple
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
# Otherwise return a structured TokenClassifierOutput
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
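# A minimal inference sketch for the token-classification head above
# (assumption: the checkpoint "nghuyong/ernie-3.0-base-zh"; num_labels is arbitrary here):
from transformers import AutoTokenizer, ErnieForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")  # assumed checkpoint
model = ErnieForTokenClassification.from_pretrained("nghuyong/ernie-3.0-base-zh", num_labels=5)
inputs = tokenizer("HuggingFace is based in New York City", return_tensors="pt")
logits = model(**inputs).logits  # (1, seq_len, num_labels)
predicted_ids = logits.argmax(dim=-1)  # one predicted label id per token position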
@add_start_docstrings(
"""
Ernie Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
ERNIE_START_DOCSTRING,
)
class ErnieForQuestionAnswering(ErniePreTrainedModel):
# Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie
def __init__(self, config):
# Call the parent class constructor with the config
super().__init__(config)
# Set the number of classification labels
self.num_labels = config.num_labels
# Create the Ernie model without a pooling layer
self.ernie = ErnieModel(config, add_pooling_layer=False)
# Create a linear layer that produces start- and end-position logits
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(ERNIE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
def forward(
self,
input_ids: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
task_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.Tensor] = None,
start_positions: Optional[torch.Tensor] = None,
end_positions: Optional[torch.Tensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
are not taken into account for computing the loss.
"""
# Determine if return_dict should be set to self.config.use_return_dict if not provided
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# Perform forward pass through the ERNIE model
outputs = self.ernie(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
task_type_ids=task_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Extract the sequence output from the model outputs
sequence_output = outputs[0]
# Compute logits for the question answering task
logits = self.qa_outputs(sequence_output)
# Split logits into start and end logits
start_logits, end_logits = logits.split(1, dim=-1)
# Squeeze unnecessary dimensions and ensure contiguous memory layout
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If start_positions or end_positions have extra dimensions, squeeze them
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# Clamp positions to avoid out-of-bound errors
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
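# For example, with seq_len = 384 the ignored_index is 384, so a labelled start
# position of 500 (outside the sequence) is clamped to 384 and then skipped by
# CrossEntropyLoss(ignore_index=384) below, contributing nothing to the loss.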
# Define CrossEntropyLoss with ignored index
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
# Compute start and end loss
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
# Calculate total loss as the average of start and end loss
total_loss = (start_loss + end_loss) / 2
# If return_dict is False, return outputs in a tuple
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
# If return_dict is True, return structured QuestionAnsweringModelOutput
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
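# A minimal extractive-QA sketch over the start/end logits computed above
# (assumption: the checkpoint "nghuyong/ernie-3.0-base-zh"; a QA-fine-tuned
# checkpoint would give meaningful spans):
from transformers import AutoTokenizer, ErnieForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-3.0-base-zh")  # assumed checkpoint
model = ErnieForQuestionAnswering.from_pretrained("nghuyong/ernie-3.0-base-zh")
question, context = "Where is Baidu headquartered?", "Baidu is headquartered in Beijing."
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)
start = int(outputs.start_logits.argmax())  # most likely span start index
end = int(outputs.end_logits.argmax())  # most likely span end index
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))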