# Import the required modules and libraries
import json  # handle JSON-formatted data
from typing import List, Optional, Tuple  # type hints

from tokenizers import normalizers  # normalizers from the tokenizers library
from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer  # pre-tokenizer classes

from ...tokenization_utils_fast import PreTrainedTokenizerFast  # fast tokenizer base class
from ...utils import logging  # logging utilities
from .tokenization_roformer import RoFormerTokenizer  # slow RoFormer tokenizer
from .tokenization_utils import JiebaPreTokenizer  # jieba-based pre-tokenizer

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Names of the vocabulary and tokenizer files used by RoFormer
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}

# Map from pretrained model identifiers to the download URLs of their vocabulary files
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "junnyu/roformer_chinese_small": "https://huggingface.co/junnyu/roformer_chinese_small/resolve/main/vocab.txt",
        "junnyu/roformer_chinese_base": "https://huggingface.co/junnyu/roformer_chinese_base/resolve/main/vocab.txt",
        "junnyu/roformer_chinese_char_small": (
            "https://huggingface.co/junnyu/roformer_chinese_char_small/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_chinese_char_base": (
            "https://huggingface.co/junnyu/roformer_chinese_char_base/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_discriminator": (
            "https://huggingface.co/junnyu/roformer_small_discriminator/resolve/main/vocab.txt"
        ),
        "junnyu/roformer_small_generator": (
            "https://huggingface.co/junnyu/roformer_small_generator/resolve/main/vocab.txt"
        ),
    }
}

# Maximum positional-embedding sizes of the pretrained models
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "junnyu/roformer_chinese_small": 1536,
    "junnyu/roformer_chinese_base": 1536,
    "junnyu/roformer_chinese_char_small": 512,
    "junnyu/roformer_chinese_char_base": 512,
    "junnyu/roformer_small_discriminator": 128,
    "junnyu/roformer_small_generator": 128,
}

# Initialization configuration of the pretrained models (whether to lowercase the input)
PRETRAINED_INIT_CONFIGURATION = {
    "junnyu/roformer_chinese_small": {"do_lower_case": True},
    "junnyu/roformer_chinese_base": {"do_lower_case": True},
    "junnyu/roformer_chinese_char_small": {"do_lower_case": True},
    "junnyu/roformer_chinese_char_base": {"do_lower_case": True},
    "junnyu/roformer_small_discriminator": {"do_lower_case": True},
    "junnyu/roformer_small_generator": {"do_lower_case": True},
}


class RoFormerTokenizerFast(PreTrainedTokenizerFast):
r"""
Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).
# `RoFormerTokenizerFast`几乎与`BertTokenizerFast`相同,实现端到端的分词:
# 标点符号分割和WordPiece。它们在处理中文时有些差异。
# 此分词器继承自`PreTrainedTokenizerFast`,其中包含大部分主要方法。用户应该
# 参考这个超类以获取有关这些方法的更多信息。
# 示例:
#
# ```
# >>> from transformers import RoFormerTokenizerFast
#
# >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
# >>> tokenizer.tokenize("今天天气非常好。")
# ['今', '天', '天', '气', '非常', '好', '。']
# ```
vocab_files_names = VOCAB_FILES_NAMES # 获取词汇文件的名称列表
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP # 获取预训练词汇文件的映射
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES # 获取预训练位置嵌入的最大模型输入尺寸
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION # 获取预训练初始化配置
slow_tokenizer_class = RoFormerTokenizer # 慢速分词器类为RoFormerTokenizer
def __init__(
self,
vocab_file=None,
tokenizer_file=None,
do_lower_case=True,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
tokenize_chinese_chars=True,
strip_accents=None,
**kwargs,
):
# Call the parent constructor to set up the basic tokenizer parameters
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,
do_lower_case=do_lower_case,
unk_token=unk_token,
sep_token=sep_token,
pad_token=pad_token,
cls_token=cls_token,
mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents,
**kwargs,
)
# Load the normalizer state of the backend tokenizer from JSON
normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
# If the normalizer's lowercase / strip_accents settings differ from the current ones, update them
if (
normalizer_state.get("lowercase", do_lower_case) != do_lower_case
or normalizer_state.get("strip_accents", strip_accents) != strip_accents
):
normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
normalizer_state["lowercase"] = do_lower_case
normalizer_state["strip_accents"] = strip_accents
# Replace the backend tokenizer's normalizer with the updated one
self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
# Make sure the custom PreTokenizer is set correctly
vocab = self.backend_tokenizer.get_vocab()
self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
self.do_lower_case = do_lower_case
def __getstate__(self):
state = self.__dict__.copy()
# Replace the pre_tokenizer with BertPreTokenizer() so the state can be pickled
state["_tokenizer"].pre_tokenizer = BertPreTokenizer()
return state
def __setstate__(self, d):
self.__dict__ = d
# Get the vocabulary of the current tokenizer
vocab = self.__dict__["_tokenizer"].get_vocab()
# Restore the custom JiebaPreTokenizer as the pre_tokenizer
self.__dict__["_tokenizer"].pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
"""
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A RoFormer sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`,*optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens."""
# Initialize output with CLS token ID, token_ids_0, and SEP token ID
output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
# If token_ids_1 is provided, concatenate token_ids_1 and SEP token ID
if token_ids_1 is not None:
output += token_ids_1 + [self.sep_token_id]
return output
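To make the special-token layout concrete, here is a minimal, hedged usage sketch (it assumes network access to the `junnyu/roformer_chinese_base` checkpoint; the exact ids depend on the vocabulary):

```python
from transformers import RoFormerTokenizerFast

tok = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
ids_a = tok.convert_tokens_to_ids(tok.tokenize("今天天气非常好。"))
ids_b = tok.convert_tokens_to_ids(tok.tokenize("出去走走吧。"))

single = tok.build_inputs_with_special_tokens(ids_a)       # [CLS] A [SEP]
pair = tok.build_inputs_with_special_tokens(ids_a, ids_b)  # [CLS] A [SEP] B [SEP]
assert pair[0] == tok.cls_token_id and pair[-1] == tok.sep_token_id
```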
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`,*optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s)."""
# Define SEP and CLS tokens as lists
sep = [self.sep_token_id]
cls = [self.cls_token_id]
# If token_ids_1 is None, return a list of zeros corresponding to token_ids_0 + CLS + SEP
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
# Return a concatenated list of zeros for token_ids_0 + CLS + SEP and ones for token_ids_1 + SEP
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
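Continuing the sketch from above, the token type ids are 0 over `[CLS] A [SEP]` and 1 over `B [SEP]`:

```python
type_ids = tok.create_token_type_ids_from_sequences(ids_a, ids_b)
assert len(type_ids) == len(pair)
assert set(type_ids[: len(ids_a) + 2]) == {0} and set(type_ids[len(ids_a) + 2 :]) == {1}
```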
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the tokenizer's vocabulary to a directory.
Args:
save_directory (str):
Directory to save the vocabulary files.
filename_prefix (str,*optional*):
Prefix for the vocabulary files.
Returns:
`Tuple[str]`: Tuple of file paths where the vocabulary was saved."""
# Save the model vocabulary using the tokenizer's save method
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
def save_pretrained(
self,
save_directory,
legacy_format=None,
filename_prefix=None,
push_to_hub=False,
**kwargs,
):
"""
Save the pretrained model and its tokenizer.
Args:
save_directory (str):
Directory to save the pretrained model.
legacy_format (str,*optional*):
Legacy format compatibility.
filename_prefix (str,*optional*):
Prefix for the saved files.
push_to_hub (bool):
Whether to push the saved model to the Hugging Face model hub.
**kwargs:
Additional arguments passed to the superclass method.
Returns:
`Any`: Output of the superclass's `save_pretrained` method."""
# Set the pre_tokenizer to BertPreTokenizer before saving
self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
# Call the superclass's save_pretrained method with the specified arguments
return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
.\models\roformer\tokenization_utils.py
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization utils for RoFormer."""

from typing import List

from tokenizers import NormalizedString, PreTokenizedString, normalizers


class JiebaPreTokenizer:
    def __init__(self, vocab) -> None:
        self.vocab = vocab
        # BERT-style normalizer used to clean up text and handle Chinese characters;
        # accents are kept and the text is not lowercased
        self.normalizers = normalizers.BertNormalizer(
            clean_text=False,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
        )
        try:
            import rjieba
        except ImportError:
            # If rjieba cannot be imported, raise an ImportError with installation instructions
            raise ImportError(
                "You need to install rjieba to use RoFormerTokenizer. "
                "See https://pypi.org/project/rjieba/ for installation."
            )
        # On success, keep a reference to rjieba
        self.jieba = rjieba

    def jieba_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        splits = []
        # Tokenize the normalized string with rjieba; hmm=False trades accuracy for speed
        for token, start, end in self.jieba.tokenize(str(normalized_string), hmm=False):
            # If the token is in the vocabulary, keep the corresponding NormalizedString slice
            if token in self.vocab:
                splits.append(normalized_string[start:end])
            else:
                # Otherwise normalize the token and split it into whitespace-separated pieces
                token_list = self.normalizers.normalize_str(token).split()
                for token in token_list:
                    if token:
                        end = start + len(token)
                        splits.append(normalized_string[start:end])
                        start = end
        # Return the list of NormalizedString pieces
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        # Split the PreTokenizedString with the jieba_split method above
        pretok.split(self.jieba_split)
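As a quick illustration of how this pre-tokenizer plugs into a `tokenizers` pipeline, here is a hedged sketch using a toy vocabulary (a real RoFormer `vocab.txt` would normally be loaded, and `rjieba` must be installed):

```python
from tokenizers import Tokenizer, models
from tokenizers.pre_tokenizers import PreTokenizer

from transformers.models.roformer.tokenization_utils import JiebaPreTokenizer

# Toy WordPiece vocabulary, for illustration only
vocab = {"[UNK]": 0, "今天": 1, "天气": 2, "非常": 3, "好": 4, "。": 5}
backend = Tokenizer(models.WordPiece(vocab, unk_token="[UNK]"))
backend.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))

# Words found by jieba that are in the vocabulary stay whole, e.g. ['今天', '天气', '非常', '好', '。']
print(backend.encode("今天天气非常好。").tokens)
```

Note that custom pre-tokenizers cannot be serialized, which is why `__getstate__`/`save_pretrained` above temporarily swap in a `BertPreTokenizer`.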
.\models\rwkv\configuration_rwkv.py

# coding=utf-8
# Copyright 2023 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" RWKV configuration"""

# Import the PretrainedConfig base class and the logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Map from RWKV pretrained model identifiers to their configuration files
RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "RWKV/rwkv-4-169m-pile": "https://huggingface.co/RWKV/rwkv-4-169m-pile/resolve/main/config.json",
    "RWKV/rwkv-4-430m-pile": "https://huggingface.co/RWKV/rwkv-4-430m-pile/resolve/main/config.json",
    "RWKV/rwkv-4-1b5-pile": "https://huggingface.co/RWKV/rwkv-4-1b5-pile/resolve/main/config.json",
    "RWKV/rwkv-4-3b-pile": "https://huggingface.co/RWKV/rwkv-4-3b-pile/resolve/main/config.json",
    "RWKV/rwkv-4-7b-pile": "https://huggingface.co/RWKV/rwkv-4-7b-pile/resolve/main/config.json",
    "RWKV/rwkv-4-14b-pile": "https://huggingface.co/RWKV/rwkv-4-14b-pile/resolve/main/config.json",
    "RWKV/rwkv-raven-1b5": "https://huggingface.co/RWKV/rwkv-raven-1b5/resolve/main/config.json",
    "RWKV/rwkv-raven-3b": "https://huggingface.co/RWKV/rwkv-raven-3b/resolve/main/config.json",
    "RWKV/rwkv-raven-7b": "https://huggingface.co/RWKV/rwkv-raven-7b/resolve/main/config.json",
    "RWKV/rwkv-raven-14b": "https://huggingface.co/RWKV/rwkv-raven-14b/resolve/main/config.json",
}


# Configuration class that stores the configuration of a RWKV model
class RwkvConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a [`RwkvModel`]. It is used to instantiate a RWKV
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the RWKV-4
[RWKV/rwkv-4-169m-pile](https://huggingface.co/RWKV/rwkv-4-169m-pile) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
"""# 定义 RWKV 模型类型
model_type ="rwkv"# 映射模型属性,将 "max_position_embeddings" 映射到类中的 "context_length"
attribute_map ={"max_position_embeddings":"context_length"}# RWKV 模型的配置类,包含了模型的各种参数设置def__init__(
self,
vocab_size=50277,  # vocabulary size, defaults to 50277
context_length=1024,  # maximum sequence length the model can handle, defaults to 1024
hidden_size=4096,  # dimensionality of the embeddings and hidden states
num_hidden_layers=32,  # number of hidden layers in the model, defaults to 32
attention_hidden_size=None,  # dimensionality of the attention hidden states, defaults to hidden_size
intermediate_size=None,  # dimensionality of the inner feed-forward layers, defaults to 4 * hidden_size
layer_norm_epsilon=1e-5,  # epsilon used by the layer-normalization layers, defaults to 1e-5
bos_token_id=0,  # id of the beginning-of-sentence token in the vocabulary, defaults to 0
eos_token_id=0,  # id of the end-of-sentence token in the vocabulary, defaults to 0
rescale_every=6,  # at inference, divide hidden states and the matching output weights by 2 every `rescale_every` layers
tie_word_embeddings=False,  # whether to tie the word embeddings with the input token embeddings, defaults to False
use_cache=True,  # whether the model should return the last state, defaults to True
**kwargs,  # any additional keyword arguments
):
# Store the model parameters: vocabulary size, context length, hidden size, number of layers,
# and attention hidden size (defaults to the hidden size when not specified)
self.vocab_size = vocab_size
self.context_length = context_length
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.attention_hidden_size = attention_hidden_size if attention_hidden_size is not None else hidden_size
# If the intermediate size is not specified, default to four times the hidden size
self.intermediate_size = intermediate_size if intermediate_size is not None else 4 * hidden_size
self.layer_norm_epsilon = layer_norm_epsilon
self.rescale_every = rescale_every
self.use_cache = use_cache
# Set the ids of the model's special tokens (beginning- and end-of-sentence)
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
# Call the parent constructor, forwarding arguments such as whether to tie the word embeddings
# and the ids of the beginning- and end-of-sentence tokens
super().__init__(
tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs
)
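For reference, a minimal sketch of instantiating this configuration (the small dimensions below are arbitrary, chosen only so the example runs quickly):

```python
from transformers import RwkvConfig, RwkvModel

config = RwkvConfig(vocab_size=1000, context_length=256, hidden_size=64, num_hidden_layers=2)
model = RwkvModel(config)
# context_length is exposed as max_position_embeddings through the attribute_map above
print(config.max_position_embeddings)  # 256
```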
.\models\rwkv\convert_rwkv_checkpoint_to_hf.py
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert a RWKV checkpoint from BlinkDL to the Hugging Face format."""

import argparse  # command-line argument parsing
import gc  # garbage collection
import json  # JSON handling
import os  # operating-system interaction
import re  # regular expressions

import torch  # PyTorch

from huggingface_hub import hf_hub_download  # download files from the HF Hub
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerFast, RwkvConfig
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, shard_checkpoint

# Map from model size to the number of hidden layers
NUM_HIDDEN_LAYERS_MAPPING = {
    "169M": 12,
    "430M": 24,
    "1B5": 24,
    "3B": 32,
    "7B": 32,
    "14B": 40,
}

# Map from model size to the hidden size
HIDEN_SIZE_MAPPING = {
    "169M": 768,
    "430M": 1024,
    "1B5": 2048,
    "3B": 2560,
    "7B": 4096,
    "14B": 5120,
}


def convert_state_dict(state_dict):
    state_dict_keys = list(state_dict.keys())
    for name in state_dict_keys:
        weight = state_dict.pop(name)
        # Rename the parameters to match the Hugging Face model format
        # emb -> embedding
        if name.startswith("emb."):
            name = name.replace("emb.", "embeddings.")
        # ln_0 -> pre_ln (only present at block 0)
        if name.startswith("blocks.0.ln0"):
            name = name.replace("blocks.0.ln0", "blocks.0.pre_ln")
        # att -> attention
        name = re.sub(r"blocks\.(\d+)\.att", r"blocks.\1.attention", name)
        # ffn -> feed_forward
        name = re.sub(r"blocks\.(\d+)\.ffn", r"blocks.\1.feed_forward", name)
        # time_mix_k -> time_mix_key
        if name.endswith(".time_mix_k"):
            name = name.replace(".time_mix_k", ".time_mix_key")
        # time_mix_v -> time_mix_value
        if name.endswith(".time_mix_v"):
            name = name.replace(".time_mix_v", ".time_mix_value")
        # time_mix_r -> time_mix_receptance
        if name.endswith(".time_mix_r"):
            name = name.replace(".time_mix_r", ".time_mix_receptance")
        # Prefix everything except the head to mark it as part of the RWKV backbone
        if name != "head.weight":
            name = "rwkv." + name
        state_dict[name] = weight

    return state_dict
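A toy demonstration of the renaming rules above (the weights are dummies; only the keys matter):

```python
import torch

toy = {
    "emb.weight": torch.zeros(1),
    "blocks.0.ln0.weight": torch.zeros(1),
    "blocks.3.att.time_mix_k": torch.zeros(1),
    "blocks.3.ffn.key.weight": torch.zeros(1),
    "head.weight": torch.zeros(1),
}
print(sorted(convert_state_dict(toy).keys()))
# ['head.weight', 'rwkv.blocks.0.pre_ln.weight', 'rwkv.blocks.3.attention.time_mix_key',
#  'rwkv.blocks.3.feed_forward.key.weight', 'rwkv.embeddings.weight']
```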
def convert_rmkv_checkpoint_to_hf_format(
    repo_id, checkpoint_file, output_dir, size=None, tokenizer_file=None, push_to_hub=False, model_name=None
):
    # 1. If possible, build the tokenizer.
    if tokenizer_file is None:
        # No tokenizer file was given: fall back to the default GPT-NeoX tokenizer
        print("No `--tokenizer_file` provided, we will use the default tokenizer.")
        vocab_size = 50277
        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
    else:
        # Load the tokenizer with PreTrainedTokenizerFast from the provided file
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_file)
        # The vocabulary size is the length of the tokenizer
        vocab_size = len(tokenizer)
    # Save the tokenizer to the output directory
    tokenizer.save_pretrained(output_dir)

    # 2. Build the config
    # List of possible model sizes
    possible_sizes = list(NUM_HIDDEN_LAYERS_MAPPING.keys())
    if size is None:
        # Try to infer the size from the checkpoint file name
        for candidate in possible_sizes:
            if candidate in checkpoint_file:
                size = candidate
                break
        if size is None:
            raise ValueError("Could not infer the size, please provide it with the `--size` argument.")
    if size not in possible_sizes:
        raise ValueError(f"`size` should be one of {possible_sizes}, got {size}.")
    # Create the RwkvConfig that defines the model architecture
    config = RwkvConfig(
        vocab_size=vocab_size,
        num_hidden_layers=NUM_HIDDEN_LAYERS_MAPPING[size],
        hidden_size=HIDEN_SIZE_MAPPING[size],
    )
    # Save the config to the output directory
    config.save_pretrained(output_dir)

    # 3. Download the model file from the HF Hub and convert the state_dict
    model_file = hf_hub_download(repo_id, checkpoint_file)
    state_dict = torch.load(model_file, map_location="cpu")
    state_dict = convert_state_dict(state_dict)

    # 4. Split into shards and save them
    shards, index = shard_checkpoint(state_dict)
    for shard_file, shard in shards.items():
        torch.save(shard, os.path.join(output_dir, shard_file))
    if index is not None:
        # If there is an index, save it to the output directory as well
        save_index_file = os.path.join(output_dir, WEIGHTS_INDEX_NAME)
        with open(save_index_file, "w", encoding="utf-8") as f:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            f.write(content)

    # 5. Clean up the shards (sometimes the files PyTorch saves take as much space as the full state_dict)
    print(
        "Cleaning up shards. This may error with an OOM error, if this is the case don't worry you still have converted the model."
    )
    shard_files = list(shards.keys())
    # Free memory before reloading the shards
    del state_dict
    del shards
    gc.collect()

    # Reload each shard and save it again, making sure the tensors live on CPU
    for shard_file in shard_files:
        state_dict = torch.load(os.path.join(output_dir, shard_file))
        torch.save({k: v.cpu().clone() for k, v in state_dict.items()}, os.path.join(output_dir, shard_file))

    del state_dict
    gc.collect()

    # Optionally push the converted model and tokenizer to the HF Hub
    if push_to_hub:
        if model_name is None:
            raise ValueError("Please provide a `model_name` to push the model to the Hub.")
        model = AutoModelForCausalLM.from_pretrained(output_dir)
        model.push_to_hub(model_name, max_shard_size="2GB")
        tokenizer.push_to_hub(model_name)


# Entry point when the script is run directly
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required arguments
    parser.add_argument(
        "--repo_id", default=None, type=str, required=True, help="Repo ID from which to pull the checkpoint."
    )
    parser.add_argument(
        "--checkpoint_file", default=None, type=str, required=True, help="Name of the checkpoint file in the repo."
    )
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True, help="Where to save the converted model."
    )
    parser.add_argument(
        "--tokenizer_file",
        default=None,
        type=str,
        help="Path to the tokenizer file to use (if not provided, only the model is converted).",
    )
    parser.add_argument(
        "--size",
        default=None,
        type=str,
        help="Size of the model. Will be inferred from the `checkpoint_file` if not passed.",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Push to the Hub the converted model.",
    )
    parser.add_argument(
        "--model_name",
        default=None,
        type=str,
        help="Name of the pushed model on the Hub, including the username / organization.",
    )

    # Parse the command-line arguments
    args = parser.parse_args()

    # Run the conversion with the parsed arguments
    convert_rmkv_checkpoint_to_hf_format(
        args.repo_id,
        args.checkpoint_file,
        args.output_dir,
        size=args.size,
        tokenizer_file=args.tokenizer_file,
        push_to_hub=args.push_to_hub,
        model_name=args.model_name,
    )
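An untested sketch of calling the converter directly instead of through the CLI; the repo id and checkpoint file name below are placeholders and must be replaced with a real BlinkDL checkpoint:

```python
from convert_rwkv_checkpoint_to_hf import convert_rmkv_checkpoint_to_hf_format

convert_rmkv_checkpoint_to_hf_format(
    repo_id="BlinkDL/rwkv-4-pile-169m",      # placeholder repo id
    checkpoint_file="RWKV-4-Pile-169M.pth",  # placeholder checkpoint name
    output_dir="./rwkv-4-169m-hf",
    size="169M",
)
```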
.\models\rwkv\modeling_rwkv.py
# coding=utf-8
# Copyright 2023 Bo Peng and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch RWKV model."""

import math
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss
# Import the pretrained-model base class
from ...modeling_utils import PreTrainedModel
# Import docstring helpers and other utilities
from ...utils import (
ModelOutput,
add_code_sample_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_bitsandbytes_available,
is_ninja_available,
is_torch_cuda_available,
logging,
)
# Import the RWKV configuration class
from .configuration_rwkv import RwkvConfig

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Checkpoint and config references used in the documentation
_CHECKPOINT_FOR_DOC = "RWKV/rwkv-4-169m-pile"
_CONFIG_FOR_DOC = "RwkvConfig"

# List of pretrained RWKV model archives
RWKV_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "RWKV/rwkv-4-169m-pile",
    "RWKV/rwkv-4-430m-pile",
    "RWKV/rwkv-4-1b5-pile",
    "RWKV/rwkv-4-3b-pile",
    "RWKV/rwkv-4-7b-pile",
    "RWKV/rwkv-4-14b-pile",
    "RWKV/rwkv-raven-1b5",
    "RWKV/rwkv-raven-3b",
    "RWKV/rwkv-raven-7b",
    "RWKV/rwkv-raven-14b",
    # See all RWKV models at https://huggingface.co/models?filter=rwkv
]

# The RWKV CUDA kernel is loaded lazily; None until load_wkv_cuda_kernel is called
rwkv_cuda_kernel = None


def load_wkv_cuda_kernel(context_length):
# Load the CUDA kernel through torch.utils.cpp_extension
from torch.utils.cpp_extension import load as load_kernel
global rwkv_cuda_kernel
# 获取 CUDA 核心文件夹路径
kernel_folder = Path(__file__).resolve().parent.parent.parent /"kernels"/"rwkv"
cuda_kernel_files = [kernel_folder / f for f in ["wkv_op.cpp", "wkv_cuda.cu", "wkv_cuda_bf16.cu"]]
# If a kernel is already loaded for the same context length, there is nothing to do
if rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == context_length:
return
# Log that the RWKV CUDA kernel is being loaded
logger.info(f"Loading CUDA kernel for RWKV at context length of {context_length}.")
# CUDA compilation flags
flags = [
"-res-usage",
"--maxrregcount 60",
"--use_fast_math",
"-O3",
"-Xptxas -O3",
"--extra-device-vectorization",
f"-DTmax={context_length}",
]
# Load the CUDA kernel
rwkv_cuda_kernel = load_kernel(
name=f"wkv_{context_length}",
sources=cuda_kernel_files,
verbose=(logging.get_verbosity()== logging.DEBUG),
extra_cuda_cflags=flags,)
rwkv_cuda_kernel.max_seq_length = context_length
class RwkvLinearAttention(torch.autograd.Function):
@staticmethod
def forward(ctx, time_decay, time_first, key, value, state=None, return_state=False):
# Read the batch size, sequence length and hidden size from the key tensor
batch_size, seq_len, hidden_size = key.size()
# The CUDA kernel cannot handle sequences longer than the compiled maximum
if seq_len > rwkv_cuda_kernel.max_seq_length:
raise ValueError(
f"Cannot process a batch with {seq_len} tokens at the same time, use a maximum of "
f"{rwkv_cuda_kernel.max_seq_length} with this model."
)
# batch_size * hidden_size must be a round multiple of min(hidden_size, 32)
if batch_size * hidden_size % min(hidden_size, 32) != 0:
raise ValueError(
f"The product of batch size ({batch_size}) and hidden size ({hidden_size}) needs to be a round "
f"multiple of {min(hidden_size, 32)}."
)
# Remember the input dtype of the key tensor
ctx.input_dtype = key.dtype
# All tensors must live on CUDA devices to call the CUDA kernel
if (
time_decay.device.type != "cuda"
or time_first.device.type != "cuda"
or key.device.type != "cuda"
or value.device.type != "cuda"
):
raise ValueError("Calling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.")
# Take the negative exponential of the time decay, as a contiguous float tensor
time_decay = -torch.exp(time_decay.float().contiguous())
# If the key is float16, convert time_first, key and value to float32
if key.dtype == torch.float16:
time_first = time_first.float()
key = key.float()
value = value.float()
# Make sure time_first, key and value are contiguous in memory
time_first = time_first.contiguous()
key = key.contiguous()
value = value.contiguous()
# Allocate the output tensor with the same layout as key; the CUDA kernel fills it in
output = torch.empty_like(key, memory_format=torch.contiguous_format)
if return_state or state is not None:
# If no state is provided, create an all-zero state and initialise its last slice to -1e38
if state is None:
state = torch.zeros(
batch_size,
hidden_size,3,
dtype=torch.float32,
device=key.device,
memory_format=torch.contiguous_format,
)
state[:, :, 2] -= 1e38
else:
# Otherwise stack the provided state tensors along a new dimension, contiguously
state = torch.cat([s.unsqueeze(2) for s in state], dim=2).contiguous()
# Pick the forward function that also handles the state, depending on the key dtype
if key.dtype == torch.bfloat16:
forward_func = rwkv_cuda_kernel.forward_with_state_bf16
else:
forward_func = rwkv_cuda_kernel.forward_with_state
# Run the CUDA kernel forward pass, passing the state along
forward_func(time_decay, time_first, key, value, output, state)
else:
# Otherwise pick the stateless forward function matching the key dtype
forward_func = rwkv_cuda_kernel.forward_bf16 if key.dtype == torch.bfloat16 else rwkv_cuda_kernel.forward
# Run the CUDA kernel forward pass without a state
forward_func(time_decay, time_first, key, value, output)
# Save the inputs and output for the backward pass
ctx.save_for_backward(time_decay, time_first, key, value, output)
# If a state was used, split it back into its three components
if state is not None:
state = [s.squeeze(2) for s in torch.chunk(state, 3, dim=2)]
# Return the output cast back to the input dtype, together with the state
return output.to(ctx.input_dtype), state
@staticmethod
def backward(ctx, g_output, g_state=None):
# "g" stands for gradient
# Recover the input dtype
input_dtype = ctx.input_dtype
# Restore the tensors saved during the forward pass
time_decay, time_first, key, value, output = ctx.saved_tensors
# The CUDA kernel will fill the gradient tensors below.
# Allocate gradient tensors matching the input dtype
g_time_decay = torch.empty_like(
time_decay,
memory_format=torch.contiguous_format,
dtype=torch.bfloat16 if input_dtype == torch.bfloat16 else torch.float32,
)
g_time_first = torch.empty_like(time_first, memory_format=torch.contiguous_format)
g_key = torch.empty_like(key, memory_format=torch.contiguous_format)
g_value = torch.empty_like(value, memory_format=torch.contiguous_format)
# If the inputs were float16, convert the incoming gradient to float32
if input_dtype == torch.float16:
g_output = g_output.float()
# Pick the CUDA backward function matching the dtype
backward_func = rwkv_cuda_kernel.backward_bf16 if input_dtype == torch.bfloat16 else rwkv_cuda_kernel.backward
backward_func(
time_decay,
time_first,
key,
value,
output,
g_output.contiguous(),  # contiguous view of the incoming gradient
g_time_decay,
g_time_first,
g_key,
g_value,
)
# Cast the computed gradients back to the input dtype and return them
return (
g_time_decay.to(input_dtype),
g_time_first.to(input_dtype),
g_key.to(input_dtype),
g_value.to(input_dtype),
None,
None,
)


# CPU implementation of the linear key-value attention. When not executed under torch.no_grad, it can be slower
# and use more memory than the custom CUDA kernel.
def rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=None, return_state=False):
_, seq_length, _ = key.size()  # sequence length of the key tensor
output = torch.zeros_like(key)  # output tensor with the same shape as the key
if state is None:
# No state provided: initialise the numerator, denominator and running-maximum states
num_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
den_state = torch.zeros_like(key[:, 0], dtype=torch.float32)
max_state = torch.zeros_like(key[:, 0], dtype=torch.float32) - 1e38
else:
num_state, den_state, max_state = state  # otherwise reuse the provided state tensors
# For numerical stability
time_decay = -torch.exp(time_decay)
# Iterate over the sequence
for current_index in range(seq_length):
current_key = key[:, current_index].float()  # key at the current time step
current_value = value[:, current_index]  # value at the current time step
# Compute the linear key-value attention at time step t
max_for_output = torch.maximum(max_state, current_key + time_first)
e1 = torch.exp(max_state - max_for_output)
e2 = torch.exp(current_key + time_first - max_for_output)
numerator = e1 * num_state + e2 * current_value
denominator = e1 * den_state + e2
output[:, current_index] = (numerator / denominator).to(output.dtype)
# Update the state for the next iteration
max_for_state = torch.maximum(max_state + time_decay, current_key)
e1 = torch.exp(max_state + time_decay - max_for_state)
e2 = torch.exp(current_key - max_for_state)
num_state = e1 * num_state + e2 * current_value
den_state = e1 * den_state + e2
max_state = max_for_state
# If a state was requested or provided, return the updated state tensors as well
if return_state or state is not None:
state = [num_state, den_state, max_state]
return output, state
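A minimal usage sketch of the CPU WKV recurrence above, assuming `rwkv_linear_attention_cpu` is in scope; the tensors are random and only the shapes of the returned values are meaningful here:

```python
import torch

# Toy shapes: batch=1, seq_len=3, hidden=2
time_decay = torch.zeros(2)  # per-channel decay parameter (negated and exponentiated inside)
time_first = torch.zeros(2)  # per-channel "bonus" applied to the current token
key = torch.randn(1, 3, 2)
value = torch.randn(1, 3, 2)

out, state = rwkv_linear_attention_cpu(time_decay, time_first, key, value, return_state=True)
print(out.shape)                 # torch.Size([1, 3, 2])
print([s.shape for s in state])  # three (1, 2) tensors: numerator, denominator, running max
```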
# Dispatcher for the linear key-value attention: picks the CPU or CUDA implementation
def rwkv_linear_attention(time_decay, time_first, key, value, state=None, return_state=False):
# Check whether any tensor is not on a CUDA device, and whether the key has a single token
no_cuda = any(t.device.type != "cuda" for t in [time_decay, time_first, key, value])
one_token = key.size(1) == 1
# Fall back to the CPU implementation when no CUDA kernel is loaded, some tensor is not on CUDA,
# or there is only one token to process
if rwkv_cuda_kernel is None or no_cuda or one_token:
return rwkv_linear_attention_cpu(time_decay, time_first, key, value, state=state, return_state=return_state)
else:
# Otherwise use the custom CUDA implementation
return RwkvLinearAttention.apply(time_decay, time_first, key, value, state, return_state)


# RWKV time-mixing ("self-attention") layer
class RwkvSelfAttention(nn.Module):
def __init__(self, config, layer_id=0):
# Call the parent constructor
super().__init__()
# Keep the configuration on the module
self.config = config
# Check whether a CUDA kernel is already loaded for the configured context length
kernel_loaded = rwkv_cuda_kernel is not None and rwkv_cuda_kernel.max_seq_length == config.context_length
# If ninja and a CUDA device are available and no kernel is loaded yet, try to compile the custom kernel
if is_ninja_available() and is_torch_cuda_available() and not kernel_loaded:
try:
load_wkv_cuda_kernel(config.context_length)
except Exception:
logger.info("Could not load the custom CUDA kernel for RWKV attention.")
# Keep the layer id on the module
self.layer_id = layer_id
# Hidden size from the config
hidden_size = config.hidden_size
# Attention hidden size; defaults to the hidden size when not specified
attention_hidden_size = (
config.attention_hidden_size if config.attention_hidden_size is not None else hidden_size
)
self.attention_hidden_size = attention_hidden_size
# Time-decay parameter of the attention mechanism
self.time_decay = nn.Parameter(torch.empty(attention_hidden_size))
# Time-first ("bonus") parameter of the attention mechanism
self.time_first = nn.Parameter(torch.empty(attention_hidden_size))
# Time-mix parameters for key, value and receptance
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_value = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
# Time-shift layer: zero padding along the time dimension only
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
# Linear projections from the hidden size to the attention hidden size, without bias
self.key = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.value = nn.Linear(hidden_size, attention_hidden_size, bias=False)
self.receptance = nn.Linear(hidden_size, attention_hidden_size, bias=False)
# Output projection back to the hidden size, without bias
self.output = nn.Linear(attention_hidden_size, hidden_size, bias=False)

# TODO: maybe jit, otherwise move inside forward
def extract_key_value(self, hidden, state=None):
# Mix the current hidden states with those of the previous time step to build key, value and receptance
if hidden.size(1) == 1 and state is not None:
# Single time step with a state: take the previous value for this layer from the state
shifted = state[1][:, :, self.layer_id]
else:
# Otherwise shift the hidden states one step back in time with the padding layer
shifted = self.time_shift(hidden)
# If a state is provided, mix the previous step's value into the first position
if state is not None:
shifted[:, 0] = state[1][:, :, self.layer_id]
# Mix current and shifted hidden states with the time-mix parameters
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
value = hidden * self.time_mix_value + shifted * (1 - self.time_mix_value)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
# Project key, value and receptance to the attention hidden size; the receptance goes through a sigmoid
key = self.key(key)
value = self.value(value)
receptance = torch.sigmoid(self.receptance(receptance))
# If a state is provided, store the last hidden state of the current step in it
if state is not None:
state[1][:, :, self.layer_id] = hidden[:, -1]
# Return receptance, key, value and the (possibly updated) state
return receptance, key, value, state
# Forward pass over the input hidden states, optionally using the cached state
def forward(self, hidden, state=None, use_cache=False):
# Extract receptance, key and value from the hidden states and update the state
receptance, key, value, state = self.extract_key_value(hidden, state=state)
# If a state exists, slice out this layer's attention state (numerator, denominator, max)
layer_state = tuple(s[:, :, self.layer_id] for s in state[2:]) if state is not None else None
# Run the RWKV linear attention with the time-decay and time-first parameters
rwkv, layer_state = rwkv_linear_attention(
self.time_decay,
self.time_first,
key,
value,
state=layer_state,
return_state=use_cache,
)
# If a layer state was returned, write it back into the global state
if layer_state is not None:
state[2][:, :, self.layer_id] = layer_state[0]
state[3][:, :, self.layer_id] = layer_state[1]
state[4][:, :, self.layer_id] = layer_state[2]
# Return the output projection of receptance * rwkv, together with the state
return self.output(receptance * rwkv), state
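During generation the state passed between calls is a list of five tensors, each of shape `(batch_size, hidden_size, num_hidden_layers)`: slot 0 holds the previous hidden state for the feed-forward layers, slot 1 the previous hidden state for the attention layers, and slots 2-4 the WKV numerator, denominator and running maximum. A hedged sketch of building such an empty state by hand is shown below; this helper is hypothetical (the library builds the state internally), and the `-1e38` initialisation mirrors the CPU kernel above:

```python
import torch

def make_empty_state(config, batch_size, device="cpu", dtype=torch.float32):
    # Hypothetical helper, not part of the library API
    shape = (batch_size, config.hidden_size, config.num_hidden_layers)
    state = [torch.zeros(shape, dtype=dtype if i <= 1 else torch.float32, device=device) for i in range(5)]
    state[4] -= 1e38  # running maximum starts very negative, as in rwkv_linear_attention_cpu
    return state
```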
# Feed-forward (channel-mixing) module of RWKV
class RwkvFeedForward(nn.Module):
# Constructor taking the configuration and the layer id
def __init__(self, config, layer_id=0):
super().__init__()
# Keep the configuration and the layer id on the module
self.config = config
self.layer_id = layer_id
# Hidden and intermediate sizes from the config
hidden_size = config.hidden_size
intermediate_size = (
config.intermediate_size if config.intermediate_size is not None else 4 * config.hidden_size
)
# Time-shift layer: zero padding along the time dimension
self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))
# Trainable time-mix parameters for key and receptance
self.time_mix_key = nn.Parameter(torch.empty(1, 1, hidden_size))
self.time_mix_receptance = nn.Parameter(torch.empty(1, 1, hidden_size))
# Linear layers producing the key, receptance and value
self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
self.receptance = nn.Linear(hidden_size, hidden_size, bias=False)
self.value = nn.Linear(intermediate_size, hidden_size, bias=False)

# Forward pass over the hidden states and optional state
def forward(self, hidden, state=None):
# Single time step with a state: take the previous value for this layer from the state
if hidden.size(1) == 1 and state is not None:
shifted = state[0][:, :, self.layer_id]
else:
# Otherwise shift the hidden states back one step in time, patching in the state if provided
shifted = self.time_shift(hidden)
if state is not None:
shifted[:, 0] = state[0][:, :, self.layer_id]
# Mix current and shifted hidden states with the time-mix parameters to get key and receptance
key = hidden * self.time_mix_key + shifted * (1 - self.time_mix_key)
receptance = hidden * self.time_mix_receptance + shifted * (1 - self.time_mix_receptance)
# Squared ReLU on the key projection
key = torch.square(torch.relu(self.key(key)))
# Project the activated key to the value
value = self.value(key)
# Sigmoid on the receptance projection
receptance = torch.sigmoid(self.receptance(receptance))
# If a state is provided, store the last hidden state of the current step in it
if state is not None:
state[0][:, :, self.layer_id] = hidden[:, -1]
# Return the receptance-gated value and the state
return receptance * value, state
# A single RWKV block: attention (time mixing) followed by feed forward (channel mixing)
class RwkvBlock(nn.Module):
# Constructor taking the configuration and the layer id
def __init__(self, config, layer_id):
super().__init__()
# Keep the configuration and the layer id on the module
self.config = config
self.layer_id = layer_id

# Block 0 applies an extra LayerNorm to the embeddings before anything else
if layer_id == 0:
self.pre_ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
# LayerNorms applied before the attention and the feed-forward sub-layers
self.ln1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
self.ln2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_epsilon)
# Attention and feed-forward sub-modules
self.attention = RwkvSelfAttention(config, layer_id)
self.feed_forward = RwkvFeedForward(config, layer_id)

# Forward pass over the hidden states, with optional state, caching and attention outputs
def forward(self, hidden, state=None, use_cache=False, output_attentions=False):
# Block 0 normalizes the embeddings first
if self.layer_id == 0:
hidden = self.pre_ln(hidden)
# Attention sub-layer with residual connection
attention, state = self.attention(self.ln1(hidden), state=state, use_cache=use_cache)
hidden = hidden + attention
# Feed-forward sub-layer with residual connection
feed_forward, state = self.feed_forward(self.ln2(hidden), state=state)
hidden = hidden + feed_forward
# Collect the outputs: hidden states, state, and optionally the attention output
outputs = (hidden, state)
if output_attentions:
outputs += (attention,)
else:
outputs += (None,)
return outputs
# Abstract base class for RWKV models, built on PreTrainedModel
class RwkvPreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading
pretrained models.
"""

# Configuration class for this model family
config_class = RwkvConfig
# Prefix of the base model
base_model_prefix = "rwkv"
# Modules that must not be split across devices
_no_split_modules = ["RwkvBlock"]
# Modules that must be kept in FP32
_keep_in_fp32_modules = ["time_decay", "time_first"]
# Gradient checkpointing is supported
supports_gradient_checkpointing = True

def _init_weights(self, module):
"""Initialize the weights."""
# RwkvSelfAttention modules
if isinstance(module, RwkvSelfAttention):
# Layer id and total number of hidden layers
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
attention_hidden_size = module.attention_hidden_size
# Ratio from 0 to 1 over the depth of the model
ratio_0_to_1 = layer_id / (num_hidden_layers - 1)  # 0 to 1
# Ratio from 1 to ~0 over the depth of the model
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

# Per-channel time weight used to initialise the time-mix parameters
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
# Decay speed per attention channel, scaled with the depth of the layer
decay_speed = [
-5 + 8 * (h / (attention_hidden_size - 1)) ** (0.7 + 1.3 * ratio_0_to_1)
for h in range(attention_hidden_size)
]
decay_speed = torch.tensor(decay_speed, dtype=module.time_decay.dtype, device=module.time_decay.device)
# Zigzag pattern used for the time-first ("bonus") parameter
zigzag = (
torch.tensor(
[(i + 1) % 3 - 1 for i in range(attention_hidden_size)],
dtype=module.time_first.dtype,
device=module.time_first.device,
)
* 0.5
)
# Set the time-decay, time-first and time-mix parameters without tracking gradients
with torch.no_grad():
module.time_decay.data = decay_speed
module.time_first.data = torch.ones_like(module.time_first) * math.log(0.3) + zigzag
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_value.data = torch.pow(time_weight, ratio_1_to_almost0) + 0.3 * ratio_0_to_1
module.time_mix_receptance.data = torch.pow(time_weight, 0.5 * ratio_1_to_almost0)
# RwkvFeedForward modules
elif isinstance(module, RwkvFeedForward):
# Layer id and total number of hidden layers
layer_id = module.layer_id
num_hidden_layers = module.config.num_hidden_layers
hidden_size = module.config.hidden_size
# Ratio from 1 to ~0 over the depth of the model
ratio_1_to_almost0 = 1.0 - (layer_id / num_hidden_layers)  # 1 to ~0

# Per-channel time weight used to initialise the time-mix parameters
time_weight = torch.tensor(
[i / hidden_size for i in range(hidden_size)],
dtype=module.time_mix_key.dtype,
device=module.time_mix_key.device,
)
time_weight = time_weight[None, None, :]
# Set the time-mix parameters without tracking gradients
with torch.no_grad():
module.time_mix_key.data = torch.pow(time_weight, ratio_1_to_almost0)
module.time_mix_receptance.data = torch.pow(time_weight, ratio_1_to_almost0)


# Data class wrapping the outputs of the RWKV model
@dataclass
class RwkvOutput(ModelOutput):
"""
Class for the RWKV model outputs.
Args:
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Sequence of hidden-states at the output of the last layer of the model.
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""# 定义 RWKV 模型的输出属性
last_hidden_state: torch.FloatTensor =None# 最后一层模型的隐藏状态
state: Optional[List[torch.FloatTensor]]=None# 模型在最后时间步的状态
hidden_states: Optional[Tuple[torch.FloatTensor,...]]=None# 每层模型的隐藏状态
attentions: Optional[Tuple[torch.FloatTensor,...]]=None# 每层注意力权重"""
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
avoid providing the old `input_ids`.
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
"""# 损失值,用于语言建模任务中的下一个标记预测,当提供了`labels`时返回
loss: Optional[torch.FloatTensor]=None# 语言建模头部的预测分数,即在应用SoftMax之前每个词汇标记的分数,形状为`(batch_size, sequence_length, config.vocab_size)`
logits: torch.FloatTensor =None# 模型在最后一个时间步的状态,可以在下一个`input_ids`的前向方法中使用,避免提供旧的`input_ids`
state: Optional[List[torch.FloatTensor]]=None# 模型每一层的隐藏状态的元组,包括(如果存在)嵌入层的输出,形状为`(batch_size, sequence_length, hidden_size)`
hidden_states: Optional[Tuple[torch.FloatTensor,...]]=None# 注意力权重的元组,用于自注意力头部中的加权平均计算,形状为`(batch_size, num_heads, sequence_length, sequence_length)`
attentions: Optional[Tuple[torch.FloatTensor,...]]=None# RWKV_START_DOCSTRING 定义了一个多行字符串,用于描述某个模型类的文档字符串。# 文档字符串解释了该模型继承自 PreTrainedModel,列出了该库对所有模型实现的通用方法(如下载或保存模型、调整输入嵌入、剪枝头部等)。# 这个模型也是 PyTorch 的 torch.nn.Module 的子类,可以像普通的 PyTorch 模块一样使用,所有与一般使用和行为相关的事项请参考 PyTorch 文档。
RWKV_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            This is currently not used by `RwkvModel`, but will be supported in the future.

            [What are attention masks?](../glossary#attention-mask)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
            than the model's internal embedding lookup matrix.
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
# RwkvModel: the bare RWKV transformer, built on RwkvPreTrainedModel
@add_start_docstrings(
"The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.",
RWKV_START_DOCSTRING,
)
class RwkvModel(RwkvPreTrainedModel):
def __init__(self, config):
super().__init__(config)
# Embedding layer with the configured vocabulary size and hidden size
self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
# Stack of RwkvBlock layers, one per hidden layer in the config
self.blocks = nn.ModuleList([RwkvBlock(config, layer_id=idx) for idx in range(config.num_hidden_layers)])
# Final LayerNorm applied to the hidden states
self.ln_out = nn.LayerNorm(config.hidden_size)

# Whether the layer weights are currently rescaled for inference
self.layers_are_rescaled = False
# Whether gradient checkpointing is enabled
self.gradient_checkpointing = False

# Run the remaining initialization (weight init and final processing)
self.post_init()

# Return the input embedding layer
def get_input_embeddings(self):
return self.embeddings

# Set a new input embedding layer
def set_input_embeddings(self, new_embeddings):
self.embeddings = new_embeddings
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvOutput,
config_class=_CONFIG_FOR_DOC,
)
# The forward method takes the inputs documented above and runs the model's forward pass
def forward(
self,
input_ids: Optional[torch.LongTensor]=None,
attention_mask: Optional[torch.LongTensor]=None,# noqa
inputs_embeds: Optional[torch.FloatTensor]=None,
state: Optional[List[torch.FloatTensor]]=None,
use_cache: Optional[bool]=None,
output_attentions: Optional[bool]=None,
output_hidden_states: Optional[bool]=None,
return_dict: Optional[bool] = None,

def _rescale_layers(self):
# Layers should be rescaled for inference only.
if self.layers_are_rescaled == (not self.training):
return
# Only rescale when a rescaling interval is configured
if self.config.rescale_every > 0:
# Rescale without tracking gradients
with torch.no_grad():
# Iterate over the blocks of the model
for block_id, block in enumerate(self.blocks):
if self.training:
# During training, scale the weights up
block.attention.output.weight.mul_(2**int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.mul_(2 ** int(block_id // self.config.rescale_every))
else:
# During inference, handle quantized weights through their quantization statistics
if hasattr(block.attention.output.weight, "SCB"):
block.attention.output.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.SCB.div_(2 ** int(block_id // self.config.rescale_every))
elif hasattr(block.attention.output.weight, "quant_state"):
# 4-bit weights: dequantize, rescale, and re-quantize
self._bnb_4bit_dequantize_and_rescale(block.attention.output, block_id)
self._bnb_4bit_dequantize_and_rescale(block.feed_forward.value, block_id)
else:
# Default case: divide the weights directly
block.attention.output.weight.div_(2 ** int(block_id // self.config.rescale_every))
block.feed_forward.value.weight.div_(2 ** int(block_id // self.config.rescale_every))
# Record whether the layers are currently rescaled
self.layers_are_rescaled = not self.training

def _bnb_4bit_dequantize_and_rescale(self, target_layer, block_id):
r"""
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
"""# Check if bitsandbytes library is availableifnot is_bitsandbytes_available():raise ImportError("Please install bitsandbytes to use this method.")import bitsandbytes as bnb
# Dequantize 4-bit weights
dequant_weights = bnb.functional.dequantize_4bit(target_layer.weight.data, target_layer.weight.quant_state)# Rescale weights
dequant_weights.div_(2**int(block_id // self.config.rescale_every))# Re-quantize the weights# Move weights to CPU and back to device to handle quantization
quant_weight = bnb.nn.Params4bit(dequant_weights.to("cpu"), requires_grad=False).to(dequant_weights.device)setattr(target_layer,"weight", quant_weight)@add_start_docstrings("""
The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
embeddings).
""",
RWKV_START_DOCSTRING,
)
class RwkvForCausalLM(RwkvPreTrainedModel):
_tied_weights_keys = ["head.weight"]

def __init__(self, config):
super().__init__(config)
self.rwkv = RwkvModel(config)  # the RWKV backbone
self.head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)  # language-modeling output layer

# Initialize weights and apply final processing
self.post_init()

def get_output_embeddings(self):
return self.head  # return the output embedding layer

def set_output_embeddings(self, new_embeddings):
self.head = new_embeddings  # set a new output embedding layer

def generate(self, *args, **kwargs):
# Thin wrapper to raise exceptions when trying to generate with methods that manipulate `past_key_values`.
# RWKV is one of the few models that don't have it (it has `state` instead, which has different properties and
# usage).
try:
gen_output = super().generate(*args, **kwargs)
except AttributeError as exc:
# Expected exception: "AttributeError: '(object name)' object has no attribute 'past_key_values'"
if "past_key_values" in str(exc):
raise AttributeError(
"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`. RWKV "
"doesn't have that attribute, try another generation strategy instead. For the available "
"generation strategies, check this doc: https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
)
else:
raise exc
return gen_output

def prepare_inputs_for_generation(self, input_ids, state=None, inputs_embeds=None, **kwargs):
# Only keep the last token of input_ids when a state is passed along
if state is not None:
input_ids = input_ids[:, -1].unsqueeze(-1)
# If `inputs_embeds` are passed, only use them in the first generation step
if inputs_embeds is not None and state is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
# Attach the state to the model inputs
model_inputs["state"] = state
return model_inputs
@add_start_docstrings_to_model_forward(RWKV_INPUTS_DOCSTRING)@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=RwkvCausalLMOutput,
config_class=_CONFIG_FOR_DOC,
)
def forward(
self,
input_ids: Optional[torch.LongTensor]=None,
attention_mask: Optional[torch.LongTensor]=None,# noqa
inputs_embeds: Optional[torch.FloatTensor]=None,
state: Optional[List[torch.FloatTensor]]=None,
labels: Optional[torch.LongTensor]=None,
use_cache: Optional[bool]=None,
output_attentions: Optional[bool]=None,
output_hidden_states: Optional[bool]=None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, RwkvCausalLMOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
"""# 如果 return_dict 为 None,则使用模型配置中的默认值
return_dict = return_dict if return_dict isnotNoneelse self.config.use_return_dict
# 调用 rwkv 方法进行前向传播
rwkv_outputs = self.rwkv(
input_ids,
inputs_embeds=inputs_embeds,
state=state,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# Hidden states from the RWKV outputs
hidden_states = rwkv_outputs[0]
# Project the hidden states to the vocabulary with the language-modeling head
logits = self.head(hidden_states)

# Compute the loss when labels are provided
loss = None
if labels is not None:
# Move the labels to the same device as the logits (to enable model parallelism)
labels = labels.to(logits.device)
# Shift the logits one position so they align with the labels
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Cross-entropy loss over the flattened tokens
loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

# If return_dict is False, return a plain tuple
if not return_dict:
output = (logits,) + rwkv_outputs[1:]
return ((loss,) + output) if loss is not None else output

# Otherwise return a RwkvCausalLMOutput
return RwkvCausalLMOutput(
loss=loss,
logits=logits,
state=rwkv_outputs.state,
hidden_states=rwkv_outputs.hidden_states,
attentions=rwkv_outputs.attentions,
)
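A hedged end-to-end generation sketch with this head; `RWKV/rwkv-4-169m-pile` is the checkpoint referenced by `_CHECKPOINT_FOR_DOC` above, and the run requires network access to download it:

```python
from transformers import AutoTokenizer, RwkvForCausalLM

tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-4-169m-pile")
model = RwkvForCausalLM.from_pretrained("RWKV/rwkv-4-169m-pile")

inputs = tokenizer("In a shocking finding, scientists discovered", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```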
.\models\sam\configuration_sam.py

# coding=utf-8
# Copyright The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" SAM model configuration"""

# Import the PretrainedConfig base class and the logging utilities
from ...configuration_utils import PretrainedConfig
from ...utils import logging

# Get the logger for the current module
logger = logging.get_logger(__name__)

# Map from SAM pretrained model identifiers to their configuration files
SAM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "facebook/sam-vit-huge": "https://huggingface.co/facebook/sam-vit-huge/resolve/main/config.json",
    "facebook/sam-vit-large": "https://huggingface.co/facebook/sam-vit-large/resolve/main/config.json",
    "facebook/sam-vit-base": "https://huggingface.co/facebook/sam-vit-base/resolve/main/config.json",
}


class SamPromptEncoderConfig(PretrainedConfig):
r"""
    This is the configuration class to store the configuration of a [`SamPromptEncoder`]. The [`SamPromptEncoder`]
    module is used to encode the input 2D points and bounding boxes. Instantiating a configuration defaults will
    yield a similar configuration to that of the SAM-vit-h
    [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the hidden states.
        image_size (`int`, *optional*, defaults to 1024):
            The expected output resolution of the image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        mask_input_channels (`int`, *optional*, defaults to 16):
            The number of channels to be fed to the `MaskDecoder` module.
        num_point_embeddings (`int`, *optional*, defaults to 4):
            The number of point embeddings to be used.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the encoder and pooler.
    """

def __init__(
self,
hidden_size=256,
image_size=1024,
patch_size=16,
mask_input_channels=16,
num_point_embeddings=4,
hidden_act="gelu",
layer_norm_eps=1e-6,
**kwargs,
):
# Call the parent constructor, forwarding all keyword arguments
super().__init__(**kwargs)
# Hidden size
self.hidden_size = hidden_size
# Image size
self.image_size = image_size
# Patch size
self.patch_size = patch_size
# Image embedding size: image size divided by patch size
self.image_embedding_size = image_size // patch_size
# Number of mask input channels
self.mask_input_channels = mask_input_channels
# Number of point embeddings
self.num_point_embeddings = num_point_embeddings
# Hidden activation function
self.hidden_act = hidden_act
# Epsilon used by the layer normalization layers
self.layer_norm_eps = layer_norm_eps
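A small sketch of instantiating this configuration with its defaults; the derived `image_embedding_size` is `image_size // patch_size`:

```python
from transformers.models.sam.configuration_sam import SamPromptEncoderConfig

prompt_config = SamPromptEncoderConfig()
print(prompt_config.hidden_size, prompt_config.image_embedding_size)  # 256 64
```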
# `SamMaskDecoderConfig` stores the configuration of a `SamMaskDecoder`.
# It inherits from `PretrainedConfig` and is used to control the model outputs.
# Instantiating it with the defaults yields a configuration similar to that of the
# `facebook/sam-vit-huge` architecture.
class SamMaskDecoderConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamMaskDecoder`]. It is used to instantiate a SAM
mask decoder to the specified arguments, defining the model architecture. Instantiating a configuration defaults
will yield a similar configuration to that of the SAM-vit-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden states.
hidden_act (`str`, *optional*, defaults to `"relu"`):
The non-linear activation function used inside the `SamMaskDecoder` module.
mlp_dim (`int`, *optional*, defaults to 2048):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
num_hidden_layers (`int`, *optional*, defaults to 2):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 8):
Number of attention heads for each attention layer in the Transformer encoder.
attention_downsample_rate (`int`, *optional*, defaults to 2):
The downsampling rate of the attention layer.
num_multimask_outputs (`int`, *optional*, defaults to 3):
The number of outputs from the `SamMaskDecoder` module. In the Segment Anything paper, this is set to 3.
iou_head_depth (`int`, *optional*, defaults to 3):
The number of layers in the IoU head module.
iou_head_hidden_dim (`int`, *optional*, defaults to 256):
The dimensionality of the hidden states in the IoU head module.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
"""# 初始化方法,用于设置配置参数def__init__(
self,
hidden_size=256,
hidden_act="relu",
mlp_dim=2048,
num_hidden_layers=2,
num_attention_heads=8,
attention_downsample_rate=2,
num_multimask_outputs=3,
iou_head_depth=3,
iou_head_hidden_dim=256,
layer_norm_eps=1e-6,
**kwargs,
):
# Call the parent class initializer
super().__init__(**kwargs)
# Set each configuration parameter
self.hidden_size = hidden_size
self.hidden_act = hidden_act
self.mlp_dim = mlp_dim
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.attention_downsample_rate = attention_downsample_rate
self.num_multimask_outputs = num_multimask_outputs
self.iou_head_depth = iou_head_depth
self.iou_head_hidden_dim = iou_head_hidden_dim
self.layer_norm_eps = layer_norm_eps
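# A hedged usage sketch (illustrative only, assuming `SamMaskDecoderConfig` is importable
# from `transformers`): every argument is optional, so overriding a single field keeps the
# remaining SAM-vit-h style defaults shown above:
#
#     >>> from transformers import SamMaskDecoderConfig
#     >>> decoder_cfg = SamMaskDecoderConfig(num_multimask_outputs=3)
#     >>> (decoder_cfg.hidden_size, decoder_cfg.iou_head_depth)
#     (256, 3)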
class SamVisionConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`SamVisionModel`]. It is used to instantiate a SAM
vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the SAM ViT-h
[facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
hidden_size (`int`, *optional*, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
output_channels (`int`, *optional*, defaults to 256):
Dimensionality of the output channels in the Patch Encoder.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
Number of channels in the input image.
image_size (`int`, *optional*, defaults to 1024):
Expected resolution of the input image.
patch_size (`int`, *optional*, defaults to 16):
Size of the patches extracted from the input image.
hidden_act (`str`, *optional*, defaults to `"gelu"`):
The non-linear activation function.
layer_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities (0.0 means no dropout is used).
initializer_range (`float`, *optional*, defaults to 1e-10):
The standard deviation of the truncated normal distribution used to initialize all weight matrices.
qkv_bias (`bool`, *optional*, defaults to `True`):
Whether to add a bias to the query, key, value projections.
mlp_ratio (`float`, *optional*, defaults to 4.0):
Ratio of the MLP hidden dimensionality to the embedding dimensionality.
use_abs_pos (`bool`, *optional*, defaults to `True`):
Whether to use absolute position embeddings.
use_rel_pos (`bool`, *optional*, defaults to `True`):
Whether to use relative position embeddings.
window_size (`int`, *optional*, defaults to 14):
Window size for the relative positions.
global_attn_indexes (`List[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
The indexes of the global attention layers.
num_pos_feats (`int`, *optional*, defaults to 128):
Dimensionality of the position embeddings.
mlp_dim (`int`, *optional*):
Dimensionality of the MLP layer in the Transformer encoder. If `None`, defaults to `mlp_ratio * hidden_size`.
"""

# Initialization function that sets the various parameters of the Transformer model
def __init__(
self,
hidden_size=768,  # hidden size, defaults to 768
output_channels=256,  # number of output channels, defaults to 256
num_hidden_layers=12,  # number of hidden layers, defaults to 12
num_attention_heads=12,  # number of attention heads, defaults to 12
num_channels=3,  # number of input image channels, defaults to 3 (RGB)
image_size=1024,  # input image size, defaults to 1024x1024 pixels
patch_size=16,  # patch size, defaults to 16x16 pixels
hidden_act="gelu",  # hidden activation function, defaults to GELU
layer_norm_eps=1e-06,  # layer normalization epsilon, defaults to 1e-06
attention_dropout=0.0,  # attention dropout ratio, defaults to 0.0 (no dropout)
initializer_range=1e-10,  # weight initialization range, defaults to 1e-10
qkv_bias=True,  # whether to use a bias in the QKV projections, defaults to True
mlp_ratio=4.0,  # MLP expansion ratio, defaults to 4.0
use_abs_pos=True,  # whether to use absolute position embeddings, defaults to True
use_rel_pos=True,  # whether to use relative position embeddings, defaults to True
window_size=14,  # local attention window size, defaults to 14
global_attn_indexes=[2, 5, 8, 11],  # indexes of the global attention layers, defaults to [2, 5, 8, 11]
num_pos_feats=128,  # number of position features, defaults to 128
mlp_dim=None,  # MLP dimensionality; if None, defaults to hidden_size * mlp_ratio, otherwise the given value is used
**kwargs,  # any other unspecified keyword arguments
):
super().__init__(**kwargs)  # call the parent class initializer
self.hidden_size = hidden_size  # set the hidden size
self.output_channels = output_channels  # set the number of output channels
self.num_hidden_layers = num_hidden_layers  # set the number of hidden layers
self.num_attention_heads = num_attention_heads  # set the number of attention heads
self.num_channels = num_channels  # set the number of input image channels
self.image_size = image_size  # set the input image size
self.patch_size = patch_size  # set the patch size
self.hidden_act = hidden_act  # set the hidden activation function
self.layer_norm_eps = layer_norm_eps  # set the layer normalization epsilon
self.attention_dropout = attention_dropout  # set the attention dropout ratio
self.initializer_range = initializer_range  # set the weight initialization range
self.qkv_bias = qkv_bias  # set whether to use a bias in the QKV projections
self.mlp_ratio = mlp_ratio  # set the MLP expansion ratio
self.use_abs_pos = use_abs_pos  # set whether to use absolute position embeddings
self.use_rel_pos = use_rel_pos  # set whether to use relative position embeddings
self.window_size = window_size  # set the local attention window size
self.global_attn_indexes = global_attn_indexes  # set the indexes of the global attention layers
self.num_pos_feats = num_pos_feats  # set the number of position features
self.mlp_dim = int(hidden_size * mlp_ratio) if mlp_dim is None else mlp_dim  # if mlp_dim is not given, compute it as hidden_size * mlp_ratio
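# A small illustrative sketch (assumes `SamVisionConfig` is importable from `transformers`):
# with the defaults above, `mlp_dim` is derived as `int(hidden_size * mlp_ratio)` whenever it
# is not given explicitly, and an explicit value takes precedence:
#
#     >>> from transformers import SamVisionConfig
#     >>> vision_cfg = SamVisionConfig()
#     >>> vision_cfg.mlp_dim  # int(768 * 4.0)
#     3072
#     >>> SamVisionConfig(mlp_dim=2048).mlp_dim
#     2048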
# Define the `SamConfig` class, which stores the configuration of a `SamModel` and inherits from `PretrainedConfig`.
class SamConfig(PretrainedConfig):
# Docstring describing what `SamConfig` does, how to use it, and how to instantiate the related SAM model parameters.
r"""
[`SamConfig`] is the configuration class to store the configuration of a [`SamModel`]. It is used to instantiate a
SAM model according to the specified arguments, defining the vision model, prompt-encoder model and mask decoder
configs. Instantiating a configuration with the defaults will yield a similar configuration to that of the
SAM-ViT-H [facebook/sam-vit-huge](https://huggingface.co/facebook/sam-vit-huge) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vision_config (Union[`dict`, `SamVisionConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamVisionConfig`].
prompt_encoder_config (Union[`dict`, `SamPromptEncoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamPromptEncoderConfig`].
mask_decoder_config (Union[`dict`, `SamMaskDecoderConfig`], *optional*):
Dictionary of configuration options used to initialize [`SamMaskDecoderConfig`].
kwargs (*optional*):
Dictionary of keyword arguments.
Example:
```
>>> from transformers import (
...     SamVisionConfig,
...     SamPromptEncoderConfig,
...     SamMaskDecoderConfig,
...     SamModel,
... )

>>> # Initializing a SamConfig with `"facebook/sam-vit-huge"` style configuration
>>> configuration = SamConfig()

>>> # Initializing a SamModel (with random weights) from the `"facebook/sam-vit-huge"` style configuration
>>> model = SamModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a SamConfig from a SamVisionConfig, SamPromptEncoderConfig, and SamMaskDecoderConfig

>>> # Initializing SAM vision, prompt-encoder, and mask-decoder configurations
>>> vision_config = SamVisionConfig()
>>> prompt_encoder_config = SamPromptEncoderConfig()
>>> mask_decoder_config = SamMaskDecoderConfig()

>>> config = SamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
```"""
# Class attribute `model_type`, which specifies the model type as "sam".
model_type = "sam"

# Constructor `__init__`, used to initialize an instance of the `SamConfig` class.
def __init__(
self,
vision_config=None,
prompt_encoder_config=None,
mask_decoder_config=None,
initializer_range=0.02,
**kwargs,
):
# Call the parent constructor, passing through all keyword arguments
super().__init__(**kwargs)

# If vision_config is not None, use it; otherwise fall back to an empty dict
vision_config = vision_config if vision_config is not None else {}
# If prompt_encoder_config is not None, use it; otherwise fall back to an empty dict
prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {}
# If mask_decoder_config is not None, use it; otherwise fall back to an empty dict
mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {}

# If vision_config is an instance of SamVisionConfig, convert it to a dict
if isinstance(vision_config, SamVisionConfig):
    vision_config = vision_config.to_dict()
# If prompt_encoder_config is an instance of SamPromptEncoderConfig, convert it to a dict
if isinstance(prompt_encoder_config, SamPromptEncoderConfig):
    prompt_encoder_config = prompt_encoder_config.to_dict()
# If mask_decoder_config is an instance of SamMaskDecoderConfig, convert it to a dict
if isinstance(mask_decoder_config, SamMaskDecoderConfig):
    mask_decoder_config = mask_decoder_config.to_dict()

# Build a SamVisionConfig object from the vision_config dict
self.vision_config = SamVisionConfig(**vision_config)
# Build a SamPromptEncoderConfig object from the prompt_encoder_config dict
self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config)
# Build a SamMaskDecoderConfig object from the mask_decoder_config dict
self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config)
# Set the initializer_range instance attribute
self.initializer_range = initializer_range
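# A hedged composition sketch (assumes the classes are importable from `transformers`):
# sub-configs may be passed either as plain dicts or as config instances; instances are
# converted via `.to_dict()` and then re-expanded into the typed sub-config objects above:
#
#     >>> from transformers import SamConfig, SamVisionConfig
#     >>> config = SamConfig(vision_config={"hidden_size": 384})
#     >>> config.vision_config.hidden_size
#     384
#     >>> config = SamConfig(vision_config=SamVisionConfig())
#     >>> config.vision_config.mlp_dim
#     3072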