transformers/tokenization_utils_base.py＞ PreTrainedTokenizerBase＞from_pretrained[transformers源码解析]

强化学习曾小健

已于 2024-07-18 14:14:41 修改

阅读量1k

点赞数 11

分类专栏： # transformers源码解析　文章标签：深度学习 机器学习

于 2024-07-18 14:11:54 首次发布

本文链接：https://blog.csdn.net/sinat_37574187/article/details/140521429

版权

transformers源码解析专栏收录该内容

16 篇文章

订阅专栏

@property
    def default_chat_template(self):
        """
        Return the default chat template string, which renders a conversation in ChatML form.

        The template loops over `messages` and wraps each one between an `<|im_start|>` marker
        (followed by the role, a newline and the content) and an `<|im_end|>` marker plus a
        trailing newline. If `add_generation_prompt` is truthy when the template is rendered, an
        opening assistant header is appended after the messages.

        Format reference: https://github.com/openai/openai-python/blob/main/chatml.md
        """
        # Fragment that is repeated once per message in the conversation.
        message_loop = (
            "{% for message in messages %}"
            "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
            "{% endfor %}"
        )
        # Optional trailing fragment that opens the assistant turn.
        generation_header = (
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
        return message_loop + generation_header

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *init_inputs,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        **kwargs,
    ):
        r"""
        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
        tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                  using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
                - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                  file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
                  `./my_model_directory/vocab.txt`.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the vocabulary files and override the cached versions if
                they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            token (`str` or `bool`, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
                when running `huggingface-cli login` (stored in `~/.huggingface`).
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether or not to only rely on local files and not to attempt to download any files.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            init_inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__` method.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
                `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__` for more details.

        <Tip>

        Passing `token=True` is required when you want to use a private model.

        </Tip>

        Examples:

        ```python
        # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
        # Download vocabulary from huggingface.co and cache.
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

        # Download vocabulary from huggingface.co (user-uploaded) and cache.
        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")

        # If the tokenizer uses a single vocabulary file, you can point directly to this file
        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")

        # You can link tokens to special vocabulary when instantiating
        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", unk_token="<unk>")
        # You should be sure '<unk>' is in the vocabulary when doing that.
        # Otherwise, use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
        assert tokenizer.unk_token == "<unk>"
        ```"""
        # Extract the loading options from `kwargs`. `pop` removes each key from the dict, and the
        # second argument is the value used when the caller did not supply that key.
        resume_download = kwargs.pop("resume_download", None)
        proxies = kwargs.pop("proxies", None)
        use_auth_token = kwargs.pop("use_auth_token", None)
        subfolder = kwargs.pop("subfolder", None)
        # Underscore-prefixed keys; presumably set by internal callers rather than end users (not
        # verifiable from this excerpt).
        from_pipeline = kwargs.pop("_from_pipeline", None)
        from_auto_class = kwargs.pop("_from_auto", False)
        commit_hash = kwargs.pop("_commit_hash", None)
        # Read with `get`, not `pop`: unlike the options above, `gguf_file` stays in `kwargs`.
        gguf_file = kwargs.get("gguf_file", None)

"""
从预定义的分词器实例化一个 [`~tokenization_utils_base.PreTrainedTokenizerBase`]（或其派生类）。

参数:
pretrained_model_name_or_path (`str` 或 `os.PathLike`):
可以是以下之一:

- 一个字符串，预定义分词器的 *model id*，托管在 huggingface.co 上的模型库中。
- 包含分词器所需词汇表文件的 *目录* 的路径，例如使用 [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] 方法保存的，
例如 `./my_model_directory/`。
- (**已弃用**, 不适用于所有派生类) 单个保存的词汇表文件的路径或 URL（仅当分词器只需要一个词汇表文件时，如 Bert 或 XLNet），例如
`./my_model_directory/vocab.txt`。
cache_dir (`str` 或 `os.PathLike`, *可选*):
下载的预定义分词器词汇表文件应缓存到的目录路径，如果不使用标准缓存。
force_download (`bool`, *可选*, 默认为 `False`):
是否强制重新下载词汇表文件并覆盖缓存版本（如果存在）。
resume_download:
已弃用且被忽略。所有下载现在默认在可能时恢复。
将在 Transformers v5 中移除。
proxies (`Dict[str, str]`, *可选*):
按协议或端点使用的代理服务器字典，例如 `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`。代理在每个请求中使用。
token (`str` 或 *bool*, *可选*):
用作远程文件 HTTP 承载授权的令牌。如果为 `True`，将使用运行 `huggingface-cli login` 时生成的令牌（存储在 `~/.huggingface` 中）。
local_files_only (`bool`, *可选*, 默认为 `False`):
是否仅依赖本地文件，不尝试下载任何文件。
revision (`str`, *可选*, 默认为 `"main"`):
要使用的具体模型版本。可以是分支名、标签名或提交 ID，因为我们使用基于 git 的系统来存储 huggingface.co 上的模型和其他工件，
因此 `revision` 可以是 git 允许的任何标识符。
subfolder (`str`, *可选*):
如果相关文件位于 huggingface.co 上的模型库的子文件夹中（例如对于 facebook/rag-token-base），请在此指定。
inputs (额外的位置参数, *可选*):
将传递给 Tokenizer 的 `__init__` 方法。
trust_remote_code (`bool`, *可选*, 默认为 `False`):
是否允许使用 Hub 上在其自有建模文件中定义的自定义模型。该选项只应在您信任且已阅读过其代码的仓库上设置为 `True`，
因为它会在您的本地机器上执行 Hub 上的代码。
kwargs (额外的关键字参数, *可选*):
将传递给 Tokenizer 的 `__init__` 方法。可以用于设置特殊标记，如 `bos_token`、`eos_token`、`unk_token`、
`sep_token`、`pad_token`、`cls_token`、`mask_token`、`additional_special_tokens`。详见 `__init__` 中的参数。

<提示>

如果您想使用私有模型，传递 `token=True` 是必须的。

</提示>

示例:

```python
# 不能直接实例化基类 *PreTrainedTokenizerBase*，所以我们在派生类上展示示例: BertTokenizer
# 从 huggingface.co 下载词汇表并缓存。
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")

# 从 huggingface.co （用户上传）下载词汇表并缓存。
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

# 如果词汇表文件在目录中（例如分词器使用 *save_pretrained('./test/saved_model/')* 保存）
tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")

# 如果分词器使用单个词汇表文件，可以直接指向该文件
tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")

# 实例化时可以将标记链接到特殊词汇表
tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", unk_token="<unk>")
# 在这样做时，您应该确保 '<unk>' 在词汇表中。
# 否则请使用 tokenizer.add_special_tokens({'unk_token': '<unk>'}) 代替。
assert tokenizer.unk_token == "<unk>"
```
"""

我会逐行解释这段代码：

1. @property 装饰器:
这表明 default_chat_template 是一个属性，而不是普通方法。
2. def default_chat_template(self):
定义了 default_chat_template 属性的 getter 方法。
3. 文档字符串:
解释了这个模板使用标准的 ChatML 格式，并提供了相关链接。

4-10. 返回的模板字符串:

使用 Jinja2 模板语法。
遍历 messages 列表，为每条消息添加适当的标记。
如果 add_generation_prompt 为真，添加助手回复的起始标记。

11. @classmethod 装饰器:
表示 from_pretrained 是一个类方法，可以直接通过类调用，而不需要实例。

12-24. def from_pretrained(...):
定义了 from_pretrained 方法，用于从预训练模型加载分词器。

接受多个参数，如模型路径、缓存目录、下载选项等。
使用 Union 类型提示允许某些参数接受多种类型。

25-31. 参数处理:
从 kwargs 中提取并移除特定参数，如 resume_download, proxies 等。

32-42. 废弃警告和参数检查（注意：这一部分及之后各部分对应的代码未包含在上文的摘录中）:

警告 use_auth_token 将在未来版本中被移除。
检查 token 和 use_auth_token 是否同时被指定。

43-46. 用户代理设置:
创建一个字典来存储用户代理信息，包括文件类型、是否来自自动类等。

47-49. 离线模式检查:
如果处于离线模式，强制使用本地文件。

50-52. 初始化变量:

将路径转换为字符串。
初始化 vocab_files 和 init_configuration 字典。

这段代码主要定义了两个重要的方法：

default_chat_template：提供了默认的聊天模板，使用 ChatML 格式。
from_pretrained：一个类方法，用于从预训练模型加载分词器，处理各种加载选项和参数。

这些方法是分词器类的一部分，用于处理模型输入的格式化和从预训练模型加载分词器的过程。

让我们逐行解释这段 transformers 库中的源代码。

`default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "{{ '<|im_start|>assistant\n' }}"
        "{% endif %}"
    )
```

好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    # 这里定义了一个属性方法 `default_chat_template`，它返回一个格式化的字符串模板，用于将输入格式化为标准的 ChatML 格式。
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    # 这里定义了一个属性方法 `default_chat_template`，它返回一个格式化的字符串模板，用于将输入格式化为标准的 ChatML 格式。
    return (
        # 这个模板使用 Jinja2 模板语法格式化消息列表 messages
        "{% for message in messages %}"
        # 对每个消息 message，输出开始标签 `<|im_start|>`，接着是消息的角色（role）和内容（content），最后是结束标签 `### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    # 定义一个属性方法 `default_chat_template`，返回一个格式化的字符串模板，用于将输入格式化为标准的 ChatML 格式。
    return (
        # 使用 Jinja2 模板语法格式化消息列表 `messages`
        "{% for message in messages %}"
        # 对每个消息 `message`，输出开始标签 `<|im_start|>`，接着是消息的角色（role）和内容（content），最后是结束标签 `### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    # 这里定义了一个属性方法 `default_chat_template`，它返回一个格式化的字符串模板，用于将输入格式化为标准的 ChatML 格式。
    return (
        # 这个模板使用 Jinja2 模板语法格式化消息列表 messages
        "{% for message in messages %}"
        # 对每个消息 message，输出开始标签 `<|im_start|>`，接着是消息的角色（role）和内容（content），最后是结束标签 `好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '好的，让我们逐行解释这段 `transformers` 库中的源代码。

### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    # 定义一个属性方法 `default_chat_template`，返回一个格式化的字符串模板，用于将输入格式化为标准的 ChatML 格式。
    return (
        # 使用 Jinja2 模板语法格式化消息列表 `messages`
        "{% for message in messages %}"
        # 对每个消息 `message`，输出开始标签 `<|im_start|>`，接着是消息的角色（role）和内容（content），最后是结束标签 `### `default_chat_template` 属性

```python
@property
def default_chat_template(self):
    """
    This template formats inputs in the standard ChatML format. See
    https://github.com/openai/openai-python/blob/main/chatml.md
    """
    return (
        "{% for message in messages %}"
        "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '\n