qwen.cpp and how to download the model

1. Release mode

git clone --recursive https://github.com/QwenLM/qwen.cpp && cd qwen.cpp
git submodule update --init --recursive
python3 qwen_cpp/convert.py -i /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat -t q4_0 -o qwen7b-ggml.bin

cmake -B build
cmake --build build -j --config Release
./build/bin/main -m ./qwen7b-ggml.bin --tiktoken /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/qwen.tiktoken -p 你好

2. Debug mode

Change Release to Debug in CMakeLists.txt:

if (NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Debug)
endif ()
cmake --build build -j --config Debug
gdb ./build/bin/main

Known issue:
https://github.com/QwenLM/qwen.cpp/pull/40
The `!=` in the assert on line 139 should be `==`; otherwise the program aborts in debug mode. (Release builds define NDEBUG, which compiles the assert away, so the bug only surfaces when debugging.)

2.1 Qwen's convert.py

"""
Convert Hugging Face Qwen models to GGML format
"""
import argparse
import platform
import struct
import sys
from enum import Enum
from pathlib import Path
from typing import BinaryIO

import torch
from tabulate import tabulate
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM


GGML_QK8_0 = 32
GGML_QK4_0 = 32
GGML_QK4_1 = 32
GGML_QK5_0 = 32
GGML_QK5_1 = 32

GGML_MEM_ALIGN = 16

if platform.system() == "Darwin":
    # cpm_kernels doesn't support macOS but transformers will check missing packages, so mock it
    sys.modules["cpm_kernels"] = object()


class GGMLType(Enum):
    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8


def quantize_q8_0(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q8_0 in ggml.c
    assert tensor.shape[1] % GGML_QK8_0 == 0
    tensor = tensor.view(-1, GGML_QK8_0)
    scale = tensor.abs().max(dim=-1, keepdim=True).values / ((1 << 7) - 1)
    tensor = (tensor / scale).round().clamp(min=-128, max=127).char()
    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1)
    return tensor
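
# For intuition, a minimal dequantization sketch (my own addition, not part of
# convert.py) that inverts the block layout produced above: each output row is
# [2 bytes fp16 scale][32 int8 quants].
def dequantize_q8_0(blocks: torch.Tensor) -> torch.Tensor:
    scale = blocks[:, :2].contiguous().view(torch.half).float()  # (n_blocks, 1)
    qs = blocks[:, 2:].float()                                   # (n_blocks, 32)
    return qs * scale
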


def quantize_q4_0(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q4_0 in ggml.c; GGML_QK4_0 = 32
    assert tensor.shape[1] % GGML_QK4_0 == 0
    tensor = tensor.view(-1, GGML_QK4_0)  # reshape into 2-D blocks of 32 values each
    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices  # index of the largest-magnitude element in each block
    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)  # gather those (signed) extreme values into max_values
    scale = max_values / -8
    tensor = (tensor / scale + 8).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8
    tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), tensor), dim=-1)
    return tensor
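
# Hypothetical round-trip check (my own): unpack the two nibbles per byte, undo
# the +8 offset, and rescale; the error stays within the 4-bit quantization step.
x = torch.randn(4, 32)
q = quantize_q4_0(x)
scale = q[:, :2].contiguous().view(torch.half).float()
lo = (q[:, 2:] & 0x0F).float()          # first 16 values of each block
hi = ((q[:, 2:] >> 4) & 0x0F).float()   # last 16 values of each block
x_hat = (torch.cat((lo, hi), dim=-1) - 8) * scale
print((x - x_hat).abs().max())  # small quantization error
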


def quantize_q4_1(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q4_1 in ggml.c
    assert tensor.shape[1] % GGML_QK4_1 == 0
    tensor = tensor.view(-1, GGML_QK4_1)
    min_vals = tensor.min(dim=-1, keepdim=True).values
    max_vals = tensor.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 4) - 1)
    tensor = ((tensor - min_vals) / scale).round().clamp(min=0, max=15).char()
    # compress two int4 weights into an int8
    tensor = tensor[:, :16] | (tensor[:, 16:] << 4)
    # add scale & min into each block
    tensor = torch.cat((scale.half().view(torch.int8), min_vals.half().view(torch.int8), tensor), dim=-1)
    return tensor


def quantize_q5_0(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q5_0 in ggml.c
    assert tensor.shape[1] % GGML_QK5_0 == 0
    tensor = tensor.view(-1, GGML_QK5_0)
    abs_max_indices = tensor.abs().max(dim=-1, keepdim=True).indices
    max_values = torch.take_along_dim(tensor, abs_max_indices, dim=-1)
    scale = max_values / -16
    tensor = (tensor / scale + 16).round().clamp(min=0, max=31).char()
    # low 4 bits of each 5-bit value, two values packed per byte
    qs = (tensor[:, :16] & 0x0F) | (tensor[:, 16:] << 4)
    # gather the 5th (high) bit of all 32 values into a single int32 per block
    qh = torch.zeros(tensor.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((tensor[:, i] & 0x10) >> 4).int() << i

    # add scale into each block
    tensor = torch.cat((scale.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1)
    return tensor


def quantize_q5_1(tensor: torch.Tensor) -> torch.CharTensor:
    # equivalent to ggml_quantize_q5_1 in ggml.c
    assert tensor.shape[1] % GGML_QK5_1 == 0
    tensor = tensor.view(-1, GGML_QK5_1)
    min_vals = tensor.min(dim=-1, keepdim=True).values
    max_vals = tensor.max(dim=-1, keepdim=True).values
    scale = (max_vals - min_vals) / ((1 << 5) - 1)
    tensor = ((tensor - min_vals) / scale).round().clamp(min=0, max=31).char()
    qs = (tensor[:, :16] & 0x0F) | (tensor[:, 16:] << 4)
    qh = torch.zeros(tensor.shape[:-1], dtype=torch.int32)
    for i in range(32):
        qh |= ((tensor[:, i] & 0x10) >> 4).int() << i

    # add scale & min into each block
    tensor = torch.cat(
        (scale.half().view(torch.int8), min_vals.half().view(torch.int8), qh[..., None].view(torch.int8), qs), dim=-1
    )
    return tensor
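
# Resulting block sizes (bytes per 32 weights), matching the corresponding
# block_q* structs in ggml: q8_0: 2 + 32 = 34; q4_0: 2 + 16 = 18;
# q4_1: 2 + 2 + 16 = 20; q5_0: 2 + 4 + 16 = 22; q5_1: 2 + 2 + 4 + 16 = 24.
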


def dump_tensor(f, name: str, tensor: torch.Tensor, ggml_type: GGMLType):
    assert tensor.dtype == torch.float32

    # tensor name
    f.write(struct.pack("i", len(name.encode())))
    f.write(name.encode())

    # tensor shape & dtype
    f.write(struct.pack("i" * (2 + tensor.ndim), tensor.ndim, *tensor.shape, ggml_type.value))

    # tensor data
    if ggml_type == GGMLType.F32:
        tensor = tensor.float()
    elif ggml_type == GGMLType.F16:
        tensor = tensor.half()
    elif ggml_type == GGMLType.Q8_0:
        tensor = quantize_q8_0(tensor)
    elif ggml_type == GGMLType.Q4_0:
        tensor = quantize_q4_0(tensor)
    elif ggml_type == GGMLType.Q4_1:
        tensor = quantize_q4_1(tensor)
    elif ggml_type == GGMLType.Q5_0:
        tensor = quantize_q5_0(tensor)
    elif ggml_type == GGMLType.Q5_1:
        tensor = quantize_q5_1(tensor)
    else:
        raise NotImplementedError(f"Cannot dump tensor of dtype {tensor.dtype}")

    # align address
    aligned_pos = (f.tell() + (GGML_MEM_ALIGN - 1)) // GGML_MEM_ALIGN * GGML_MEM_ALIGN
    f.seek(aligned_pos)
    tensor.numpy().tofile(f)
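
# A minimal reader sketch (my own; the function name is hypothetical) mirroring
# the record layout dump_tensor writes: name length, name bytes, ndim, shape
# dims, ggml dtype, then the raw data at the next GGML_MEM_ALIGN boundary.
def read_tensor_header(f):
    (name_len,) = struct.unpack("i", f.read(4))
    name = f.read(name_len).decode()
    (ndim,) = struct.unpack("i", f.read(4))
    shape = struct.unpack("i" * ndim, f.read(4 * ndim))
    (ggml_dtype,) = struct.unpack("i", f.read(4))
    f.seek((f.tell() + GGML_MEM_ALIGN - 1) // GGML_MEM_ALIGN * GGML_MEM_ALIGN)
    return name, shape, GGMLType(ggml_dtype)
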
'''Example model state_dict (heavily truncated; all tensors are torch.float16;
shapes follow the model printout below):
OrderedDict([
    ('transformer.wte.weight',             <tensor of shape [151936, 4096]>),
    ('transformer.h.0.ln_1.weight',        <tensor of shape [4096]>),
    ('transformer.h.0.attn.c_attn.weight', <tensor of shape [12288, 4096]>),
    ('transformer.h.0.attn.c_attn.bias',   <tensor of shape [12288]>),
    ('transformer.h.0.attn.c_proj.weight', <tensor of shape [4096, 4096]>),
    ('transformer.h.0.ln_2.weight',        <tensor of shape [4096]>),
    ('transformer.h.0.mlp.w1.weight',      <tensor of shape [11008, 4096]>),
    ('transformer.h.0.mlp.w2.weight',      <tensor of shape [11008, 4096]>),
    ('transformer.h.0.mlp.c_proj.weight',  <tensor of shape [4096, 11008]>),
    ('transformer.h.1.ln_1.weight',        <tensor of shape [4096]>),
    ...
])'''

def dump_state_dict(f, weight_names, state_dict, ggml_type):
    tensor_info = []
    for name in tqdm(weight_names, desc="Processing model states"):
        tensor = state_dict[name]
        if tensor.ndim == 2:
            # 2d weight: should quantize it if needed

            # step 1: de-quantize it back to float32
            tensor = tensor.float()

            # step 2: quantize it into ggml format
            tensor_ggml_type = ggml_type
        else:
            # 1d weight: convert it to float32
            assert tensor.ndim == 1
            tensor = tensor.float()
            tensor_ggml_type = GGMLType.F32

        dump_tensor(f, name, tensor, tensor_ggml_type)
        # [('transformer.wte.weight', torch.Size([151936, 4096]), 'Q4_0')]
        tensor_info.append((name, tensor.shape, tensor_ggml_type.name))

    print(tabulate(tensor_info, headers=["name", "shape", "dtype"], tablefmt="psql"))

'''
/root/.cache/huggingface/modules/transformers_modules/Qwen-7B-Chat/configuration_qwen.py
configuration_qwen.py calls /root/.cache/huggingface/modules/transformers_modules/Qwen-7B-Chat/modeling_qwen.py,
stopping at QWenLMHeadModel; config: /mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/config.json
The two important mapping lines:
self.transformer = QWenModel(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)


model:
QWenLMHeadModel(
  (transformer): QWenModel(
    (wte): Embedding(151936, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (rotary_emb): RotaryEmbedding()
    (h): ModuleList(
      (0-31): 32 x QWenBlock(
        (ln_1): RMSNorm()
        (attn): QWenAttention(
          (c_attn): Linear(in_features=4096, out_features=12288, bias=True)
          (c_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): RMSNorm()
        (mlp): QWenMLP(
          (w1): Linear(in_features=4096, out_features=11008, bias=False)
          (w2): Linear(in_features=4096, out_features=11008, bias=False)
          (c_proj): Linear(in_features=11008, out_features=4096, bias=False)
        )
      )
    )
    (ln_f): RMSNorm()
  )
  (lm_head): Linear(in_features=4096, out_features=151936, bias=False)
)
tokenizer: the tokenizer configuration
/root/.cache/huggingface/modules/transformers_modules/Qwen-7B-Chat/tokenization_qwen.py
vocab_file:'/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat/qwen.tiktoken'


QWenTokenizer(name_or_path='/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat',
 vocab_size=151851, model_max_length=8192, is_fast=False, padding_side='right', 
 truncation_side='right', special_tokens={}, clean_up_tokenization_spaces=True),
   added_tokens_decoder={
	
}

'''
class QwenConverter:
    @classmethod
    def convert(cls, f, model, tokenizer, ggml_type):
        f.write(b"ggml")  # magic
        cls.dump_config(f, model.config, model.generation_config, tokenizer, ggml_type)
        cls.dump_model(f, model, ggml_type)

    @staticmethod
    def dump_config(f, config, generation_config, tokenizer, ggml_type):
        config_values = [
            ggml_type.value,
            config.vocab_size,
            config.hidden_size,
            config.num_attention_heads,
            config.num_attention_heads,  # written twice; the second value serves as the KV-head count
            config.num_hidden_layers,
            config.intermediate_size,
            config.seq_length,
            generation_config.eos_token_id,
            generation_config.pad_token_id,
            tokenizer.im_start_id,
            tokenizer.im_end_id,
        ]  # dump_config writes the model configuration fields to the file in binary form
        f.write(struct.pack("i" * len(config_values), *config_values))
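        # Resulting layout so far (my sketch): the 4-byte b"ggml" magic written
        # in convert(), then these 12 config values as native int32 (48 bytes).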
    # The static method dump_model writes the model weights to the file in binary form, in a fixed name order (transformer.h.{0-31}.*)
    @staticmethod
    def dump_model(f, model, ggml_type):
        weight_names = ["transformer.wte.weight"]
        for i in range(model.config.num_hidden_layers):
            weight_names += [
                f"transformer.h.{i}.ln_1.weight",
                f"transformer.h.{i}.attn.c_attn.weight",
                f"transformer.h.{i}.attn.c_attn.bias",
                f"transformer.h.{i}.attn.c_proj.weight",
                f"transformer.h.{i}.ln_2.weight",
                f"transformer.h.{i}.mlp.w1.weight",
                f"transformer.h.{i}.mlp.w2.weight",
                f"transformer.h.{i}.mlp.c_proj.weight",
            ]
        weight_names += [
            "transformer.ln_f.weight",
            "lm_head.weight",
        ]
        dump_state_dict(f, weight_names, model.state_dict(), ggml_type)


def convert(f: BinaryIO, model_name_or_path: str, dtype: str = "q4_0"):
    # uppercase the type string: "q4_0" -> "Q4_0", giving e.g. <GGMLType.Q4_0: 2>
    ggml_type = GGMLType[dtype.upper()]  
    '''AutoTokenizer is the generic tokenizer class.
       A tokenizer is a core NLP component: it splits raw text into words,
       subwords, or other basic units so that a computer can process it.
       Text arrives as a continuous character stream, but models operate on
       lexical and semantic units, so the tokenizer segments the stream into
       meaningful pieces for downstream steps such as frequency counting,
       POS tagging, named-entity recognition, or parsing.
       from_pretrained downloads (or loads) the pretrained tokenizer
       (AutoTokenizer) and the causal language model (AutoModelForCausalLM)
       by name or path. Building the model with its many config parameters
       is not as simple as in llama2.c.
    '''
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True)

    QwenConverter.convert(f, model, tokenizer, ggml_type)


def main():
    parser = argparse.ArgumentParser("qwen-convert")
    # upstream default was: --model_name_or_path "Qwen/Qwen-7B-Chat"
    parser.add_argument(
        "-i",
        "--model_name_or_path",
        default="/mnt/workspace/qwen.cpp/Qianwen/qwen/Qwen-7B-Chat",
        type=str,
        help="Model name or path used in AutoModel.from_pretrained",
    )
    parser.add_argument(
        "-o", "--save_path", default="qwen7b-ggml.bin", type=Path, help="Path to save the generated GGML model"
    )
    parser.add_argument(
        "-t",
        "--type",
        default="q4_0",
        type=str,
        choices=["f32", "f16", "q8_0", "q4_0", "q4_1", "q5_0", "q5_1"],
        help="GGML model quantization type",
    )
    args = parser.parse_args()
    # f is the output file, qwen7b-ggml.bin by default
    with open(args.save_path, "wb") as f:
        convert(f, args.model_name_or_path, dtype=args.type)

    print(f"GGML model saved to {args.save_path}")


if __name__ == "__main__":
    main()
'''
Summary: load the model and config, quantizing on the way out: 2-D weights are
quantized to int4 (two nibbles packed into one int8); 1-D weights are written as fp32.
'''
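
As a quick sanity check, a minimal sketch (mine; the field names just mirror the order of config_values in dump_config above) that reads the magic and the 12 config fields back from the generated file:

import struct

with open('qwen7b-ggml.bin', 'rb') as f:
    assert f.read(4) == b'ggml'                   # magic written by convert()
    fields = struct.unpack('i' * 12, f.read(48))  # the config_values, in order
print(dict(zip(['ggml_type', 'vocab_size', 'hidden_size', 'num_attention_heads',
                'num_kv_heads', 'num_hidden_layers', 'intermediate_size',
                'seq_length', 'eos_token_id', 'pad_token_id',
                'im_start_id', 'im_end_id'], fields)))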

3. export.py issues when running Llama 2 with llama2.c

export1.py
https://github.com/karpathy/llama2.c/blob/de005474d37d0cde1356739b8c79ebe7b42b5973/export_meta_llama_bin.py

# download the model weights from ModelScope
from modelscope import snapshot_download
model_dir = snapshot_download('shakechen/Llama-2-7b',cache_dir='/mnt/workspace/llama2.c/llama2pth')

import json
import os
import struct
from pathlib import Path

import torch

def concat_weights(models):
    # merge sharded checkpoints: tensors split across model-parallel shards are
    # concatenated back together along the appropriate axis
    state_dict = {}
    for name in list(models[0]):
        tensors = [model[name] for model in models]
        if len(tensors) == 1 or len(tensors[0].shape) == 1:
            # single shard, or a 1-D tensor replicated across shards
            state_dict[name] = tensors[0]
            continue
        is_axis_1 = (
            name.startswith('tok_embeddings.')
            or name.endswith('.attention.wo.weight')
            or name.endswith('.feed_forward.w2.weight')
        )
        axis = 1 if is_axis_1 else 0
        state_dict[name] = torch.cat(tensors, dim=axis)
        for model in models:
            del model[name]  # free memory as we go
    return state_dict
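
# Toy illustration (my own, not from the repo): with two hypothetical shards of
# a dim-0-split weight, concat_weights stacks the halves back along dim 0
# (wq is not in the axis-1 list above).
shard0 = {'layers.0.attention.wq.weight': torch.zeros(2048, 4096)}
shard1 = {'layers.0.attention.wq.weight': torch.zeros(2048, 4096)}
merged = concat_weights([shard0, shard1])
print(merged['layers.0.attention.wq.weight'].shape)  # torch.Size([4096, 4096])
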
    
def load_and_export(model_path, output_path):
    # read the config file; json.load(f) parses the JSON into the dict `params`
    params_path = os.path.join(model_path, 'params.json')
    with open(params_path) as f:
        params = json.load(f)
        print(params)
    # load each consolidated.*.pth shard onto the CPU and collect the loaded
    # state dicts in `models`; for Llama-2-7b there is only one shard, so
    # concat_weights has just a single model to merge
    model_paths = sorted(list(Path(model_path).glob('consolidated.*.pth')))
    models = [torch.load(p, map_location='cpu') for p in model_paths]
    state_dict = concat_weights(models)
    del models

Explanation:

models (heavily truncated; a single shard, all tensors torch.bfloat16):
[{'tok_embeddings.weight': tensor(...),
  'norm.weight': tensor(...),
  'output.weight': tensor(...),
  'layers.0.attention.wq.weight': tensor(...),
  'layers.0.attention.wk.weight': tensor(...),
  'layers.0.attention.wv.weight': tensor(...),
  'layers.0.attention.wo.weight': tensor(...),
  'layers.0.feed_forward.w1.weight': tensor(...),
  'layers.0.feed_forward.w2.weight': tensor(...),
  ...}]
def export(p, state_dict, filepath='model.bin'):
    """export the model weights in fp32 into .bin file to be read from C"""
    f = open(filepath, 'wb')

    def serialize(key):
        print(f"writing {key}...")
        t = state_dict[key].contiguous().view(-1).type(torch.float32).numpy()
        f.write(memoryview(t))
        del state_dict[key]

    # first write out the header
    # hidden_dim is the FFN inner dimension, read off the first layer's
    # feed_forward.w1 weight matrix (its output dimension, shape[0])
    hidden_dim = state_dict['layers.0.feed_forward.w1.weight'].shape[0]
    p['vocab_size'] = 32000
    p['max_seq_len'] = 2048
    # use p['n_kv_heads'] if present, otherwise fall back to p['n_heads']:
    # the number of key/value heads used by the attention
    n_kv_heads = p.get('n_kv_heads') or p['n_heads']
    # pack the header as seven consecutive 32-bit signed integers ('i' in the
    # struct module): model dim, FFN hidden dim, layer count, attention head
    # count, key/value head count, vocab size, and max sequence length
    header = struct.pack(
        'iiiiiii',
        p['dim'], hidden_dim, p['n_layers'], p['n_heads'],
        n_kv_heads, -p['vocab_size'], p['max_seq_len']
    )
    # NOTE ABOVE: -ve vocab_size is indicating that the classifier weights are present
    # in the checkpoint and should be loaded.
    f.write(header)

Explanation: the export function writes the model weights as float32 into a .bin file so that the C program (run.c) can read them.
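
A minimal read-back sketch (mine; it simply mirrors the seven-field header packed above):

import struct

with open('model.bin', 'rb') as f:
    dim, hidden_dim, n_layers, n_heads, n_kv_heads, vocab_size, max_seq_len = \
        struct.unpack('iiiiiii', f.read(28))
# per the NOTE above, a negative vocab_size signals that the classifier
# weights are present in the checkpoint
has_classifier = vocab_size < 0
vocab_size = abs(vocab_size)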


state_dict (heavily truncated; all tensors torch.bfloat16):
{'tok_embeddings.weight': tensor(...),
 'norm.weight': tensor(...),
 'output.weight': tensor(...),
 'layers.0.attention.wq.weight': tensor(...),
 'layers.0.attention.wk.weight': tensor(...),
 'layers.0.attention.wv.weight': tensor(...),
 'layers.0.attention.wo.weight': tensor(...),
 'layers.0.feed_forward.w1.weight': tensor(...),
 'layers.0.feed_forward.w2.weight': tensor(...),
 'layers.0.feed_forward.w3.weight': tensor(...),
 'layers.0.attention_norm.weight': tensor(...),
 'layers.0.ffn_norm.weight': tensor(...),
 'layers.1.attention.wq.weight': tensor(...),
 'layers.1.attention.wk.weight': tensor(...),
 ...}