llama2c（1）之export

tensor.shape

已于 2024-03-09 20:53:34 修改

阅读量440

点赞数 4

分类专栏： llama C/C++ 文章标签： c语言深度学习语言模型 llama

于 2024-03-08 13:02:14 首次发布

本文链接：https://blog.csdn.net/qq_44576434/article/details/136555877

版权

C/C++ 同时被 2 个专栏收录

19 篇文章 1 订阅

订阅专栏

llama

12 篇文章 0 订阅

订阅专栏

为了方便调试，避免构建 model = Transformer(config) 时内存溢出，将 config 中的 "n_layers" 设为 2，先整体看一下 Transformer 的架构。
注：config 用于设置 Transformer 中的各项参数。

class Transformer(nn.Module):
    """Decoder-style Transformer language model.

    Pipeline: token embedding -> dropout -> stack of TransformerBlock layers
    -> RMSNorm -> linear output head. The output head and the token embedding
    share a single weight tensor.
    """

    # Loss computed by the most recent forward call; None when no targets
    # were supplied.
    last_loss: Optional[torch.Tensor]

    def __init__(self, params: ModelArgs):
        """Create every sub-module described by ``params`` and initialise weights."""
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        # Build exactly params.n_layers TransformerBlocks, in layer order.
        self.layers = torch.nn.ModuleList(
            [TransformerBlock(idx, params) for idx in range(params.n_layers)]
        )
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # Weight tying: the embedding reuses the output head's parameter.
        # https://paperswithcode.com/method/weight-tying
        self.tok_embeddings.weight = self.output.weight

        # Precompute the RoPE cos/sin tables once; they are registered as
        # non-persistent buffers.
        head_dim = self.params.dim // self.params.n_heads
        cos_table, sin_table = precompute_freqs_cis(head_dim, self.params.max_seq_len)
        self.register_buffer("freqs_cos", cos_table, persistent=False)
        self.register_buffer("freqs_sin", sin_table, persistent=False)

        # Default initialisation for every sub-module ...
        self.apply(self._init_weights)
        # ... followed by the scaled init (per the GPT-2 paper) for the
        # parameters whose names end in w3.weight / wo.weight.
        # NOTE(review): confirm 'w3' (rather than 'w2') is the intended
        # feed-forward projection here; FeedForward is defined elsewhere.
        for name, param in self.named_parameters():
            if name.endswith(('w3.weight', 'wo.weight')):
                torch.nn.init.normal_(param, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

        # No forward call has happened yet, so there is no loss to report.
        self.last_loss = None

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero Linear biases."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the model on ``tokens`` (a 2-D tensor: batch x sequence).

        With ``targets`` the logits for every position are returned and the
        cross-entropy loss (ignoring target index -1) is stored in
        ``self.last_loss``. Without ``targets`` only the last position's
        logits are computed and ``self.last_loss`` is reset to None.
        """
        _, seq_len = tokens.shape
        hidden = self.dropout(self.tok_embeddings(tokens))
        # Only the first seq_len rows of the precomputed tables are needed.
        cos = self.freqs_cos[:seq_len]
        sin = self.freqs_sin[:seq_len]

        for block in self.layers:
            hidden = block(hidden, cos, sin)
        hidden = self.norm(hidden)

        if targets is None:
            # Inference shortcut: project only the final position. Indexing
            # with the list [-1] keeps the time dimension.
            logits = self.output(hidden[:, [-1], :])
            self.last_loss = None
            return logits

        # Training path: full logits plus the loss against the targets.
        logits = self.output(hidden)
        self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits
        
class TransformerBlock(nn.Module):
    """One Transformer layer: normalised attention and normalised feed-forward,
    each added back onto its input as a residual.
    """

    def __init__(self, layer_id: int, args: ModelArgs):
        """Create the attention, feed-forward and the two RMSNorm sub-modules."""
        super().__init__()
        self.layer_id = layer_id
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        # Sub-modules are created in the same order as before so parameter
        # registration order stays the same.
        self.attention = Attention(args)
        ffn_config = {
            "dim": args.dim,
            "hidden_dim": args.hidden_dim,
            "multiple_of": args.multiple_of,
            "dropout": args.dropout,
        }
        self.feed_forward = FeedForward(**ffn_config)
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Attention sub-layer: normalise first, then add the result onto x.
        normed_x = self.attention_norm(x)
        attn_out = self.attention.forward(normed_x, freqs_cos, freqs_sin)
        h = x + attn_out
        # Feed-forward sub-layer, same normalise-then-add pattern.
        ffn_out = self.feed_forward.forward(self.ffn_norm(h))
        return h + ffn_out

Transformer(
  (tok_embeddings): Embedding(32000, 4096)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=4096, out_features=4096, bias=False)
        (wk): Linear(in_features=4096, out_features=4096, bias=False)
        (wv): Linear(in_features=4096, out_features=4096, bias=False)
        (wo): Linear(in_features=4096, out_features=4096, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=4096, out_features=11008, bias=False)
        (w2): Linear(in_features=11008, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=11008, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=32000, bias=False)
)

model = load_meta_model(args.meta_llama) 的过程：首先打开 params（以 json 格式存储），目的是在执行 model =
Transformer(config) 时，用这些模型参数构建一个新的 Transformer；然后加载模型权重，并映射为 state_dict 字典格式。

此时，有了新的Transformer模型model，有了权重，就可以给model赋予权重了。如代码：

# Replace the model's token-embedding weight with the tensor stored under
# 'tok_embeddings.weight' in state_dict, wrapped as an nn.Parameter.
model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])

此时返回一个具有params且具有权重的Transformer。

1、export中的量化

def version2_export(model, filepath, group_size=64):
model.layers为
“ModuleList(
  (0-1): 2 x TransformerBlock(
    (attention): Attention(
      (wq): Linear(in_features=4096, out_features=4096, bias=False)
      (wk): Linear(in_features=4096, out_features=4096, bias=False)
      (wv): Linear(in_features=4096, out_features=4096, bias=False)
      (wo): Linear(in_features=4096, out_features=4096, bias=False)
      (attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_dropout): Dropout(p=0.0, inplace=False)
    )
    (feed_forward): FeedForward(
      (w1): Linear(in_features=4096, out_features=11008, bias=False)
      (w2): Linear(in_features=11008, out_features=4096, bias=False)
      (w3): Linear(in_features=4096, out_features=11008, bias=False)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (attention_norm): RMSNorm()
    (ffn_norm): RMSNorm()
  )
)”

S1：对模型参数进行分组，分组目的是量化以分组大小为单位进行，减少异常值的影响
S2：收集模型中的权重参数，存储在weights的列表中。其中，模型参数是个tensor
如model.tok_embeddings.weight
[layer.attention.wq.weight for layer in model.layers] 等。当 TransformerBlock 数量为 2 时，这里 weights 的长度为 15，因为 7×2+1=15（一个 Block 有 7 个权重参数，另加 1 个 tok_embeddings.weight）
S3：判断 weights 中每个权重 w 的元素个数（w.numel()）能否被 group_size 整除，方便分组量化。分组量化优势：
降低量化误差。在对连续的浮点数权重直接进行量化时，可能会因为单一阈值导致较大的量化误差，尤其是对于那些数值分布不均匀的权重。
分组量化通过将权重分为多个小组，并为每个小组独立计算量化参数（如量化步长或缩放因子），可以减少因个别极端值引起的全局量化误差
S4：将文件头信息写入 bin 文件，包括 magic、version、params（7 个 int 参数）。
文件头信息（File Header）是指存储在文件起始部分的一段特定数据，它包含了关于文件内容、格式、版本、结构以及如何解释文件中数据的必要信息。不同类型的文件有不同的文件头信息格式。
S5：norm 层的参数全部保持 fp32，通过 numpy 形式写入 bin 文件。
S6：分组量化：找到每组绝对值的最大值 Wmax，缩放因子 s = Wmax / 127.0，q = w / s，round(q) 得到 int8；
再反量化回去（int8 × s）得到 fp32valr，计算它与 w 的最大误差 err，err 在 O(~0.001) 量级最好。

# Quantize every tensor in `weights` and hand the int8 values, then the
# per-group scale factors, to the serializers together with out_file.
# `err` (the max round-trip error) is not used within this excerpt.
for i, w in enumerate(weights): # i is the zero-based index; w is the weight at that index
        # quantize this weight
        q, s, err = quantize_q80(w, group_size)
        # save the int8 weights to file
        serialize_int8(out_file, q) # save the tensor in int8
        serialize_fp32(out_file, s) # save scale factors
def quantize_q80(w, group_size):
    """
    Quantize a tensor to Q8_0: symmetric per-group int8 in the range [-127, 127].

    The tensor is converted to float32 and reshaped into rows of
    ``group_size`` elements. Each row gets its own scale, max(|row|) / 127,
    chosen so that float ~= int8 * scale.

    Args:
        w: tensor whose element count is a multiple of ``group_size``.
        group_size: number of consecutive elements that share one scale.

    Returns:
        A tuple ``(int8val, scale, maxerr)``:
        - int8val: int8 tensor of shape (numel // group_size, group_size)
        - scale: float32 tensor of shape (numel // group_size,); 0 for a
          group whose values are all zero
        - maxerr: largest absolute difference between the dequantized and
          original values, as a Python float

    Raises:
        AssertionError: if ``w.numel()`` is not divisible by ``group_size``.
    """
    assert w.numel() % group_size == 0, (
        f"numel {w.numel()} is not divisible by group_size {group_size}"
    )
    # float32 view of the data, one quantization group per row
    groups = w.float().reshape(-1, group_size)
    # largest magnitude in each group
    wmax = torch.abs(groups).max(dim=1).values
    # scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # An all-zero group has scale 0; dividing by it would give 0/0 = NaN and
    # the NaN -> int8 cast is undefined. Divide such groups by 1 instead
    # (every value is 0 anyway); the returned scale is left at 0.
    divisor = torch.where(scale == 0, torch.ones_like(scale), scale)
    # scale into range [-127, 127] and round to the nearest integer
    int8val = torch.round(groups / divisor[:, None]).to(torch.int8)
    # dequantize by rescaling, to measure the round-trip error
    fp32valr = int8val.float() * scale[:, None]
    # max error within each group, then across all groups
    err = torch.abs(fp32valr - groups).max(dim=1).values
    maxerr = err.max().item()
    return int8val, scale, maxerr