llama2c (1): export

To make debugging easier and keep model = Transformer(config) from running out of memory, set "n_layers": 2 in config, then look at the overall Transformer architecture.
Note: config simply holds the hyperparameters of the Transformer.
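As a rough illustration of what config looks like, here is a minimal sketch (field names follow ModelArgs in llama2.c's model.py; apart from n_layers = 2, the values are assumptions matching a Llama-2-7B-style setup and the module printout further below):

from model import ModelArgs, Transformer   # llama2.c's model.py

config = ModelArgs(
    dim=4096,
    n_layers=2,        # 2 instead of 32, to keep memory usage small while debugging
    n_heads=32,
    vocab_size=32000,
    hidden_dim=11008,
    multiple_of=256,
    norm_eps=1e-5,
    max_seq_len=2048,
    dropout=0.0,
)
model = Transformer(config)
print(model)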
class Transformer(nn.Module):
    last_loss: Optional[torch.Tensor]

    def __init__(self, params: ModelArgs):
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        self.layers = torch.nn.ModuleList()
        # stack params.n_layers TransformerBlocks into self.layers
        for layer_id in range(params.n_layers):
            self.layers.append(TransformerBlock(layer_id, params))
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # share the unembedding parameters with the embedding parameters
        self.tok_embeddings.weight = self.output.weight # https://paperswithcode.com/method/weight-tying

        # some useful precompute for the RoPE relative positional embeddings
        freqs_cos, freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)
        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('w3.weight') or pn.endswith('wo.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * params.n_layers))

        # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets 
        # tensor.
        self.last_loss = None

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
        _bsz, seqlen = tokens.shape
        h = self.tok_embeddings(tokens)
        h = self.dropout(h)
        freqs_cos = self.freqs_cos[:seqlen]
        freqs_sin = self.freqs_sin[:seqlen]

        for layer in self.layers:
            h = layer(h, freqs_cos, freqs_sin)
        # RMSNorm
        h = self.norm(h)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            # Linear
            logits = self.output(h)
            self.last_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the output on the very last position
            logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim
            self.last_loss = None

        return logits
        
class TransformerBlock(nn.Module):
    def __init__(self, layer_id: int, args: ModelArgs):
        super().__init__()
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.layer_id = layer_id
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        out = h + self.feed_forward.forward(self.ffn_norm(h))
        return out
Transformer(
  (tok_embeddings): Embedding(32000, 4096)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=4096, out_features=4096, bias=False)
        (wk): Linear(in_features=4096, out_features=4096, bias=False)
        (wv): Linear(in_features=4096, out_features=4096, bias=False)
        (wo): Linear(in_features=4096, out_features=4096, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=4096, out_features=11008, bias=False)
        (w2): Linear(in_features=11008, out_features=4096, bias=False)
        (w3): Linear(in_features=4096, out_features=11008, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=4096, out_features=32000, bias=False)
)

model = load_meta_model(args.meta_llama) reads params.json (the model hyperparameters stored as JSON), so that model = Transformer(config) can build a fresh Transformer from those parameters; it then loads the checkpoint weights and maps them into a state_dict dictionary.

With the new Transformer model and its weights in hand, the weights can be assigned to the model, for example:

model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])

This returns a Transformer that carries both params and the loaded weights.
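Beyond tok_embeddings, load_meta_model assigns the remaining weights in the same way. A minimal sketch (the state_dict key names follow Meta's checkpoint layout and are assumptions here, not a verbatim copy of export.py):

model.norm.weight = nn.Parameter(state_dict['norm.weight'])
model.output.weight = nn.Parameter(state_dict['output.weight'])
for layer in model.layers:
    i = layer.layer_id
    layer.attention_norm.weight = nn.Parameter(state_dict[f'layers.{i}.attention_norm.weight'])
    layer.ffn_norm.weight = nn.Parameter(state_dict[f'layers.{i}.ffn_norm.weight'])
    layer.attention.wq.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wq.weight'])
    layer.attention.wk.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wk.weight'])
    layer.attention.wv.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wv.weight'])
    layer.attention.wo.weight = nn.Parameter(state_dict[f'layers.{i}.attention.wo.weight'])
    layer.feed_forward.w1.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w1.weight'])
    layer.feed_forward.w2.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w2.weight'])
    layer.feed_forward.w3.weight = nn.Parameter(state_dict[f'layers.{i}.feed_forward.w3.weight'])
model.eval()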

1. Quantization in export

def version2_export(model, filepath, group_size=64):
model.layers is:
ModuleList(
  (0-1): 2 x TransformerBlock(
    (attention): Attention(
      (wq): Linear(in_features=4096, out_features=4096, bias=False)
      (wk): Linear(in_features=4096, out_features=4096, bias=False)
      (wv): Linear(in_features=4096, out_features=4096, bias=False)
      (wo): Linear(in_features=4096, out_features=4096, bias=False)
      (attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_dropout): Dropout(p=0.0, inplace=False)
    )
    (feed_forward): FeedForward(
      (w1): Linear(in_features=4096, out_features=11008, bias=False)
      (w2): Linear(in_features=11008, out_features=4096, bias=False)
      (w3): Linear(in_features=4096, out_features=11008, bias=False)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (attention_norm): RMSNorm()
    (ffn_norm): RMSNorm()
  )
)

S1: Split the model parameters into groups; quantization is done per group, which limits the influence of outlier values.
S2: Collect the model's weight tensors into a weights list (each entry is a tensor), e.g. model.tok_embeddings.weight and [layer.attention.wq.weight for layer in model.layers]. With 2 TransformerBlocks, weights holds 15 tensors, since 7 × 2 + 1 = 15 (each block contributes 7 weight tensors); see the sketch after this list.
S3: Check that every weight w in weights has a number of elements divisible by group_size, so it splits into whole groups. Why group-wise quantization helps: quantizing continuous floating-point weights with a single global threshold can produce large errors, especially when the values are unevenly distributed. Splitting the weights into small groups and computing the quantization parameters (such as the step size or scale factor) independently per group reduces the global error caused by a few extreme values.
S4: Write the header information to the .bin file: magic, version, and params (7 int fields). A file header is a block of data at the start of a file that describes the file's content, format, version, structure, and how to interpret the data that follows; different file types use different header formats.
S5: Write all norm-layer weights as fp32 to the .bin file (via numpy).
S6: Per group, find Wmax, set s = Wmax / 127.0, compute q = w / s, and round(q) to get int8. Dequantize back to fp32 (fp32valr) and compute the maximum error err against w; ideally it is on the order of ~0.001.
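A sketch of S2 and S3 (the exact weight list in export.py may differ slightly; the names follow the model printed above):

weights = [
    model.tok_embeddings.weight,
    *[layer.attention.wq.weight for layer in model.layers],
    *[layer.attention.wk.weight for layer in model.layers],
    *[layer.attention.wv.weight for layer in model.layers],
    *[layer.attention.wo.weight for layer in model.layers],
    *[layer.feed_forward.w1.weight for layer in model.layers],
    *[layer.feed_forward.w2.weight for layer in model.layers],
    *[layer.feed_forward.w3.weight for layer in model.layers],
]
# with n_layers = 2: 7 * 2 + 1 = 15 tensors
for w in weights:
    assert w.numel() % group_size == 0, f"weight {tuple(w.shape)} is not a multiple of group_size {group_size}"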

for i, w in enumerate(weights):  # i is the 0-based index, w is the weight tensor at that index
    # quantize this weight
    q, s, err = quantize_q80(w, group_size)
    # save the int8 weights to file
    serialize_int8(out_file, q)  # save the tensor in int8
    serialize_fp32(out_file, s)  # save scale factors
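serialize_int8 and serialize_fp32 are not shown in the post; a minimal sketch along the lines of llama2.c's export.py (treat the exact details as assumptions):

import struct
import numpy as np

def serialize_fp32(file, tensor):
    # flatten to float32 and write the raw bytes
    d = tensor.detach().cpu().view(-1).to(torch.float32).numpy()
    file.write(struct.pack(f'{len(d)}f', *d))

def serialize_int8(file, tensor):
    # flatten to int8 and write the raw bytes
    d = tensor.detach().cpu().view(-1).numpy().astype(np.int8)
    file.write(struct.pack(f'{len(d)}b', *d))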
def quantize_q80(w, group_size):
    """
    takes a tensor and returns the Q8_0 quantized version
    i.e. symmetric quantization into int8, range [-127,127]
    """
    assert w.numel() % group_size == 0
    ori_shape = w.shape
    w = w.float() # convert to float32
    # split w into groups of group_size elements; for the embedding this gives torch.Size([2048000, 64])
    w = w.reshape(-1, group_size)
    # find the max in each group
    wmax = torch.abs(w).max(dim=1).values
    # calculate the scaling factor such that float = quant * scale
    scale = wmax / 127.0
    # scale into range [-127, 127]
    quant = w / scale[:,None]
    # round to nearest integer
    int8val = torch.round(quant).to(torch.int8)
    # dequantize by rescaling
    fp32val = (int8val.float() * scale[:,None]).view(-1)
    fp32valr = fp32val.reshape(-1, group_size)
    # calculate the max error in each group
    err = torch.abs(fp32valr - w).max(dim=1).values
    # find the max error across all groups
    maxerr = err.max().item()
    return int8val, scale, maxerr  
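A quick usage check of quantize_q80 (the shapes and values here are illustrative; for the actual Llama weights the post reports errors on the order of ~0.001):

w = torch.randn(512, 512)            # 262144 elements, divisible by 64
q, s, err = quantize_q80(w, group_size=64)
print(q.shape, q.dtype)              # torch.Size([4096, 64]) torch.int8
print(s.shape)                       # torch.Size([4096]), one scale per group
print(f"max dequantization error: {err:.6f}")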

Summary:

The overall .bin file layout is a 256-byte header followed by the weights. As the comments in export.py put it:

first write out the header. the header will be 256 bytes

now that the header is done, let's write out the model
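A sketch of how that 256-byte header is laid out (field order and the magic value follow llama2.c's export.py, but verify against the source; filepath and group_size are the arguments of version2_export shown above):

import struct

out_file = open(filepath, 'wb')
out_file.write(struct.pack('I', 0x616b3432))     # magic ("ak42")
out_file.write(struct.pack('i', 2))              # version
p = model.params
hidden_dim = model.layers[0].feed_forward.w1.weight.shape[0]
n_kv_heads = p.n_heads if p.n_kv_heads is None else p.n_kv_heads
out_file.write(struct.pack('iiiiiii', p.dim, hidden_dim, p.n_layers, p.n_heads,
                           n_kv_heads, p.vocab_size, p.max_seq_len))   # the 7 int params
shared_classifier = torch.equal(model.tok_embeddings.weight, model.output.weight)
out_file.write(struct.pack('B', int(shared_classifier)))
out_file.write(struct.pack('i', group_size))
out_file.write(b'\0' * (256 - out_file.tell()))  # pad the header to exactly 256 bytes
# ... the quantized weights follow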
