为了方便调试,避免执行 model = Transformer(config) 构建模型时内存溢出,将config中的"n_layers"设为2,先整体看一下Transformer的架构。
注:config就是设置Transformer中的参数。
class Transformer(nn.Module):
    """Stack of TransformerBlocks between a token embedding and a tied output head.

    The embedding and the output projection share one weight tensor, and the
    RoPE cos/sin tables are precomputed once and kept as non-persistent buffers.
    ``last_loss`` holds the cross-entropy loss of the most recent ``forward``
    call that received ``targets``; it is ``None`` otherwise.
    """

    last_loss: Optional[torch.Tensor]

    def __init__(self, params: ModelArgs):
        """Build all sub-modules from ``params`` and initialize their weights."""
        super().__init__()
        self.params = params
        self.vocab_size = params.vocab_size
        self.n_layers = params.n_layers

        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
        self.dropout = nn.Dropout(params.dropout)
        # Populate self.layers with the configured number of TransformerBlocks.
        self.layers = torch.nn.ModuleList(
            [TransformerBlock(idx, params) for idx in range(params.n_layers)]
        )
        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)

        # Weight tying: the embedding reuses the output projection's parameter.
        # https://paperswithcode.com/method/weight-tying
        self.tok_embeddings.weight = self.output.weight

        # Precompute the RoPE tables for one attention head over max_seq_len positions.
        head_dim = self.params.dim // self.params.n_heads
        rope_cos, rope_sin = precompute_freqs_cis(head_dim, self.params.max_seq_len)
        self.register_buffer("freqs_cos", rope_cos, persistent=False)
        self.register_buffer("freqs_sin", rope_sin, persistent=False)

        # Default init for every Linear / Embedding in the tree.
        self.apply(self._init_weights)
        # Special scaled init for the residual projections, per GPT-2 paper.
        residual_std = 0.02 / math.sqrt(2 * params.n_layers)
        for param_name, param in self.named_parameters():
            if param_name.endswith(('w3.weight', 'wo.weight')):
                torch.nn.init.normal_(param, mean=0.0, std=residual_std)

        # Loss of the last forward call; only set when forward receives targets.
        self.last_loss = None

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            torch.nn.init.zeros_(module.bias)

    def forward(self, tokens: torch.Tensor, targets: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the model on ``tokens`` and return logits.

        With ``targets`` the logits cover every position and ``last_loss`` is
        set to the cross-entropy against ``targets`` (ignore_index=-1).
        Without ``targets`` only the last position is projected and
        ``last_loss`` is reset to ``None``.
        """
        _, seq_len = tokens.shape
        h = self.dropout(self.tok_embeddings(tokens))

        # Only the first seq_len rows of the RoPE tables are needed.
        cos = self.freqs_cos[:seq_len]
        sin = self.freqs_sin[:seq_len]
        for block in self.layers:
            h = block(h, cos, sin)
        # Final RMSNorm before the output projection.
        h = self.norm(h)

        if targets is None:
            # Inference-time mini-optimization: project only the very last position.
            # Indexing with the list [-1] preserves the time dimension.
            logits = self.output(h[:, [-1], :])
            self.last_loss = None
            return logits

        # Targets were supplied: project every position and compute the loss.
        logits = self.output(h)
        flat_logits = logits.view(-1, logits.size(-1))
        self.last_loss = F.cross_entropy(flat_logits, targets.view(-1), ignore_index=-1)
        return logits
class TransformerBlock(nn.Module):
    """One pre-norm layer: attention and feed-forward, each wrapped in a residual add."""

    def __init__(self, layer_id: int, args: ModelArgs):
        """Create the attention, feed-forward and the two RMSNorm sub-modules."""
        super().__init__()
        self.layer_id = layer_id
        self.n_heads = args.n_heads
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        # Sub-modules are created in the same order as before so that
        # registration order (and therefore state_dict order) is unchanged.
        self.attention = Attention(args)
        self.feed_forward = FeedForward(
            dim=args.dim,
            hidden_dim=args.hidden_dim,
            multiple_of=args.multiple_of,
            dropout=args.dropout,
        )
        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

    def forward(self, x, freqs_cos, freqs_sin):
        """Return ``x`` after the attention and feed-forward residual branches."""
        # Attention branch: normalize, attend, add back to the input.
        attn_out = self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
        h = x + attn_out
        # Feed-forward branch: normalize, transform, add back.
        ffn_out = self.feed_forward.forward(self.ffn_norm(h))
        return h + ffn_out
Transformer(
(tok_embeddings): Embedding(32000, 4096)
(dropout): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0-1): 2 x TransformerBlock(
(attention): Attention(
(wq): Linear(in_features=4096, out_features=4096, bias=False)
(wk): Linear(in_features=4096, out_features=4096, bias=False)
(wv): Linear(in_features=4096, out_features=4096, bias=False)
(wo): Linear(in_features=4096, out_features=4096, bias=False)
(attn_dropout): Dropout(p=0.0, inplace=False)
(resid_dropout): Dropout(p=0.0, inplace=False)
)
(feed_forward): FeedForward(
(w1): Linear(in_features=4096, out_features=11008, bias=False)
(w2): Linear(in_features=11008, out_features=4096, bias=False)
(w3): Linear(in_features=4096, out_features=11008, bias=False)
(dropout): Dropout(p=0.0, inplace=False)
)
(attention_norm): RMSNorm()
(ffn_norm): RMSNorm()
)
)
(norm): RMSNorm()
(output): Linear(in_features=4096, out_features=32000, bias=False)
)
model = load_meta_model(args.meta_llama):先打开params(以json格式存储),目的是在执行 model =
Transformer(config) 时,用这些模型参数构建一个新的Transformer;然后加载模型权重,并映射为state_dict字典格式。此时,有了新的Transformer模型model,也有了权重,就可以给model赋予权重了。如代码:
model.tok_embeddings.weight = nn.Parameter(state_dict['tok_embeddings.weight'])
此时返回一个具有params且具有权重的Transformer。
1、export中的量化
def version2_export(model, filepath, group_size=64):
model.layers为
“ModuleList(
(0-1): 2 x TransformerBlock(
(attention): Attention(
(wq): Linear(in_features=4096, out_features=4096, bias=False)
(wk): Linear(in_features=4096, out_features=4096, bias=False)
(wv): Linear(in_features=4096, out_features=4096, bias=False)
(wo): Linear(in_features=4096, out_features=4096, bias=False)
(attn_dropout): Dropout(p=0.0, inplace=False)
(resid_dropout): Dropout(p=0.0, inplace=False)
)
(feed_forward): FeedForward(
(w1): Linear(in_features=4096, out_features=11008, bias=False)
(w2): Linear(in_features=11008, out_features=4096, bias=False)
(w3): Linear(in_features=4096, out_features=11008, bias=False)
(dropout): Dropout(p=0.0, inplace=False)
)
(attention_norm): RMSNorm()
(ffn_norm): RMSNorm()
)
)”
S1:对模型参数进行分组,分组目的是量化以分组大小为单位进行,减少异常值的影响
S2:收集模型中的权重参数,存储在weights的列表中。其中,模型参数是个tensor
如model.tok_embeddings.weight
[layer.attention.wq.weight for layer in model.layers], 当TransformerBlock为2时,这儿weights的长度为15,因为 7×2+1=15(一个Block有7个权重参数,再加上1个tok_embeddings.weight)
S3:判断weights中的每个权重w是否能被group_size整除,方便分组量化。 分组量化优势:
降低量化误差。在对连续的浮点数权重直接进行量化时,可能会因为单一阈值导致较大的量化误差,尤其是对于那些数值分布不均匀的权重。
分组量化通过将权重分为多个小组,并为每个小组独立计算量化参数(如量化步长或缩放因子),可以减少因个别极端值引起的全局量化误差
S4:将文件头的信息写入bin文件。包括magic、version、params(7个int参数) 头信息:文件头信息(File
Header)是指存储在文件起始部分的一段特定数据,它包含了关于文件内容、格式、版本、结构以及如何解释文件中数据的必要信息。不同类型的文件有不同的文件头信息格式。
S5:norm层的权重全部保持fp32,通过numpy形式写入bin文件 S6:分组量化,对每组求最大绝对值 $W_{max}=\max|w|$,缩放因子 $s=W_{max}/127.0$,
$q=w/s$,对 $q$ 取整 round(q) 得到int8;
再反量化回去($q\cdot s$)得到fp32的fp32valr,计算它与w的最大误差err,误差在 $O(10^{-3})$(约0.001)量级最好
# i is the zero-based index, w is the weight tensor at that index.
for i, w in enumerate(weights):
    # Quantize this weight: q holds the int8 values, s the scale factors,
    # err the error value reported by quantize_q80.
    q, s, err = quantize_q80(w, group_size)
    # Write the int8 tensor first ...
    serialize_int8(out_file, q)
    # ... followed by its fp32 scale factors.
    serialize_fp32(out_file, s)
def quantize_q80(w, group_size):
    """Quantize a tensor to Q8_0: symmetric int8 in [-127, 127], one scale per group.

    The tensor is flattened into rows of ``group_size`` elements and each row
    is scaled by its own maximum absolute value, so that
    ``float ~= int8 * scale``.

    Args:
        w: Tensor whose element count is a multiple of ``group_size``.
        group_size: Number of consecutive elements that share one scale.

    Returns:
        A tuple ``(int8val, scale, maxerr)``:
        ``int8val`` is an int8 tensor of shape ``(n_groups, group_size)``,
        ``scale`` is a float32 tensor of shape ``(n_groups,)``, and
        ``maxerr`` is a Python float, the largest absolute round-trip error
        over all elements.

    Raises:
        AssertionError: If ``w.numel()`` is not divisible by ``group_size``.
    """
    assert w.numel() % group_size == 0
    # Work in float32 with one row per group, e.g. torch.Size([2048000, 64]).
    w = w.float().reshape(-1, group_size)
    # Largest magnitude in each group.
    wmax = torch.abs(w).max(dim=1).values
    # Scaling factor such that float = quant * scale.
    scale = wmax / 127.0
    # An all-zero group has scale 0; dividing by it would give 0/0 = NaN and
    # an undefined int8 cast. Divide by 1 there instead so the group
    # quantizes to exact zeros. The returned scale is left untouched.
    safe_scale = torch.where(scale == 0, torch.ones_like(scale), scale)
    # Scale into the range [-127, 127] and round to the nearest integer.
    quant = w / safe_scale[:, None]
    int8val = torch.round(quant).to(torch.int8)
    # Dequantize by rescaling to measure the round-trip error.
    fp32valr = int8val.float() * scale[:, None]
    # Max error within each group, then across all groups.
    err = torch.abs(fp32valr - w).max(dim=1).values
    maxerr = err.max().item()
    return int8val, scale, maxerr
总结:
整个bin文件排布是256字节(bytes)的header + 权重
first write out the header. the header will be 256 bytes
now that the header is done, let’s write out the model