model.embed_tokens.weight: torch.Size([128256, 4096])
model.layers.0.input_layernorm.weight: torch.Size([4096])
model.layers.0.self_attn.q_proj.weight: torch.Size([4096, 4096])
model.layers.0.self_attn.k_proj.weight: torch.Size([1024, 4096])
model.layers.0.self_attn.v_proj.weight: torch.Size([1024, 4096])
model.layers.0.self_attn.o_proj.weight: torch.Size([4096, 4096])
model.layers.0.post_attention_layernorm.weight: torch.Size([4096])
model.layers.0.mlp.gate_proj.weight: torch.Size([14336, 4096])
model.layers.0.mlp.up_proj.weight: torch.Size([14336, 4096])
model.layers.0.mlp.down_proj.weight: torch.Size([4096, 14336])
model.norm.weight: torch.Size([4096])
lm_head.weight: torch.Size([128256, 4096])
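A dump like the one above can be reproduced from any checkpoint in this Hugging Face Llama layout; the model name below is an assumption (any checkpoint with hidden size 4096, 32 query heads, 8 KV heads of dimension 128, and a 128256-token vocabulary, e.g. Llama-3-8B, gives exactly these shapes). The grouped-query attention explains the asymmetry: q_proj has 32 × 128 = 4096 output rows, while k_proj and v_proj have only 8 × 128 = 1024.

```python
import torch
from transformers import AutoModelForCausalLM

# Checkpoint name is an assumption; any Llama-3-8B-class model matches the shapes above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B", torch_dtype=torch.bfloat16
)

for name, param in model.state_dict().items():
    # Print layer 0 only, so the per-layer keys are not repeated 32 times.
    if ".layers." in name and ".layers.0." not in name:
        continue
    print(f"{name}: {param.shape}")
```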
embedding.word_embeddings.weight: torch.Size([128256, 4096])
decoder.layers.0.input_layernorm.weight: torch.Size([4096])
decoder.layers.0.self_attention.linear_qkv.weight: torch.Size([6144, 4096])
decoder.layers.0.self_attention.linear_proj.weight: torch.Size([4096, 4096])
decoder.layers.0.pre_mlp_layernorm.weight: torch.Size([4096])
decoder.layers.0.mlp.linear_fc1.weight: torch.Size([28672, 4096])
decoder.layers.0.mlp.linear_fc2.weight: torch.Size([4096, 14336])
decoder.final_layernorm.weight: torch.Size([4096])
output_layer.weight: torch.Size([128256, 4096])
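The Megatron-Core layout above holds the same parameters but fuses them: linear_qkv stacks the query, key and value projections (4096 + 1024 + 1024 = 6144 rows), and linear_fc1 stacks the gate and up projections (2 × 14336 = 28672 rows). The sketch below shows only this shape bookkeeping; the exact head interleaving Megatron-Core uses inside linear_qkv (q/k/v arranged per query group) is glossed over, so the plain concatenation is illustrative rather than actual conversion code.

```python
import torch

hidden, n_q_heads, n_kv_heads, head_dim, ffn = 4096, 32, 8, 128, 14336

q = torch.empty(n_q_heads * head_dim, hidden)    # q_proj:  [4096, 4096]
k = torch.empty(n_kv_heads * head_dim, hidden)   # k_proj:  [1024, 4096]
v = torch.empty(n_kv_heads * head_dim, hidden)   # v_proj:  [1024, 4096]
linear_qkv = torch.cat([q, k, v], dim=0)         # fused:   [6144, 4096]

gate = torch.empty(ffn, hidden)                  # gate_proj: [14336, 4096]
up = torch.empty(ffn, hidden)                    # up_proj:   [14336, 4096]
linear_fc1 = torch.cat([gate, up], dim=0)        # fused:     [28672, 4096]

linear_fc2 = torch.empty(hidden, ffn)            # down_proj unchanged: [4096, 14336]
```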
embedding.word_embeddings.weight: torch.Size([64128, 4096])
decoder.layers.0.self_attention.linear_qkv.weight: torch.Size([3072, 4096])
decoder.layers.0.self_attention.linear_proj.weight: torch.Size([4096, 2048])
decoder.layers.0.mlp.linear_fc1.weight: torch.Size([14336, 4096])
decoder.layers.0.mlp.linear_fc2.weight: torch.Size([4096, 7168])
decoder.layers.0.input_layernorm.weight: torch.Size([4096])
decoder.layers.0.pre_mlp_layernorm.weight: torch.Size([4096])
*16 layers
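The last listing is what a single rank sees under tensor parallelism of degree 2: column-parallel weights (word embeddings, linear_qkv, linear_fc1, output_layer) are split along dim 0, row-parallel weights (linear_proj, linear_fc2) along dim 1, and layernorm weights are replicated on every rank. The following is a minimal sketch of that shape arithmetic, not Megatron's actual sharding code; in particular, for the fused linear_qkv and linear_fc1 tensors Megatron shards each fused component separately (per query group, and gate/up each in half), which the naive chunk below ignores even though the resulting per-rank shapes are the same.

```python
import torch

TP = 2  # tensor-parallel degree assumed from the halved shapes above

def column_shard(weight: torch.Tensor, rank: int) -> torch.Tensor:
    # Column-parallel layers split the output dimension (dim 0) across ranks.
    return torch.chunk(weight, TP, dim=0)[rank]

def row_shard(weight: torch.Tensor, rank: int) -> torch.Tensor:
    # Row-parallel layers split the input dimension (dim 1) across ranks.
    return torch.chunk(weight, TP, dim=1)[rank]

full = {
    "embedding.word_embeddings.weight": torch.empty(128256, 4096),                  # column
    "decoder.layers.0.self_attention.linear_qkv.weight": torch.empty(6144, 4096),   # column
    "decoder.layers.0.self_attention.linear_proj.weight": torch.empty(4096, 4096),  # row
    "decoder.layers.0.mlp.linear_fc1.weight": torch.empty(28672, 4096),             # column
    "decoder.layers.0.mlp.linear_fc2.weight": torch.empty(4096, 14336),             # row
}

rank0 = {
    name: (row_shard(w, 0)
           if name.endswith(("linear_proj.weight", "linear_fc2.weight"))
           else column_shard(w, 0))
    for name, w in full.items()
}

for name, w in rank0.items():
    print(f"{name}: {tuple(w.shape)}")
# word_embeddings -> (64128, 4096), linear_qkv -> (3072, 4096),
# linear_proj -> (4096, 2048), linear_fc1 -> (14336, 4096), linear_fc2 -> (4096, 7168)
```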