对比各个大模型的网络结构
ps:使用自己的config,但是模型结构跟官方配置原理一致.
chatglm3
ChatGLMForConditionalGeneration(
(transformer): ChatGLMModel(
(embedding): Embedding(
(word_embeddings): Embedding(65024, 4096)
)
(rotary_pos_emb): RotaryEmbedding()
(encoder): GLMTransformer(
(layers): ModuleList(
(0-1): 2 x GLMBlock(
(input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(in_features=4096, out_features=12288, bias=False)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
)
)
(final_layernorm): RMSNorm()
)
(output_layer): Linear(in_features=4096, out_features=65024, bias=False)
)
)
#模型调试代码:
#=======通过下面这个代码可以debug学习chatglm3的代码.
from modeling_chatglm import ChatGLMForConditionalGeneration,ChatGLMConfig
import torch
def run():
    """Build a 2-layer ChatGLM3 from a config and push one random batch through it.

    The model is instantiated from ``ChatGLMConfig`` alone (no checkpoint is
    loaded), which makes it cheap enough to step through in a debugger. The
    module structure and the forward result are printed.
    """
    # Only these values are overridden here; any other setting that matters
    # can simply be copied over from the official config.json.
    cfg = ChatGLMConfig(num_layers=2, original_rope=True, use_cache=True)
    net = ChatGLMForConditionalGeneration(config=cfg)

    # Random token ids drawn from [0, vocab_size), shape (4, 30).
    batch_shape = (4, 30)
    token_ids = torch.randint(low=0, high=cfg.vocab_size, size=batch_shape)

    print(net)
    result = net(token_ids)
    print(result)


run()
llama2
LlamaForCausalLM(
(model): LlamaModel(
(embed_tokens): Embedding(32000, 2048)
(layers): ModuleList(
(0-1): 2 x LlamaDecoderLayer(
(self_attn): LlamaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=2048, bias=False)
(v_proj): Linear(in_features=2048, out_features=2048, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=2048, out_features=554, bias=False)
(up_proj): Linear(in_features=2048, out_features=554, bias=False)
(down_proj): Linear(in_features=554, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
(input_layernorm): LlamaRMSNorm()
(post_attention_layernorm): LlamaRMSNorm()
)
)
(norm): LlamaRMSNorm()
)
(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
)
#模型调试代码:
from transformers.models .llama import LlamaModel,LlamaConfig,LlamaForCausalLM
import torch
def run():
    """Build a 2-layer LlamaForCausalLM from a config and run one random batch.

    The model is instantiated from ``LlamaConfig`` alone (no checkpoint is
    loaded). The module structure and the forward result are printed.
    """
    # Most sizes are written as "reference value // 2" to keep the model small.
    # NOTE(review): 1108 may be a typo for 11008 (Llama-2-7B's
    # intermediate_size); it is kept as-is because the printed structure in
    # this file (out_features=554) was produced with this value.
    overrides = {
        "vocab_size": 32000,
        "hidden_size": 4096 // 2,
        "intermediate_size": 1108 // 2,
        "num_hidden_layers": 2,
        "num_attention_heads": 32 // 2,
        "max_position_embeddings": 2048 // 2,
    }
    llama_cfg = LlamaConfig(**overrides)

    # See https://hf-mirror.com/hiyouga/Llama-2-Chinese-13b-chat/blob/main/config.json :
    # its `architectures` entry shows that Llama 2 still uses the
    # LlamaForCausalLM architecture.
    causal_lm = LlamaForCausalLM(config=llama_cfg)

    # Random token ids drawn from [0, vocab_size), shape (4, 30).
    token_ids = torch.randint(low=0, high=llama_cfg.vocab_size, size=(4, 30))

    print(causal_lm)
    result = causal_lm(token_ids)
    print(result)


run()
核心部分放在一起比较
chatglm: (input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(in_features=4096, out_features=12288, bias=False)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
llama2: (self_attn): LlamaAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
(k_proj): Linear(in_features=2048, out_features=2048, bias=False)
(v_proj): Linear(in_features=2048, out_features=2048, bias=False)
(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
(rotary_emb): LlamaRotaryEmbedding()
)
(mlp): LlamaMLP(
(gate_proj): Linear(in_features=2048, out_features=554, bias=False)
(up_proj): Linear(in_features=2048, out_features=554, bias=False)
(down_proj): Linear(in_features=554, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
(input_layernorm): LlamaRMSNorm()
(post_attention_layernorm): LlamaRMSNorm()
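总结: 可以看出这2个模型除了参数有些区别外, 模型架构基本一样. chatglm的attention dim=4096, MLP(dnn)层只有2个Linear(dense_h_to_4h的输出27392=2×13696, 相当于把llama2的gate_proj和up_proj合并在一起); llama2的attention dim=2048, MLP(dnn)层拆成gate_proj/up_proj/down_proj共3个Linear. 就这么点区别.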
最新的phi-2很火
PhiForCausalLM(
(model): PhiModel(
(embed_tokens): Embedding(51200, 2048)
(embed_dropout): Dropout(p=0.0, inplace=False)
(layers): ModuleList(
(0-1): 2 x PhiDecoderLayer(
(self_attn): PhiAttention(
(q_proj): Linear(in_features=2048, out_features=2048, bias=True)
(k_proj): Linear(in_features=2048, out_features=2048, bias=True)
(v_proj): Linear(in_features=2048, out_features=2048, bias=True)
(dense): Linear(in_features=2048, out_features=2048, bias=True)
(rotary_emb): PhiRotaryEmbedding()
)
(mlp): PhiMLP(
(activation_fn): NewGELUActivation()
(fc1): Linear(in_features=2048, out_features=8192, bias=True)
(fc2): Linear(in_features=8192, out_features=2048, bias=True)
)
(input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
(resid_dropout): Dropout(p=0.0, inplace=False)
)
)
(final_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=2048, out_features=51200, bias=True)
)
#调试代码:
from transformers.models .llama import LlamaModel,LlamaConfig,LlamaForCausalLM
import torch
from configuration_phi import PhiConfig
from modeling_phi import PhiForCausalLM
def run():
    """Build a 2-layer PhiForCausalLM from a config and run one random batch.

    ``PhiConfig`` and ``PhiForCausalLM`` come from the local
    ``configuration_phi`` / ``modeling_phi`` modules. The model is
    instantiated from the config alone (no checkpoint is loaded). The module
    structure and the forward result are printed.
    """
    # Only the layer count is overridden; everything else uses PhiConfig defaults.
    phi_cfg = PhiConfig(num_hidden_layers=2)
    phi_model = PhiForCausalLM(config=phi_cfg)

    # Random token ids drawn from [0, vocab_size), shape (4, 30).
    token_ids = torch.randint(low=0, high=phi_cfg.vocab_size, size=(4, 30))

    print(phi_model)
    result = phi_model(token_ids)
    print(result)


run()
可以看到差别也不大.
多模态llava
核心代码
# Core of the LLaVA multimodal input construction: splice the image features
# into the text embedding sequence along dim 0. Resulting order:
#   1. text embeddings up to and including the image-start token,
#   2. the image features,
#   3. everything after the `num_patches` positions that follow the
#      image-start token (per the original note, this starts with the
#      image-end token).
cur_new_input_embeds = torch.cat((cur_input_embeds[:image_start_token_pos+1], cur_image_features, cur_input_embeds[image_start_token_pos + num_patches + 1:]), dim=0)
# 项目比较复杂. 整个项目在
https://github.com/zhangbo2008/llm_conclusion/blob/main/llava.py#L199
调试代码在 https://github.com/zhangbo2008/llm_conclusion/blob/main/4.py
总结:
多模态使用的方法是:文字信息+图片开始token+图片特征+图片结束token,然后进行编码,之后把所有信息当做文字信息喂给llama即可. 然后llama输出文字信息.所以整体架构就是把clip提取的图片特征跟文字经过embedding之后的tensor拼接之后喂给llama即可. 如果print网络,整体架构跟llama一样.
ps:后续会看各种大模型的训练方法.