https://github.com/OpenGVLab/LLaMA-Adapter/tree/main/llama_adapter_v2_multimodal
这个链接中有一个大模型微调的例子
个人认为adapter只有在backbone比较大、学习得比较好的情况下有用，不适合一般的任务。
———————尽管有代码,貌似没效果—————————
这个是adapter的代码块,可直接用
# 这个是adapter的代码块,可直接用
class Adapter(nn.Module):
    """Conventional bottleneck adapter layer (Houlsby-style).

    Projects the hidden state down by ``reduction_factor``, applies GELU,
    projects back up, and adds the gated result to the input as a residual.
    The gate is zero-initialized so the adapter starts as an exact identity
    and does not perturb the pretrained backbone at the beginning of training.

    Args:
        config: model config; ``config.hidden_size`` (when present) sets the
            adapter width, otherwise 768 is used for backward compatibility.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Generalized from the hard-coded 768 (BERT-base width), which stays
        # the fallback when the config does not expose hidden_size.
        self.input_dim = getattr(config, "hidden_size", 768)
        reduction_factor = 16
        self.down_sample_size = self.input_dim // reduction_factor
        self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size)
        self.up_sampler = nn.Linear(self.down_sample_size, self.input_dim)
        # Zero-init learnable gate: scales only the adapter branch (see forward).
        self.gate = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        z = self.down_sampler(x)
        # Original called a bare, undefined `gelu`; use the library function.
        z = nn.functional.gelu(z)
        z = self.up_sampler(z)
        # BUG FIX: the original computed `gate * (z + x)`, multiplying the
        # *entire* output — residual included — by a zero-initialized gate,
        # which zeroed the forward signal. Gate only the adapter branch so
        # the layer is an identity at initialization.
        return x + self.gate * z
我把它直接用在bert的输出层中,也可以用在其他地方
class BertOutput(nn.Module):
    """BERT feed-forward output sublayer with a bottleneck Adapter inserted.

    Order of operations: intermediate->hidden projection, adapter, dropout,
    then residual add with the sublayer input followed by LayerNorm.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # The adapter is inserted right after the dense projection.
        self.adapter = Adapter(config)

    def forward(self, hidden_states, input_tensor):
        projected = self.dense(hidden_states)
        adapted = self.adapter(projected)
        dropped = self.dropout(adapted)
        return self.LayerNorm(dropped + input_tensor)
冻结参数
# Freeze every parameter of the encoder before selectively re-enabling adapters.
print('----------------------')
for _name, param in self.global_encoder.named_parameters():
    param.requires_grad = False
解冻adapter模块
# Walk all submodules (self.global_encoder may be a full model or any module)
# and re-enable gradients only for Adapter instances, leaving the rest of the
# frozen backbone untouched.
for name, sub_module in self.global_encoder.named_modules():
    if isinstance(sub_module, Adapter):
        print(f"{name} is trainable...")
        for param in sub_module.parameters():
            param.requires_grad = True
打印可训练的参数
def print_trainable_params_percentage(self, model):
    """Print and return the percentage of trainable parameters in ``model``.

    Args:
        model: an ``nn.Module`` whose parameters may be (partially) frozen.

    Returns:
        float: trainable parameter count as a percentage of all parameters;
        0.0 for a model with no parameters at all.
    """
    orig_param_size = sum(p.numel() for p in model.parameters())
    # Inlined the redundant nested count_parameters helper.
    trainable_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    # Guard a parameter-less model: the original raised ZeroDivisionError here.
    percentage = trainable_size / orig_param_size * 100 if orig_param_size else 0.0
    print(f"Trainable param percentage: {percentage:.2f}%")
    print(trainable_size)
    return percentage