如果我们让一个类继承了PreTrainedModel,并对它调用from_pretrained方法,它的权重是在构造方法(__init__)执行过后才赋给相应的模块,所以如果我们试图在构造方法里面获取某部分的权重,则只会得到一个随机初始化的权重。
举例来说,我们自定义了一个类BartModelCustom:
class BartModelCustom(BartPretrainedModel):
    """Custom BART variant with an extra knowledge encoder (``knowl_encoder``)
    that is intended to share the pretrained encoder's weights.

    NOTE(review): pretrained weights are assigned by ``from_pretrained`` only
    AFTER ``__init__`` finishes, so copying weights here would copy random
    initial values — the copy must happen after ``from_pretrained`` returns.
    """

    # These embedding weights are tied to ``self.shared``, so they are expected
    # to be absent from the checkpoint and must not be reported as missing.
    _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: BartConfig):
        super().__init__(config)
        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
        # Token embedding table shared by both encoders and the decoder.
        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
        self.encoder = BartEncoder(config, self.shared)
        self.knowl_encoder = BartEncoder(config, self.shared)  # should later receive a copy of self.encoder's weights
        self.decoder = MyBartDecoder(config, self.shared)
        # Runs weight initialization; checkpoint weights are loaded later by
        # from_pretrained(), not here.
        self.post_init()
如果我们在__init__方法里写如下代码:
# Attempted weight copy inside __init__ — shown here to demonstrate that it
# does NOT work: at this point the modules still hold random init values,
# because from_pretrained() assigns checkpoint weights only after __init__.
with torch.no_grad():
    encoder_dict = self.encoder.state_dict()
    knowl_encoder_dict = self.knowl_encoder.state_dict()
    # Copy each encoder tensor into the matching knowl_encoder tensor in place.
    for name in knowl_encoder_dict:
        knowl_encoder_dict[name].data.copy_(encoder_dict[name].data)
    # BUG fixed: the original wrote ``self.model.knowl_encoder``, but inside the
    # model's own __init__ there is no ``self.model`` attribute.
    self.knowl_encoder.load_state_dict(knowl_encoder_dict)
则会发现并没有成功复制,而是获得了一个随机初始化的权重。如果想要成功复制,则需要在执行过from_pretrained后才进行调用。
# Correct approach: copy the encoder weights into knowl_encoder AFTER
# from_pretrained(), when the checkpoint weights have actually been assigned.
model = BartModelCustom.from_pretrained(xxxxxx)
with torch.no_grad():
    # BUGs fixed vs. the original snippet: ``selfencoder`` (missing dot) and
    # stray ``self.`` / ``self.model.`` references — at this point we are
    # outside the class, so everything is reached through the local ``model``.
    encoder_dict = model.encoder.state_dict()
    knowl_encoder_dict = model.knowl_encoder.state_dict()
    # Copy each encoder tensor into the matching knowl_encoder tensor in place.
    for name in knowl_encoder_dict:
        knowl_encoder_dict[name].data.copy_(encoder_dict[name].data)
    model.knowl_encoder.load_state_dict(knowl_encoder_dict)
对相应层的权重进行打印测试,确认复制成功:
# Sanity check: both encoders' first-layer k_proj weights should now be identical.
for k,v in model.named_parameters():
    if k in ['encoder.layers.0.self_attn.k_proj.weight','knowl_encoder.layers.0.self_attn.k_proj.weight']:
        print(k)
        print(v)
encoder.layers.0.self_attn.k_proj.weight
Parameter containing:
tensor([[-0.0429, 0.1521, -0.0441, ..., -0.0341, 0.1436, 0.0472],
[ 0.0246, -0.0109, 0.1132, ..., 0.0854, 0.0322, 0.0642],
[ 0.0891, 0.0030, 0.0444, ..., -0.0030, 0.0170, -0.0308],
...,
[ 0.0415, 0.0958, 0.0370, ..., -0.0731, -0.0214, -0.0576],
[ 0.0730, 0.1142, -0.0082, ..., 0.0634, 0.0139, -0.0818],
[ 0.0955, -0.0986, -0.0167, ..., -0.0326, 0.0297, 0.0875]],
requires_grad=True)
knowl_encoder.layers.0.self_attn.k_proj.weight
Parameter containing:
tensor([[-0.0429, 0.1521, -0.0441, ..., -0.0341, 0.1436, 0.0472],
[ 0.0246, -0.0109, 0.1132, ..., 0.0854, 0.0322, 0.0642],
[ 0.0891, 0.0030, 0.0444, ..., -0.0030, 0.0170, -0.0308],
...,
[ 0.0415, 0.0958, 0.0370, ..., -0.0731, -0.0214, -0.0576],
[ 0.0730, 0.1142, -0.0082, ..., 0.0634, 0.0139, -0.0818],
[ 0.0955, -0.0986, -0.0167, ..., -0.0326, 0.0297, 0.0875]],
requires_grad=True)