the teacher layer ids to distill is: [0, 1, 2, 3, 4, 5]
odict_keys([
‘bert.embeddings.word_embeddings.weight’,
‘bert.embeddings.position_embeddings.weight’,
‘bert.embeddings.token_type_embeddings.weight’,
‘bert.embeddings.LayerNorm.weight’,
‘bert.embeddings.LayerNorm.bias’,
‘bert.encoder.layer.0.attention.self.query.weight’,
‘bert.encoder.layer.0.attention.self.query.bias’,
‘bert.encoder.layer.0.attention.self.key.weight’,
‘bert.encoder.layer.0.attention.self.key.bias’,
‘bert.encoder.layer.0.attention.self.value.weight’,
‘bert.encoder.layer.0.attention.self.value.bias’,
‘bert.encoder.layer.0.attention.output.dense.weight’,
‘bert.encoder.layer.0.attention.output.dense.bias’,
‘bert.encoder.layer.0.attention.output.LayerNorm.weight’,
‘bert.encoder.layer.0.attention.output.LayerNorm.bias’,
‘bert.encoder.layer.0.intermediate.dense.weight’,
‘bert.encoder.layer.0.intermediate.dense.bias’,
‘bert.encoder.layer.0.output.dense.weight’,
‘bert.encoder.layer.0.output.dense.bias’,
‘bert.encoder.layer.0.output.LayerNorm.weight’,
‘bert.encoder.layer.0.output.LayerNorm.bias’,
‘cls.predictions.transform.dense.weight’,
‘cls.predictions.transform.dense.bias’,
‘cls.predictions.transform.LayerNorm.weight’,
‘cls.predictions.transform.LayerNorm.bias’,
‘cls.predictions.decoder.weight’,
‘cls.predictions.decoder.bias’,
‘cls.predictions.bias’,
‘bert.pooler.dense.weight’,
‘bert.pooler.dense.bias’,
])