Config
GPT2Config {
"activation_function": "gelu_new",
"architectures": [
"GPT2LMHeadModel"
],
"attn_pdrop": 0.1,
"bos_token_id": 50256,
"embd_pdrop": 0.1,
"eos_token_id": 50256,
"initializer_range": 0.02,
"layer_norm_epsilon": 1e-05,
"model_type": "gpt2",
"n_ctx": 1024,
"n_embd": 768,
"n_head": 12,
"n_inner": null,
"n_layer": 12,
"n_positions": 1024,
"reorder_and_upcast_attn": false,
"resid_pdrop": 0.1,
"scale_attn_by_inverse_layer_idx": false,
"scale_attn_weights": true,
"summary_activation": null,
"summary_first_dropout": 0.1,
"summary_proj_to_labels": true,
"summary_type": "cls_index",
"summary_use_proj": true,
"task_specific_params": {
"text-generation": {
"do_sample": true,
"max_length": 50
}
},
"transformers_version": "4.38.2",
"use_cache": true,
"vocab_size": 50257
}
name & module
打印网络结构
for name, module in model.named_modules():
    print(name)
    print(module)
GPT2LMHeadModel(
(transformer): GPT2Model(
(wte): Embedding(50257, 768)
(wpe): Embedding(1024, 768)
(drop): Dropout(p=0.1, inplace=False)
(h): ModuleList(
(0-11): 12 x GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
transformer
GPT2Model(
(wte): Embedding(50257, 768)
(wpe): Embedding(1024, 768)
(drop): Dropout(p=0.1, inplace=False)
(h): ModuleList(
(0-11): 12 x GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
transformer.wte
Embedding(50257, 768)
transformer.wpe
Embedding(1024, 768)
transformer.drop
Dropout(p=0.1, inplace=False)
transformer.h
ModuleList(
(0-11): 12 x GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
transformer.h.0
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.0.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.0.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.0.attn.c_attn
Conv1D()
transformer.h.0.attn.c_proj
Conv1D()
transformer.h.0.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.0.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.0.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.0.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.0.mlp.c_fc
Conv1D()
transformer.h.0.mlp.c_proj
Conv1D()
transformer.h.0.mlp.act
NewGELUActivation()
transformer.h.0.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.1
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.1.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.1.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.1.attn.c_attn
Conv1D()
transformer.h.1.attn.c_proj
Conv1D()
transformer.h.1.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.1.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.1.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.1.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.1.mlp.c_fc
Conv1D()
transformer.h.1.mlp.c_proj
Conv1D()
transformer.h.1.mlp.act
NewGELUActivation()
transformer.h.1.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.2
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.2.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.2.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.2.attn.c_attn
Conv1D()
transformer.h.2.attn.c_proj
Conv1D()
transformer.h.2.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.2.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.2.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.2.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.2.mlp.c_fc
Conv1D()
transformer.h.2.mlp.c_proj
Conv1D()
transformer.h.2.mlp.act
NewGELUActivation()
transformer.h.2.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.3
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.3.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.3.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.3.attn.c_attn
Conv1D()
transformer.h.3.attn.c_proj
Conv1D()
transformer.h.3.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.3.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.3.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.3.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.3.mlp.c_fc
Conv1D()
transformer.h.3.mlp.c_proj
Conv1D()
transformer.h.3.mlp.act
NewGELUActivation()
transformer.h.3.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.4
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.4.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.4.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.4.attn.c_attn
Conv1D()
transformer.h.4.attn.c_proj
Conv1D()
transformer.h.4.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.4.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.4.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.4.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.4.mlp.c_fc
Conv1D()
transformer.h.4.mlp.c_proj
Conv1D()
transformer.h.4.mlp.act
NewGELUActivation()
transformer.h.4.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.5
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.5.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.5.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.5.attn.c_attn
Conv1D()
transformer.h.5.attn.c_proj
Conv1D()
transformer.h.5.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.5.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.5.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.5.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.5.mlp.c_fc
Conv1D()
transformer.h.5.mlp.c_proj
Conv1D()
transformer.h.5.mlp.act
NewGELUActivation()
transformer.h.5.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.6
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.6.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.6.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.6.attn.c_attn
Conv1D()
transformer.h.6.attn.c_proj
Conv1D()
transformer.h.6.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.6.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.6.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.6.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.6.mlp.c_fc
Conv1D()
transformer.h.6.mlp.c_proj
Conv1D()
transformer.h.6.mlp.act
NewGELUActivation()
transformer.h.6.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.7
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.7.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.7.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.7.attn.c_attn
Conv1D()
transformer.h.7.attn.c_proj
Conv1D()
transformer.h.7.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.7.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.7.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.7.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.7.mlp.c_fc
Conv1D()
transformer.h.7.mlp.c_proj
Conv1D()
transformer.h.7.mlp.act
NewGELUActivation()
transformer.h.7.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.8
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.8.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.8.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.8.attn.c_attn
Conv1D()
transformer.h.8.attn.c_proj
Conv1D()
transformer.h.8.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.8.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.8.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.8.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.8.mlp.c_fc
Conv1D()
transformer.h.8.mlp.c_proj
Conv1D()
transformer.h.8.mlp.act
NewGELUActivation()
transformer.h.8.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.9
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.9.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.9.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.9.attn.c_attn
Conv1D()
transformer.h.9.attn.c_proj
Conv1D()
transformer.h.9.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.9.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.9.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.9.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.9.mlp.c_fc
Conv1D()
transformer.h.9.mlp.c_proj
Conv1D()
transformer.h.9.mlp.act
NewGELUActivation()
transformer.h.9.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.10
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.10.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.10.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.10.attn.c_attn
Conv1D()
transformer.h.10.attn.c_proj
Conv1D()
transformer.h.10.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.10.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.10.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.10.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.10.mlp.c_fc
Conv1D()
transformer.h.10.mlp.c_proj
Conv1D()
transformer.h.10.mlp.act
NewGELUActivation()
transformer.h.10.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.h.11
GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
transformer.h.11.ln_1
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.11.attn
GPT2Attention(
(c_attn): Conv1D()
(c_proj): Conv1D()
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.11.attn.c_attn
Conv1D()
transformer.h.11.attn.c_proj
Conv1D()
transformer.h.11.attn.attn_dropout
Dropout(p=0.1, inplace=False)
transformer.h.11.attn.resid_dropout
Dropout(p=0.1, inplace=False)
transformer.h.11.ln_2
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
transformer.h.11.mlp
GPT2MLP(
(c_fc): Conv1D()
(c_proj): Conv1D()
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
transformer.h.11.mlp.c_fc
Conv1D()
transformer.h.11.mlp.c_proj
Conv1D()
transformer.h.11.mlp.act
NewGELUActivation()
transformer.h.11.mlp.dropout
Dropout(p=0.1, inplace=False)
transformer.ln_f
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
lm_head
Linear(in_features=768, out_features=50257, bias=False)