《从零构建大模型》系列（21）：从头实现GPT模型

# Demo: statistics of activations that have not been normalized.
# Standard-normal noise scaled by 10 stands in for high-variance activations.
activations = 10 * torch.randn(1000, 768)

# Per-sample statistics, reduced over the feature axis (dim=1).
mean = torch.mean(activations, dim=1)
std = torch.std(activations, dim=1)

# Show the smallest and largest value of each statistic across the samples.
for label, stat in (("均值范围:", mean), ("标准差范围:", std)):
    print(label, stat.min().item(), "~", stat.max().item())

层归一化优势：

稳定训练过程
加速收敛速度
缓解梯度消失/爆炸问题
减少对初始化的依赖

2.2 层归一化实现代码

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Every vector along the last axis is shifted to zero mean, divided by
    ``sqrt(var + eps)`` (``var`` being the biased variance), and then mapped to
    ``gamma * x_hat + beta``.

    Args:
        d_model: Size of the last dimension of the inputs.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, d_model, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))  # learnable scale
        self.beta = nn.Parameter(torch.zeros(d_model))  # learnable shift

    def forward(self, x):
        """Normalize ``x`` along its last dimension; the output has the shape of ``x``."""
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        # eps belongs inside the square root (sqrt(var + eps)), which is the
        # formula torch.nn.LayerNorm uses. Dividing by (std + eps) instead
        # gives a slightly different result, and the gap grows with eps.
        x_normalized = (x - mean) / torch.sqrt(var + self.eps)

        return self.gamma * x_normalized + self.beta

# Comparison against PyTorch's built-in implementation
def test_layernorm():
    """Print the largest element-wise gap between the custom and the built-in layer norm."""
    input_tensor = torch.randn(2, 3, 768)

    # Feed the same input through a freshly constructed layer of each kind.
    custom_out = LayerNorm(768)(input_tensor)
    official_out = nn.LayerNorm(768)(input_tensor)

    # Largest absolute difference over all elements
    diff = torch.max(torch.abs(custom_out - official_out)).item()
    print(f"最大差异: {diff:.6f}")  # expected to be below 1e-5


test_layernorm()

三、前馈神经网络实现

3.1 GPT中的前馈结构

3.2 GELU激活函数

class GELU(nn.Module):
    """Gaussian Error Linear Unit activation, computed with the tanh approximation.

    ``forward`` returns ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise, so the output has the same shape as the input.
    """

    def forward(self, x):
        # Argument of the tanh in the approximation formula.
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

3.3 完整前馈网络实现

class FeedForward(nn.Module):
    """Feed-forward sub-network: linear expansion, GELU, linear projection, dropout.

    Args:
        d_model: Feature size of both the input and the output.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        hidden_dim = 4 * d_model  # the inner layer is four times wider
        layers = [
            nn.Linear(d_model, hidden_dim),  # widen
            GELU(),  # custom GELU activation
            nn.Linear(hidden_dim, d_model),  # project back to d_model
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x``."""
        out = self.net(x)
        return out

四、Transformer块实现

4.1 Transformer块结构

class TransformerBlock(nn.Module):
    """Transformer block: attention and feed-forward sub-layers with residual connections.

    Each sub-layer receives a layer-normalized copy of its input, and its
    output passes through dropout before being added back onto the input.

    Args:
        d_model: Feature size handed to every sub-module.
        num_heads: Head count forwarded to ``MultiHeadAttention``.
        dropout: Dropout probability, forwarded to the sub-layers and also
            used for the dropout on each residual branch.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        # The two sub-layers: multi-head attention, then the feed-forward network.
        self.attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, dropout)

        # One layer norm placed in front of each sub-layer.
        self.ln1, self.ln2 = LayerNorm(d_model), LayerNorm(d_model)

        # Dropout shared by both residual branches.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Both branches have the form: x + dropout(sublayer(norm(x))).
        for norm, sublayer in ((self.ln1, self.attn), (self.ln2, self.ffn)):
            x = x + self.dropout(sublayer(norm(x)))
        return x

4.2 残差连接的重要性

对比项	带残差连接	不带残差连接
训练稳定性	高	低
收敛速度	快	慢
深度扩展性	支持深层	难以超过10层
梯度流动	畅通	易消失

五、完整GPT模型实现

5.1 模型配置类

class GPTConfig:
    """Plain container for the hyperparameters of a GPT model.

    Each constructor argument is stored unchanged as an attribute of the
    same name.

    Args:
        vocab_size: Number of entries in the vocabulary.
        context_length: Maximum number of positions.
        emb_dim: Embedding dimension.
        n_heads: Number of attention heads.
        n_layers: Number of Transformer blocks.
        drop_rate: Dropout probability.
        qkv_bias: Flag for a bias term in the query/key/value projections.
    """

    def __init__(self, vocab_size=50257, context_length=1024, emb_dim=768,
                 n_heads=12, n_layers=12, drop_rate=0.1, qkv_bias=False):
        # Vocabulary and sequence geometry
        self.vocab_size, self.context_length = vocab_size, context_length
        # Model width and depth
        self.emb_dim, self.n_heads, self.n_layers = emb_dim, n_heads, n_layers
        # Regularization and projection options
        self.drop_rate, self.qkv_bias = drop_rate, qkv_bias


# GPT-2 Small configuration (every default left as is)
GPT_CONFIG_124M = GPTConfig()

5.2 完整GPT模型类

class GPT(nn.Module):
    """GPT language model: embeddings, a stack of Transformer blocks, and an output head.

    Args:
        config: Object exposing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``n_heads``, ``n_layers`` and ``drop_rate``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Token embedding table
        self.token_embed = nn.Embedding(config.vocab_size, config.emb_dim)

        # Learned position embedding table, one row per position
        self.position_embed = nn.Embedding(config.context_length, config.emb_dim)

        # Dropout applied to the summed embeddings
        self.dropout = nn.Dropout(config.drop_rate)

        # Stack of Transformer blocks
        self.blocks = nn.ModuleList([
            TransformerBlock(
                d_model=config.emb_dim,
                num_heads=config.n_heads,
                dropout=config.drop_rate
            ) for _ in range(config.n_layers)
        ])

        # Final layer normalization
        self.ln_f = LayerNorm(config.emb_dim)

        # Output projection onto the vocabulary
        self.head = nn.Linear(config.emb_dim, config.vocab_size, bias=False)

        # Weight tying: the output projection reuses the token embedding matrix
        self.head.weight = self.token_embed.weight

        # Initialize the weights of every sub-module
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear and Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input_ids):
        """Compute next-token logits for a batch of token IDs.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.

        Returns:
            Logits whose last dimension is ``vocab_size``, one vector per
            input position.

        Raises:
            IndexError: If ``seq_len`` exceeds ``config.context_length``.
        """
        _, seq_len = input_ids.shape

        # Fail early with a clear message instead of letting the position
        # embedding lookup below hit an out-of-range index.
        if seq_len > self.config.context_length:
            raise IndexError(
                f"sequence length {seq_len} exceeds the model's "
                f"context_length {self.config.context_length}"
            )

        # Token embeddings
        token_embeds = self.token_embed(input_ids)

        # Position embeddings for positions 0..seq_len-1, created on the input's device
        positions = torch.arange(seq_len, device=input_ids.device)
        pos_embeds = self.position_embed(positions)

        # Combine both embeddings (the position rows broadcast over the batch)
        x = token_embeds + pos_embeds
        x = self.dropout(x)

        # Run through the Transformer blocks in order
        for block in self.blocks:
            x = block(x)

        # Final layer normalization
        x = self.ln_f(x)

        # Project onto the vocabulary
        logits = self.head(x)

        return logits

六、模型输入输出处理

6.1 文本处理流程

import tiktoken

# Set up the tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Sample texts
texts = [
    "Every effort moves you",
    "Every day holds a",
]


def encode_batch(texts):
    """Tokenize every text and stack the results into one padded ID tensor.

    Args:
        texts: Iterable of strings to encode with the module-level tokenizer.

    Returns:
        Tensor with one row per text (batch first); rows shorter than the
        longest one are filled up with the padding value 0.
    """
    encoded = [torch.tensor(tokenizer.encode(text)) for text in texts]
    return torch.nn.utils.rnn.pad_sequence(
        encoded, batch_first=True, padding_value=0
    )


# Encode the sample batch
input_ids = encode_batch(texts)
print("输入ID形状:", input_ids.shape)
print("输入ID:\n", input_ids)

6.2 模型推理示例

# Build the model
config = GPTConfig(
    vocab_size=tokenizer.n_vocab,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1
)
model = GPT(config)

# Forward pass for inference.
# A freshly constructed nn.Module is in training mode, where dropout randomly
# zeroes activations and makes the logits differ from run to run. Switch to
# eval mode for the forward pass, then restore the previous mode so the model
# is left in the same state it had before this snippet.
was_training = model.training
model.eval()
with torch.no_grad():
    logits = model(input_ids)
model.train(was_training)

print("输出logits形状:", logits.shape)
print("最后一个词元的预测分布:", logits[0, -1, :5])  # show only the first 5 logits

七、模型参数量计算

7.1 参数量计算函数

def count_parameters(model):
    """Print the size of every trainable parameter and return their total.

    Args:
        model: Module whose ``named_parameters()`` are inspected.

    Returns:
        Total number of elements over all parameters with ``requires_grad`` set.
    """
    # (name, element count) for each trainable parameter, in registration order.
    trainable = [
        (name, param.numel())
        for name, param in model.named_parameters()
        if param.requires_grad
    ]
    for name, params in trainable:
        print(f"{name}: {params:,}")

    total_params = sum(count for _, count in trainable)
    print(f"总参数量: {total_params:,}")
    return total_params

# Count the parameters of the GPT-2 Small sized model
total_params = count_parameters(model)
# Report the total in millions, with two decimals
print(f"模型参数量: {total_params/1e6:.2f}M")

7.2 GPT-2 Small参数分布

组件	参数量	占比
词元嵌入	38.5M	31.0%
位置嵌入	0.8M	0.6%
Transformer块	84.9M	68.4%
输出层	0 (共享权重)	0%
总计	124.2M	100%

八、模型存储需求分析

8.1 存储需求计算

def model_size(model):
    """Return the memory taken by a module's parameters and buffers, in MB.

    The size of each tensor is its element count times its per-element byte
    size; the byte total is divided by 1024**2.

    Args:
        model: Module whose ``parameters()`` and ``buffers()`` are measured.

    Returns:
        Combined size as a float number of MB.
    """
    param_bytes = sum(p.nelement() * p.element_size() for p in model.parameters())
    buffer_bytes = sum(b.nelement() * b.element_size() for b in model.buffers())
    return (param_bytes + buffer_bytes) / (1024**2)  # bytes -> MB

print(f"模型内存占用: {model_size(model):.2f} MB")

# Storage needed at different numeric precisions: bytes per parameter -> MB
precision_sizes = {
    dtype_name: total_params * bytes_per_param / (1024**2)
    for dtype_name, bytes_per_param in (("float32", 4), ("float16", 2), ("int8", 1))
}

for prec, size in precision_sizes.items():
    print(f"{prec}精度下存储需求: {size:.2f} MB")

8.2 不同GPT模型存储需求（下表按 1 MB = 10^6 字节估算，与上文代码中除以 1024² 的换算结果略有差异）

模型	参数量	float32 (MB)	float16 (MB)	int8 (MB)
GPT-2 Small	124M	496	248	124
GPT-2 Medium	355M	1,420	710	355
GPT-2 Large	774M	3,096	1,548	774
GPT-2 XL	1.5B	6,000	3,000	1,500
GPT-3 175B	175B	700,000	350,000	175,000

九、模型架构验证

9.1 输出形状验证

# Check the output shape for several input lengths.
# Only the shapes are inspected, so run under no_grad to avoid recording an
# autograd graph for every forward pass.
with torch.no_grad():
    for seq_len in [32, 64, 128]:
        test_input = torch.randint(0, config.vocab_size, (2, seq_len))
        logits = model(test_input)
        assert logits.shape == (2, seq_len, config.vocab_size)
        print(f"seq_len={seq_len} 测试通过")

9.2 梯度流测试

# Optimizer used by the gradient-flow check
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


def train_step(batch):
    """Run one next-token-prediction optimization step and return its loss.

    Uses the module-level ``model`` and ``optimizer``. After the backward
    pass, a warning is printed for every parameter whose mean absolute
    gradient is below 1e-6.

    Args:
        batch: 2-D tensor of token IDs; all but the last column serve as
            inputs and all but the first column as targets.

    Returns:
        The loss of this step as a Python float.
    """
    optimizer.zero_grad()

    # Next-token objective: the input at position t predicts the token at t + 1.
    inputs, targets = batch[:, :-1], batch[:, 1:]

    logits = model(inputs)

    # Flatten batch and sequence so each position is one classification
    # example; targets equal to 0 (the padding value) are left out of the loss.
    vocab_dim = logits.size(-1)
    loss = torch.nn.functional.cross_entropy(
        logits.reshape(-1, vocab_dim),
        targets.reshape(-1),
        ignore_index=0,
    )

    loss.backward()

    # Gradient check: flag parameters with a very small mean absolute gradient.
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        grad_mean = param.grad.abs().mean().item()
        if grad_mean < 1e-6:
            print(f"警告: {name} 梯度过小 ({grad_mean:.2e})")

    optimizer.step()
    return loss.item()


# Run a single step on a random batch
test_batch = torch.randint(0, config.vocab_size, (4, 16))
loss = train_step(test_batch)
print(f"测试步骤损失: {loss:.4f}")