Knowledge Distillation (PyTorch, Handwritten Digit Recognition)

Knowledge Distillation
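Knowledge distillation (Hinton et al., 2015) trains a small student network to mimic the softened output distribution of a larger teacher network, rather than learning only from hard class labels. With teacher logits z_t, student logits z_s, and a temperature T, the distillation loss used in the code below is

\mathcal{L}_{\mathrm{KD}} = T^{2} \cdot \mathrm{KL}\big(\, \mathrm{softmax}(z_t / T) \,\|\, \mathrm{softmax}(z_s / T) \,\big)

The T^2 factor compensates for the scaling the temperature introduces into the gradients, so the loss magnitude stays comparable across temperatures.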

Code

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Check whether CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the large teacher model
class TeacherModel(nn.Module):
    def __init__(self):
        super(TeacherModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)  # final layer emits raw logits

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)  # return raw logits; CrossEntropyLoss applies log-softmax internally
        return x
# Define the small student model
class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 7 * 7, 64)
        self.fc2 = nn.Linear(64, 10)  # final layer emits raw logits

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)  # return raw logits; the distillation loss applies its own temperature-scaled softmax
        return x
        return x

# MNIST preprocessing: convert images to tensors and normalize with the dataset mean/std
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
# Download and load the MNIST train and test splits
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Instantiate both models and move them to the device
teacher = TeacherModel()
student = StudentModel()
teacher.to(device)
student.to(device)
# Standard supervised training with hard labels
def train(model, criterion, optimizer, train_loader, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {running_loss / 100:.4f}')
                running_loss = 0.0

# Evaluate a model on the test set
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
student_optimizer = optim.Adam(student.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train and evaluate the teacher model
train(teacher, criterion, teacher_optimizer, train_loader, num_epochs=5)
test(teacher, test_loader)
kd_criterion = nn.KLDivLoss(reduction='batchmean')  # KL divergence between the softened distributions
temperature = 2.0  # temperature used to soften both output distributions
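# A higher temperature flattens the softmax and exposes the teacher's relative
# preferences among the wrong classes ("dark knowledge"). Illustrative example:
# logits [4.0, 2.0, 0.0] yield softmax probabilities of about [0.87, 0.12, 0.02],
# while dividing by temperature 2 first gives roughly [0.67, 0.24, 0.09].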
# Knowledge-distillation training loop: the frozen teacher provides soft targets
def knowledge_distillation_train(teacher_model, student_model, criterion, optimizer, train_loader, num_epochs=5):
    teacher_model.eval()   # teacher is frozen; used only for inference
    student_model.train()  # student is the model being trained
    for epoch in range(num_epochs):
        running_loss = 0.0
        for data, _ in train_loader:  # hard labels are not needed for pure distillation
            data = data.to(device)
            optimizer.zero_grad()
            with torch.no_grad():
                teacher_logits = teacher_model(data)  # no gradients through the teacher
            student_logits = student_model(data)
            # Soften both sets of logits with the temperature. KLDivLoss expects
            # log-probabilities as input and probabilities as target; the T^2
            # factor keeps gradient magnitudes comparable across temperatures.
            log_probs = torch.nn.functional.log_softmax(student_logits / temperature, dim=-1)
            soft_targets = torch.nn.functional.softmax(teacher_logits / temperature, dim=-1)
            loss = criterion(log_probs, soft_targets) * temperature ** 2
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader):.4f}')

# Distill the teacher into the student
knowledge_distillation_train(teacher, student, kd_criterion, student_optimizer, train_loader, num_epochs=50)
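
A common variant, not used in the loop above, blends the soft distillation loss with the ordinary hard-label cross-entropy. The sketch below assumes the logits-returning models defined earlier; the weight alpha is a hypothetical hyperparameter, not something fixed by the method.

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, target, T=2.0, alpha=0.5):
    # Soft term: KL between temperature-softened teacher and student
    # distributions, scaled by T^2 exactly as in the loop above.
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction='batchmean',
    ) * (T * T)
    # Hard term: standard cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(student_logits, target)
    return alpha * soft + (1.0 - alpha) * hard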

Comparing the teacher's and the student's performance

# Evaluate both the teacher and the student on the test set

test(teacher, test_loader)
test(student, test_loader)
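
To make the size gap concrete, a quick parameter count over the two models defined above (count_params is a small helper introduced here, not part of the original code):

def count_params(model):
    return sum(p.numel() for p in model.parameters())

print(f'Teacher parameters: {count_params(teacher):,}')
print(f'Student parameters: {count_params(student):,}')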