Knowledge Distillation
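Knowledge distillation (Hinton et al., 2015) trains a small "student" network to imitate the softened output distribution of a larger "teacher". With logits z_i and temperature T, the softened probabilities and the pure soft-target objective used in the script below are:

q_i = \frac{\exp(z_i / T)}{\sum_j \exp(z_j / T)}, \qquad
\mathcal{L}_{\mathrm{KD}} = T^2 \,\mathrm{KL}\!\left(q^{\text{teacher}} \,\|\, q^{\text{student}}\right)

A temperature above 1 spreads probability mass onto the wrong-but-plausible classes, which is the extra signal the student learns from; the T^2 factor compensates for the 1/T^2 that the temperature introduces into the gradient magnitudes.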
Code
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Use the GPU if CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define the larger teacher model
class TeacherModel(nn.Module):
    def __init__(self):
        super(TeacherModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
        # and distillation applies its own temperature-scaled softmax
        return self.fc2(x)
# Define the smaller student model
class StudentModel(nn.Module):
    def __init__(self):
        super(StudentModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 7 * 7, 64)
        self.fc2 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        # Return raw logits here as well
        return self.fc2(x)
# Normalization transform (mean and std of the MNIST training set)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])
# Load the MNIST dataset
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
# Create the data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
# Instantiate the models and move them to the device
teacher = TeacherModel().to(device)
student = StudentModel().to(device)
# Standard supervised training loop
def train(model, criterion, optimizer, train_loader, num_epochs=5):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {running_loss / 100:.4f}')
                running_loss = 0.0
# Evaluate classification accuracy on the test set
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)
    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.4f}')
teacher_optimizer = optim.Adam(teacher.parameters(), lr=0.001)
student_optimizer = optim.Adam(student.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
# Train and evaluate the teacher with ordinary supervised learning
train(teacher, criterion, teacher_optimizer, train_loader, num_epochs=5)
test(teacher, test_loader)
criterion = nn.KLDivLoss(reduction='batchmean')  # KL-divergence loss for distillation
temperature = 2.0  # temperature used to soften the output distributions
# Knowledge-distillation training loop
def knowledge_distillation_train(teacher_model, student_model, criterion, optimizer, train_loader, num_epochs=5):
    teacher_model.eval()   # the teacher is frozen in evaluation mode
    student_model.train()  # the student is being trained
    for epoch in range(num_epochs):
        running_loss = 0.0
        for data, _ in train_loader:  # hard labels are unused by the pure soft-target loss
            data = data.to(device)
            optimizer.zero_grad()
            with torch.no_grad():
                teacher_logits = teacher_model(data)  # no gradients through the teacher
            student_logits = student_model(data)
            # KLDivLoss expects log-probabilities as input and probabilities as target,
            # both softened by the temperature; T**2 rescales the gradients back to
            # their usual magnitude (Hinton et al., 2015)
            student_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
            teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
            loss = criterion(student_log_probs, teacher_probs) * temperature ** 2
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Epoch {epoch + 1}, Loss: {running_loss / len(train_loader):.4f}')
# Distill the teacher into the student
knowledge_distillation_train(teacher, student, criterion, student_optimizer, train_loader, num_epochs=50)
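In practice the soft distillation loss is usually blended with the ordinary hard-label cross-entropy, which tends to improve and stabilize the student. A minimal sketch of that variant, reusing the models, device, and loader defined above; the weighting factor alpha is a hypothetical hyperparameter, not part of the original script:

def distill_with_hard_labels(teacher_model, student_model, optimizer, train_loader,
                             num_epochs=5, T=2.0, alpha=0.5):
    # alpha blends the soft (teacher) and hard (label) losses -- assumed value, tune as needed
    teacher_model.eval()
    student_model.train()
    kl = nn.KLDivLoss(reduction='batchmean')
    ce = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            with torch.no_grad():
                teacher_logits = teacher_model(data)
            student_logits = student_model(data)
            soft_loss = kl(F.log_softmax(student_logits / T, dim=-1),
                           F.softmax(teacher_logits / T, dim=-1)) * T ** 2
            hard_loss = ce(student_logits, target)  # supervised signal from the true labels
            loss = alpha * soft_loss + (1 - alpha) * hard_loss
            loss.backward()
            optimizer.step()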
Comparing the teacher and the student
# Evaluate both models on the test set
test(teacher, test_loader)
test(student, test_loader)
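Accuracy alone understates the point of distillation: the student is worthwhile because it approaches the teacher's accuracy with far fewer parameters. A small helper (count_parameters is an assumed name, not part of the original script) makes the size comparison explicit:

def count_parameters(model):
    # Total parameter count of the model
    return sum(p.numel() for p in model.parameters())

print(f'Teacher parameters: {count_parameters(teacher):,}')
print(f'Student parameters: {count_parameters(student):,}')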