目录
一.实验数据集设置
数据集介绍:玩转Kaggle:Dog Breed Identification【识别狗的类别】
https://blog.csdn.net/jerry_liufeng/article/details/120066058
设置这120类图片,百分之70的图片作为训练集,百分之30作为验证集。
二.训练前的准备
训练好教师模型,也可以训练一些学生模型与知识蒸馏训练的学生模型对比。
教师网络resnet101的200轮最优结果和学生网络resnet18的200轮最优结果如下
三.知识蒸馏训练方式
3.1 加载教师模型的最优参数,加载学生模型的最优参数,使用蒸馏方法训练
该方法的目的是:在学生模型的准确率已经无法继续提升时,借助教师模型进一步提升学生模型的准确率。
学生模型如resnet18,在该数据集最高准确率是0.79,在第83个epoch。无论训练多少轮,准确率都无法提升了。
此时使用知识蒸馏方法,加载教师模型的最优参数0.89,也加载学生模型的最优参数,继续训练。
3.2 加载教师模型的最优参数,学生模型用原始模型,然后使用蒸馏方法训练
3.3 结果对比
两种方法都比直接训练学生网络提升了1点左右。如果再调调温度系数和alpha,应该还可以提高。
3.4 知识蒸馏的调参经验
loss = alpha * hard_loss + (1-alpha) * distill_loss
尽量让训练时 alpha * hard_loss 和 (1-alpha) * distill_loss 的结果在同一个量级。
当教师模型的准确率高出学生模型很多,则可以让蒸馏loss比hard_loss高些。
方法:提高温度系数,提高 1-alpha 的值,让 hard_loss 占比少一些。
四.代码
train_teacher_model.py
import torch
from torchinfo import summary  # used to print a model parameter summary
import models
import utils
import train_tools
from datetime import datetime
from d2l import torch as d2l

# ----------------------------- hyper-parameters -----------------------------
dataset_name = "imagenet_dog"
lr, wd = 1e-4, 1e-4
lr_period, lr_decay = 2, 0.9
resnet_type = 101  # depth of the ResNet backbone
model_name = f'resnet{resnet_type}'
is_pretrained = True
epochs = 200
valid_ratio = 0.3
# Must stay False after the first run: re-splitting on top of an existing
# split (whose files were not deleted) would pollute the previous split.
reassign_dataset = False
random_seed = 0  # manual seed for reproducibility
# If set to None, the ResNet only uses the pretrained parameters.
# Fixed: model_name already contains the "resnet" prefix — the old f-string
# produced "weights/resnetresnet101_...".
student_best_weight_path = f'weights/{model_name}_150_best_acc_params.pth'
log_name = (
    "logs/"
    + model_name
    + "_"
    + dataset_name
    + "_"
    + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
)
# create the logger
logger = utils.Logger(log_name)
logger.write(f"lr={lr},epochs={epochs},manual_seed({random_seed})\n")
# Fixed: seed with the configured value (was hard-coded to 0).
torch.manual_seed(random_seed)
# use every available GPU (falls back to CPU when none is present)
devices = d2l.try_all_gpus()
# let cuDNN pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True
# build the train / validation / test iterators
train_iter, train_valid_iter, valid_iter, test_iter = utils.load_dog_transform_data(
    valid_ratio=valid_ratio, reassign_dataset=reassign_dataset)
# teacher model (get_resnet already moves it to devices[0])
model = models.get_resnet(devices, resnet_type, is_pretrained)
# log the parameter summary
logger.write(str(summary(model)) + '\n')
train_tools.train(epochs, model, model_name, lr, train_iter, valid_iter, devices, logger)
train_student_model.py
import torch
from torchinfo import summary  # used to print a model parameter summary
import models
import utils
import train_tools
from datetime import datetime
from d2l import torch as d2l

# ----------------------------- hyper-parameters -----------------------------
dataset_name = "imagenet_dog"
lr, wd = 1e-4, 1e-4
lr_period, lr_decay = 2, 0.9
resnet_type = 18  # depth of the ResNet backbone
model_name = f'resnet{resnet_type}'
is_pretrained = True
epochs = 200
valid_ratio = 0.3
# Re-split the raw dataset? When False the previously generated split
# is loaded directly from disk.
reassign_dataset = False
random_seed = 0  # manual seed for reproducibility
# If set to None, the ResNet only uses the pretrained parameters.
# Fixed: model_name already contains the "resnet" prefix — the old f-string
# produced "weights/resnetresnet18_...".
student_best_weight_path = f'weights/{model_name}_150_best_acc_params.pth'
log_name = (
    "logs/"
    + model_name
    + "_"
    + dataset_name
    + "_"
    + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
)
# create the logger
logger = utils.Logger(log_name)
logger.write(f"lr={lr},epochs={epochs},manual_seed({random_seed})\n")
# Fixed: seed with the configured value (was hard-coded to 0).
torch.manual_seed(random_seed)
# use every available GPU (falls back to CPU when none is present)
devices = d2l.try_all_gpus()
# let cuDNN pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True
# build the train / validation / test iterators
train_iter, train_valid_iter, valid_iter, test_iter = utils.load_dog_transform_data(
    valid_ratio=valid_ratio, reassign_dataset=reassign_dataset)
# student model (get_resnet already moves it to devices[0])
model = models.get_resnet(devices, resnet_type, is_pretrained)
# log the parameter summary
logger.write(str(summary(model)) + '\n')
train_tools.train(epochs, model, model_name, lr, train_iter, valid_iter, devices, logger)
train_distill_student_model.py
import torch
import train_tools
import models
import utils
from torchinfo import summary  # used to print a model parameter summary
from datetime import datetime
from d2l import torch as d2l

# ----------------------------- hyper-parameters -----------------------------
dataset_name = "imagenet_dog"
is_pretrained = True
lr, wd = 1e-4, 1e-4
lr_period, lr_decay = 2, 0.9
student_resnet_type = 18  # depth of the student ResNet
teacher_resnet_type = 101  # depth of the teacher ResNet
num_epochs = 200
loss_name = 'dist'
# loss_name = 'kldiv'
valid_ratio = 0.3  # fraction of the training data held out for validation
# Re-split the raw dataset? When False the previously generated split
# is loaded directly from disk.
reassign_dataset = False
alpha = 0.7  # weight of the hard (cross-entropy) loss
temp = 7  # distillation temperature
random_seed = 0  # manual seed for reproducibility
model_name = f'distill_resnet_{teacher_resnet_type}_{student_resnet_type}'
teacher_best_weight_path = 'weights/resnet101_200_0.8915441036224365.pth'
# If set to None, the student ResNet only uses the pretrained parameters.
student_best_weight_path = 'weights/resnet18_200_0.7982536554336548.pth'
# student_best_weight_path = None
log_name = (
    "logs/"
    + model_name
    + "_"
    + dataset_name
    + "_"
    + loss_name
    + "_"
    + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
)
# create the logger
logger = utils.Logger(log_name)
logger.write(f"loss={loss_name}\t lr={lr}\t epochs={num_epochs}\t alpah={alpha}\t temp={temp}\t manual_seed({random_seed})\n")
# seed the RNG for reproducibility
torch.manual_seed(random_seed)
devices = d2l.try_all_gpus()
# let cuDNN pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True
# Fixed: pass the configured valid_ratio (was hard-coded to 0.3 here,
# silently ignoring the variable above).
train_iter, train_valid_iter, valid_iter, test_iter = utils.load_dog_transform_data(
    valid_ratio=valid_ratio, reassign_dataset=reassign_dataset)
# load the trained teacher model and freeze it in eval mode
teacher_model = models.get_resnet(devices, teacher_resnet_type, is_pretrained)
teacher_model = utils.load_best_weight(teacher_model, teacher_best_weight_path)
teacher_model.eval()
# student model (get_resnet already moves it to the device)
student_model = models.get_resnet(devices, student_resnet_type, is_pretrained)
if student_best_weight_path is not None:
    student_model = utils.load_best_weight(student_model, student_best_weight_path)
student_model.train()
# log the parameter summary
logger.write(str(summary(student_model)) + '\n')
# start distillation training
train_tools.distill_train(num_epochs, teacher_model, student_model, model_name, train_iter, valid_iter, alpha, lr, temp, devices, logger, loss_name)
utils.py
划分数据集并加载数据集的代码解读
https://blog.csdn.net/qq_42864343/article/details/134767971?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22134767971%22%2C%22source%22%3A%22qq_42864343%22%7D
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from d2l import torch as d2l
import os
import torch
# Split (optionally) and load the dog-breed dataset.
def load_dog_transform_data(valid_ratio=0.3, reassign_dataset=False, batch_size=128):
    """Build DataLoaders for the Kaggle dog-breed dataset.

    Args:
        valid_ratio: fraction of each class moved into the validation split.
        reassign_dataset: re-split the raw data on disk. Must stay False after
            the first run — re-splitting on top of an existing split (whose
            files were not deleted) would pollute the previous split.
        batch_size: mini-batch size for every DataLoader (generalized from the
            previously hard-coded 128; default keeps the old behavior).

    Returns:
        (train_iter, train_valid_iter, valid_iter, test_iter) DataLoaders.
    """
    data_dir = os.path.join('data')
    if reassign_dataset:
        # ------------------------- download the data -------------------------
        d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + 'kaggle_dog_tiny.zip',
                                    '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d')
        # Set demo=True to use the small demo subset; False uses the full
        # Kaggle competition dataset.
        demo = False
        if demo:
            data_dir = d2l.download_extract('dog_tiny')
        else:
            data_dir = os.path.join('data')
        # ---- read the training labels, split off validation, organize files ----
        batch_size = 32 if demo else batch_size
        labels = d2l.read_csv_labels(os.path.join(data_dir, 'labels.csv'))
        # move valid_ratio of each class's samples into the validation folder
        d2l.reorg_train_valid(data_dir, labels, valid_ratio)
        d2l.reorg_test(data_dir)
    # --------------------------- data augmentation ---------------------------
    transform_train = torchvision.transforms.Compose([
        # Random crop covering 8%-100% of the original area with aspect ratio
        # in [3/4, 4/3], rescaled to 224x224.
        torchvision.transforms.RandomResizedCrop(224, scale=(0.08, 1.0),
                                                 ratio=(3.0 / 4.0, 4.0 / 3.0)),
        torchvision.transforms.RandomHorizontalFlip(),
        # randomly jitter brightness, contrast and saturation
        torchvision.transforms.ColorJitter(brightness=0.4,
                                           contrast=0.4,
                                           saturation=0.4),
        torchvision.transforms.ToTensor(),
        # normalize each channel with ImageNet statistics
        torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])])
    # Evaluation uses only deterministic preprocessing.
    transform_test = torchvision.transforms.Compose([
        torchvision.transforms.Resize(256),
        # crop the central 224x224 patch
        torchvision.transforms.CenterCrop(224),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225])])
    # ------------------------------ datasets ---------------------------------
    train_ds, train_valid_ds = [torchvision.datasets.ImageFolder(
        os.path.join(data_dir, 'train_valid_test', folder),
        transform=transform_train) for folder in ['train', 'train_valid']]
    valid_ds, test_ds = [torchvision.datasets.ImageFolder(
        os.path.join(data_dir, 'train_valid_test', folder),
        transform=transform_test) for folder in ['valid', 'test']]
    train_iter, train_valid_iter = [torch.utils.data.DataLoader(
        dataset, batch_size, shuffle=True, drop_last=True)
        for dataset in (train_ds, train_valid_ds)]
    valid_iter = torch.utils.data.DataLoader(valid_ds, batch_size, shuffle=False,
                                             drop_last=True)
    test_iter = torch.utils.data.DataLoader(test_ds, batch_size, shuffle=False,
                                            drop_last=False)
    return train_iter, train_valid_iter, valid_iter, test_iter
# Checkpoints saved from a nn.DataParallel-wrapped model prefix every
# parameter key with "module."; strip that prefix before loading.
def load_best_weight(model, best_weight_path):
    """Load a checkpoint into ``model``, handling DataParallel key prefixes.

    Args:
        model: the module whose parameters will be overwritten.
        best_weight_path: path of a state-dict checkpoint file.

    Returns:
        The same ``model`` instance with the loaded weights.
    """
    # map_location='cpu' lets GPU-trained checkpoints load on CPU-only hosts;
    # load_state_dict copies values onto the model's own device afterwards.
    raw_state = torch.load(best_weight_path, map_location='cpu')
    cleaned_state = {}
    for key, value in raw_state.items():
        # Fixed: only strip "module." when it is actually present — the old
        # code blindly dropped the first 7 characters of every key, which
        # corrupted checkpoints saved from a non-DataParallel model.
        new_key = key[len('module.'):] if key.startswith('module.') else key
        cleaned_state[new_key] = value
    model.load_state_dict(cleaned_state)
    return model
loss函数代码
创建一个python包为losses
代码实现:
https://blog.csdn.net/qq_42864343/article/details/134768003?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22134768003%22%2C%22source%22%3A%22qq_42864343%22%7D
models.py
import torch
from torch import nn
import torchvision
# resnet_type=34时,返回resnet34。
# resnet_type=18时,返回resnet18。
def get_resnet(devices, resnet_type, is_pretrained):
    """Build a fine-tuning network: a frozen torchvision ResNet backbone
    followed by a fresh two-layer head with 120 output classes.

    Args:
        devices: list of torch devices; the model is moved to devices[0].
        resnet_type: ResNet depth — one of 18, 34, 50, 101.
        is_pretrained: load ImageNet-pretrained backbone weights when True.

    Returns:
        nn.Sequential with ``.features`` (frozen backbone) and ``.output_new``
        (trainable classification head).

    Raises:
        ValueError: if resnet_type is not a supported depth (the old code
            silently left ``features`` undefined, crashing later with a
            confusing AttributeError).
    """
    backbones = {
        18: torchvision.models.resnet18,
        34: torchvision.models.resnet34,
        50: torchvision.models.resnet50,
        101: torchvision.models.resnet101,
    }
    if resnet_type not in backbones:
        raise ValueError(f"unsupported resnet_type: {resnet_type}")
    finetune_net = nn.Sequential()
    finetune_net.features = backbones[resnet_type](pretrained=is_pretrained)
    # New output head: the backbone ends in a 1000-way ImageNet classifier,
    # so map 1000 -> 256 -> 120 dog-breed classes.
    finetune_net.output_new = nn.Sequential(nn.Linear(1000, 256),
                                            nn.ReLU(),
                                            nn.Linear(256, 120))
    # move the parameters to the primary compute device (CPU or GPU)
    finetune_net = finetune_net.to(devices[0])
    # freeze the backbone — only the new head is trained
    for param in finetune_net.features.parameters():
        param.requires_grad = False
    return finetune_net
train_tools.py
from torch import nn
import time
import torch
import tqdm
from losses.kd import KLDiv
from losses.dist import DIST
def train(epochs, model, model_name, lr, train_dataloader, test_dataloader, devices, logger):
    """Plain supervised training loop with best-accuracy checkpointing.

    Args:
        epochs: number of training epochs.
        model: network to train (wrapped in nn.DataParallel internally).
        model_name: tag used in the checkpoint filename.
        lr: Adam learning rate.
        train_dataloader / test_dataloader: training / validation iterators.
        devices: list of devices; batches are fed to devices[0].
        logger: project logger exposing a ``.write(str)`` method.

    Side effects:
        Saves the best weights to ``weights/{model_name}_{epochs}_{best_acc}.pth``.
    """
    # ------------------------------ start timer ------------------------------
    start_time = time.time()
    model = nn.DataParallel(model, device_ids=devices).to(devices[0])
    best_acc, best_epoch = 0, 0
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Snapshot of the best weights so far. Fixed: state_dict() returns
    # references to the live parameter tensors, so the values must be cloned —
    # otherwise the "best" snapshot silently keeps tracking the latest weights
    # and the final checkpoint holds the last epoch, not the best one.
    best_state_dict = {k: v.detach().clone() for k, v in model.state_dict().items()}
    for epoch in range(epochs):
        model.train()
        # optimize the weights on the training set
        for data, targets in tqdm.tqdm(train_dataloader):
            data = data.to(devices[0])
            targets = targets.to(devices[0])
            preds = model(data)
            loss = criterion(preds, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # evaluate on the validation set
        model.eval()
        num_correct = 0
        num_samples = 0
        with torch.no_grad():
            for x, y in tqdm.tqdm(test_dataloader):
                x = x.to(devices[0])
                y = y.to(devices[0])
                preds = model(x)
                predictions = preds.max(1).indices  # class index of the row-wise max
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch + 1
            # keep a cloned snapshot of the current (best) parameters
            best_state_dict = {k: v.detach().clone() for k, v in model.state_dict().items()}
        model.train()
        logger.write('Epoch:{}\t valid_acc:{:.4f} \t train_loss={}\n'.format(epoch + 1, acc, loss))
        if (epoch + 1) % 10 == 0:
            logger.write(f"------------------------当前最优准确率为:{best_acc}\t所在的epoch为:{best_epoch}--------------------\n")
    # Fixed: the message now reports the path actually written below (the old
    # message named a "..._best_valid_acc_params.pth" file that never existed).
    logger.write(f'最优准确率为{best_acc}\t所在的epoch为:{best_epoch}\t最优参数已经保存到:weights/{model_name}_{epochs}_{best_acc}.pth' + '\n')
    # save the best snapshot once training finishes
    torch.save(best_state_dict, f"weights/{model_name}_{epochs}_{best_acc}.pth")
    # Fixed: removed the stray "f'" that was embedded inside the old message.
    print(f"训练结束,当前最优参数已经保存:weights/{model_name}_{epochs}_{best_acc}.pth")
    # ------------------------------- stop timer ------------------------------
    end_time = time.time()
    run_time = end_time - start_time
    # report seconds under a minute, minutes otherwise (2 decimal places)
    if int(run_time) < 60:
        logger.write(f'训练用时为:{round(run_time, 2)}s' + '\n')
    else:
        logger.write(f'训练用时为:{round(run_time / 60, 2)}minutes' + '\n')
def distill_train(epochs, teacher_model, student_model, model_name, train_dataloader, test_dataloader, alpha, lr, temp, devices, logger, loss_name):
    """Knowledge-distillation training loop.

    Trains the student on a weighted sum of a hard cross-entropy loss and a
    distillation loss against the frozen teacher's predictions:
        loss = alpha * hard_loss + (1 - alpha) * distill_loss

    Args:
        epochs: number of training epochs.
        teacher_model: trained teacher network (inference only).
        student_model: student network being trained.
        model_name: tag identifying this run (used by callers for filenames).
        train_dataloader / test_dataloader: training / validation iterators.
        alpha: weight of the hard loss; (1 - alpha) weighs the distill loss.
        lr: Adam learning rate.
        temp: distillation temperature.
        devices: list of devices; batches are fed to devices[0].
        logger: project logger exposing a ``.write(str)`` method.
        loss_name: 'kldiv' or 'dist' — selects the distillation criterion.

    Raises:
        ValueError: if loss_name is not recognized (the old code failed much
            later with a NameError inside the batch loop).
    """
    # ------------------------------ start timer ------------------------------
    start_time = time.time()
    # DataParallel wrapping is required so multi-GPU checkpoints load correctly
    teacher_model = nn.DataParallel(teacher_model, device_ids=devices).to(devices[0])
    student_model = nn.DataParallel(student_model, device_ids=devices).to(devices[0])
    # hard target loss (plain cross-entropy against the labels)
    hard_loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(student_model.parameters(), lr=lr)
    # Build the distillation criterion ONCE, hoisted out of the batch loop
    # (it was previously re-instantiated for every single batch).
    if loss_name == 'kldiv':
        # The classic KD loss scales by temp**2; KLDiv applies it internally.
        distill_criterion = KLDiv(temp=temp)
    elif loss_name == 'dist':
        distill_criterion = DIST(temp=temp)
    else:
        raise ValueError(f"unknown loss_name: {loss_name}")
    best_acc, best_epoch = 0, 0
    for epoch in range(epochs):
        student_model.train()
        # optimize the student on the training set
        for data, targets in tqdm.tqdm(train_dataloader):
            data = data.to(devices[0])
            targets = targets.to(devices[0])
            # teacher predictions are soft targets only — no gradients needed
            with torch.no_grad():
                teacher_preds = teacher_model(data)
            student_preds = student_model(data)
            # weighted hard loss
            student_hard_loss = alpha * hard_loss(student_preds, targets)
            # weighted distillation loss
            distill_loss = (1 - alpha) * distill_criterion.forward(student_preds, teacher_preds)
            loss = student_hard_loss + distill_loss
            # backward pass and weight update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # evaluate the student on the validation set
        student_model.eval()
        num_correct = 0
        num_samples = 0
        with torch.no_grad():
            for x, y in tqdm.tqdm(test_dataloader):
                x = x.to(devices[0])
                y = y.to(devices[0])
                preds = student_model(x)
                predictions = preds.max(1).indices  # class index of the row-wise max
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
        if acc > best_acc:
            best_acc = acc
            best_epoch = epoch + 1
        student_model.train()
        logger.write('Epoch:{}\t valid_acc:{:.4f}\t student_hard_loss={}\t ditillation_loss={}\t loss={}\n'.format(epoch + 1, acc, student_hard_loss, distill_loss, loss))
        if (epoch + 1) % 10 == 0:
            logger.write(f"------------------------当前最优准确率为:{best_acc},所在的epoch为:{best_epoch}--------------------\n")
    logger.write(f'最优准确率为{best_acc}\t所在的epoch为:{best_epoch}\n')
    # ------------------------------- stop timer ------------------------------
    end_time = time.time()
    run_time = end_time - start_time
    # report seconds under a minute, minutes otherwise (2 decimal places)
    if int(run_time) < 60:
        logger.write(f'训练用时为:{round(run_time, 2)}s' + '\n')
    else:
        logger.write(f'训练用时为:{round(run_time / 60, 2)}minutes' + '\n')