小白入门,看完CS231N做完了assignment1,把学习到的东西用到cifar10的残差网络里试一下~
这一篇进行的是学习率的调节
源代码的train部分如下:
import argparse
import os
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import resnet
model_names = sorted(name for name in resnet.__dict__
if name.islower() and not name.startswith("__")
and name.startswith("resnet")
and callable(resnet.__dict__[name]))
print(model_names)
parser = argparse.ArgumentParser(description='Propert ResNets for CIFAR10 in pytorch')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet32',
choices=model_names,
help='model architecture: ' + ' | '.join(model_names) +
' (default: resnet32)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
metavar='N', help='mini-batch size (default: 128)')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=50, type=int,
metavar='N', help='print frequency (default: 50)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
help='use pre-trained model')
parser.add_argument('--half', dest='half', action='store_true',
help='use half-precision(16-bit) ')
parser.add_argument('--save-dir', dest='save_dir',
help='The directory used to save the trained models',
default='save_temp', type=str)
parser.add_argument('--save-every', dest='save_every',
help='Saves checkpoints at every specified number of epochs',
type=int, default=10)
best_prec1 = 0
def main():
global args, best_prec1
args = parser.parse_args()
# Check the save_dir exists or not
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
model.cuda()
# optionally resume from a checkpoint
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume)
args.start_epoch = checkpoint['epoch']
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.evaluate, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(args.resume))
cudnn.benchmark = True
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root='./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=args.batch_size, shuffle=True,
num_workers=args.workers, pin_memory=True)
val_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False,
num_workers=args.workers, pin_memory=True)
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()
if args.half:
model.half()
criterion.half()
optimizer = torch.optim.SGD(model.parameters(), args.lr,
momentum=args.momentum,
weight_decay=args.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
milestones=[100, 150], last_epoch=args.start_epoch - 1)
if args.arch in ['resnet1202', 'resnet110']:
# for resnet1202 original paper uses lr=0.01 for first 400 minibatches for warm-up
# then switch back. In this setup it will correspond for first epoch.
for param_group in optimizer.param_groups:
param_group['lr'] = args.lr*0.1
if args.evaluate:
validate(val_loader, model, criterion)
return
for epoch in range(args.start_epoch, args.epochs):
# train for one epoch
print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
train(train_loader, model, criterion, optimizer, epoch)
lr_scheduler.step()
# evaluate on validation set
prec1 = validate(val_loader, model, criterion)
# remember best prec@1 and save checkpoint
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
if epoch > 0 and epoch % args.save_every == 0:
save_checkpoint({
'epoch': epoch + 1,
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
}, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
save_checkpoint({
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
}, is_best, filename=os.path.join(args.save_dir, 'model.th'))
def train(train_loader, model, criterion, optimizer, epoch):
"""
Run one train epoch
"""
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
target = target.cuda()
input_var = input.cuda()
target_var = target
if args.half:
input_var = input_var.half()
# compute output
output = model(input_var)
loss = criterion(output, target_var)
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
output = output.float()
loss = loss.float()
# measure accuracy and record loss
prec1 = accuracy(output.data, target)[0]
losses.update(loss.item(), input.size(0))
top1.update(prec1.item(), input.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
epoch, i, len(train_loader), batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1))
def validate(val_loader, model, criterion):
"""
Run evaluation
"""
batch_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
# switch to evaluate mode
model.eval()
end = time.time()
with torch.no_grad():
for i, (input, target) in enumerate(val_loader):
target = target.cuda()
input_var = input.cuda()
target_var = target.cuda()
if args.half:
input_var = input_var.half()
# compute output
output = model(input_var)
loss = criterion(output, target_var)
output = output.float()
loss = loss.float()
# measure accuracy and record loss
prec1 = accuracy(output.data, target)[0]
losses.update(loss.item(), input.size(0))
top1.update(prec1.item(), input.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
print('Test: [{0}/{1}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
i, len(val_loader), batch_time=batch_time, loss=losses,
top1=top1))
print(' * Prec@1 {top1.avg:.3f}'
.format(top1=top1))
return top1.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
"""
Save the training model
"""
torch.save(state, filename)
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
if __name__ == '__main__':
main()
我把SGD改成了Adam,把结果用tensorboard可视化出来,我的目的是找到train的loss在哪个epoch开始收敛,找到以后我再设定从那个批次开始降低lr,查看效果。
import argparse
import os
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from matplotlib import pyplot as plt
from tensorboardX import SummaryWriter
# 创建SummaryWriter对象
writer = SummaryWriter(log_dir='log_dir_2')
import resnet
from tensorboardX import SummaryWriter
model_names = sorted(name for name in resnet.__dict__
if name.islower() and not name.startswith("__")
and name.startswith("resnet")
and callable(resnet.__dict__[name]))
print(model_names)
parser = argparse.ArgumentParser(description='Propert ResNets for CIFAR10 in pytorch')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet32',
choices=model_names,
help='model architecture: ' + ' | '.join(model_names) +
' (default: resnet32)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
help='number of data loading workers (default: 4)')
parser.add_argument('--epochs', default=200, type=int, metavar='N',
help='number of total epochs to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=128, type=int,
metavar='N', help='mini-batch size (default: 128)')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M',
help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=50, type=int,
metavar='N', help='print frequency (default: 50)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true',
help='use pre-trained model')
parser.add_argument('--half', dest='half', action='store_true',
help='use half-precision(16-bit) ')
parser.add_argument('--save-dir', dest='save_dir',
help='The directory used to save the trained models',
default='save_temp', type=str)
parser.add_argument('--save-every', dest='save_every',
help='Saves checkpoints at every specified number of epochs',
type=int, default=10)
best_prec1 = 0
best_train_prec1 = 0
def main():
global args, best_prec1,best_train_prec1
args = parser.parse_args()
args.epochs = 50
# Check the save_dir exists or not
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
model = torch.nn.DataParallel(resnet.__dict__['resnet20']())
model.cuda()
# optionally resume from a checkpoint
if args.resume:
if os.path.isfile(args.resume):
print("=> loading checkpoint '{}'".format(args.resume))
checkpoint = torch.load(args.resume)
args.start_epoch = checkpoint['epoch']
best_prec1 = checkpoint['best_prec1']
model.load_state_dict(checkpoint['state_dict'])
print("=> loaded checkpoint '{}' (epoch {})"
.format(args.evaluate, checkpoint['epoch']))
else:
print("=> no checkpoint found at '{}'".format(args.resume))
cudnn.benchmark = True
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root='./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=args.batch_size, shuffle=True,
num_workers=args.workers, pin_memory=True)
val_loader = torch.utils.data.DataLoader(
datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False,
num_workers=args.workers, pin_memory=True)
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()
if args.half:
model.half()
criterion.half()
# learning_rates = np.logspace(-10, 10, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001,betas=(0.9,0.999),eps = 1e-8,weight_decay=0.01)
# lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
# milestones=[5,10,15],gamma=0.9,last_epoch=args.start_epoch - 1)
if args.arch in ['resnet1202', 'resnet110']:
# for resnet1202 original paper uses lr=0.01 for first 400 minibatches for warm-up
# then switch back. In this setup it will correspond for first epoch.
for param_group in optimizer.param_groups:
param_group['lr'] = args.lr*0.1
if args.evaluate:
validate(val_loader, model, criterion)
return
results1 = {}
results2 = {}
train_acc_history = []
train_loss_history =[]
val_acc_history = []
total_train_step = 0
total_test_step = 0
for epoch in range(args.start_epoch, args.epochs):
# train for one epoch
print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))
train_prec1,train_loss = train(train_loader, model, criterion, optimizer, epoch)
# lr_scheduler.step()
# train_acc_history.append(train_prec1)
# train_loss_history.append(train_loss)
total_train_step = total_train_step + 1
writer.add_scalar("train_loss", train_loss, total_train_step)
writer.add_scalar("train_acc", train_prec1, total_train_step)
is_best_train = train_prec1 > best_train_prec1
best_train_prec1 = max(train_prec1, best_train_prec1)
results1[(optimizer.param_groups[0]['lr'])] = (best_train_prec1, train_loss)
# evaluate on validation set
prec1,loss1 = validate(val_loader, model, criterion)
writer.add_scalar("test_loss", loss1, total_test_step)
writer.add_scalar("test_accuracy", prec1, total_test_step)
total_test_step = total_test_step + 1
# train_acc_history.append(prec1)
# remember best prec@1 and save checkpoint
is_best = prec1 > best_prec1
best_prec1 = max(prec1, best_prec1)
results2[(optimizer.param_groups[0]['lr'])]=(best_prec1,loss1)
writer.close()
val_acc_history = []
if epoch > 0 and epoch % args.save_every == 0:
save_checkpoint({
'epoch': epoch + 1,
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
}, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
save_checkpoint({
'state_dict': model.state_dict(),
'best_prec1': best_prec1,
}, is_best, filename=os.path.join(args.save_dir, 'model.th'))
print(results1,results2)
def train(train_loader, model, criterion, optimizer, epoch):
"""
Run one train epoch
"""
batch_time = AverageMeter()
data_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
# switch to train mode
model.train()
end = time.time()
for i, (input, target) in enumerate(train_loader):
# measure data loading time
data_time.update(time.time() - end)
target = target.cuda()
input_var = input.cuda()
target_var = target
if args.half:
input_var = input_var.half()
# compute output
output = model(input_var)
loss = criterion(output, target_var)
# compute gradient and do SGD step
optimizer.zero_grad()
loss.backward()
optimizer.step()
output = output.float()
loss = loss.float()
# measure accuracy and record loss
prec1 = accuracy(output.data, target)[0]
losses.update(loss.item(), input.size(0))
top1.update(prec1.item(), input.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
print('Epoch: [{0}][{1}/{2}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
epoch, i, len(train_loader), batch_time=batch_time,
data_time=data_time, loss=losses, top1=top1))
return top1.avg,losses.avg
def validate(val_loader, model, criterion):
"""
Run evaluation
"""
batch_time = AverageMeter()
losses = AverageMeter()
top1 = AverageMeter()
# switch to evaluate mode
model.eval()
end = time.time()
with torch.no_grad():
for i, (input, target) in enumerate(val_loader):
target = target.cuda()
input_var = input.cuda()
target_var = target.cuda()
if args.half:
input_var = input_var.half()
# compute output
output = model(input_var)
loss = criterion(output, target_var)
output = output.float()
loss = loss.float()
# measure accuracy and record loss
prec1 = accuracy(output.data, target)[0]
losses.update(loss.item(), input.size(0))
top1.update(prec1.item(), input.size(0))
# measure elapsed time
batch_time.update(time.time() - end)
end = time.time()
if i % args.print_freq == 0:
print('Test: [{0}/{1}]\t'
'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
i, len(val_loader), batch_time=batch_time, loss=losses,
top1=top1))
print(' * Prec@1 {top1.avg:.3f}'
.format(top1=top1))
return top1.avg,losses.avg
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
"""
Save the training model
"""
torch.save(state, filename)
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def accuracy(output, target, topk=(1,)):
"""Computes the precision@k for the specified values of k"""
maxk = max(topk)
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.view(1, -1).expand_as(pred))
res = []
for k in topk:
correct_k = correct[:k].view(-1).float().sum(0)
res.append(correct_k.mul_(100.0 / batch_size))
return res
if __name__ == '__main__':
main()
lr=0.001 验证集的准确率可以达到80.16,训练集是80.986
结果没有原来的SGD效果好,0.009000000000000001验证集的准确率是 82.25