指定单卡很简单,后面的model放到device上,data也放上去就行
device = torch.device(f'cuda:{args.GPU_num}')
torch.cuda.set_device(device)
model 和 data 都要用 to(local_rank)。另外我没有做
train_sampler = DistributedSampler(train_dataset, rank=local_rank)
的处理,各个进程会各自遍历完整数据集,数据没有切分,负载会不均衡
注意: 一定要在args里添加--local_rank的参数!!!
# Code for "AMC: AutoML for Model Compression and Acceleration on Mobile Devices"
# Yihui He*, Ji Lin*, Zhijian Liu, Hanrui Wang, Li-Jia Li, Song Han
# {jilin, songhan}@mit.edu
import os
import time
import argparse
import shutil
import math
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from tensorboardX import SummaryWriter
from lib.utils import accuracy, AverageMeter, progress_bar, get_output_folder
from lib.data import get_dataset
from lib.net_measure import measure_model
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
import torch.multiprocessing as mp
def parse_args():
    """Parse the command-line options for the AMC fine-tune script."""
    ap = argparse.ArgumentParser(description='AMC fine-tune script')
    ap.add_argument('--model', default='mobilenet', type=str, help='name of the model to train')
    ap.add_argument('--dataset', default='tiny_imagenet', type=str, help='name of the dataset to train')
    ap.add_argument('--lr', default=0.1, type=float, help='learning rate')
    ap.add_argument('--n_gpu', default=4, type=int, help='number of GPUs to use')
    ap.add_argument('--batch_size', default=128, type=int, help='batch size')
    ap.add_argument('--n_worker', default=4, type=int, help='number of data loader worker')
    ap.add_argument('--lr_type', default='exp', type=str, help='lr scheduler (exp/cos/step3/fixed)')
    ap.add_argument('--n_epoch', default=150, type=int, help='number of epochs to train')
    ap.add_argument('--wd', default=4e-5, type=float, help='weight decay')
    ap.add_argument('--seed', default=None, type=int, help='random seed to set')
    ap.add_argument('--data_root', default=None, type=str, help='dataset path')
    # resume
    ap.add_argument('--ckpt_path', default=None, type=str, help='checkpoint path to resume from')
    # run eval
    ap.add_argument('--eval', action='store_true', help='Simply run eval')
    # --local_rank is injected by torch.distributed.launch; without declaring
    # it here argparse would reject the unknown option and crash.
    ap.add_argument('--local_rank', default=0, type=int, help='node rank for distributed training')
    return ap.parse_args()
def get_model():
    """Build the network selected by the global ``args.model``.

    Supports 'mobilenet' and 'mobilenet_0.5flops' (both 200-class
    variants); any other value raises NotImplementedError.
    """
    print('=> Building model..')
    if args.model not in ('mobilenet', 'mobilenet_0.5flops'):
        raise NotImplementedError
    from models.mobilenet import MobileNet
    kwargs = {'n_class': 200}
    if args.model == 'mobilenet_0.5flops':
        kwargs['profile'] = '0.5flops'
    return MobileNet(**kwargs)
def train(epoch, train_loader, net):
    """Run one training epoch of `net` over `train_loader`.

    Relies on the module-level globals set up in __main__:
    `criterion`, `optimizer`, `writer`, `use_cuda`, `local_rank`.
    """
    print('\nEpoch: %d' % epoch)
    net.train()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        if use_cuda:
            # Move the batch to this process's GPU (device index == local_rank).
            inputs, targets = inputs.to(local_rank), targets.to(local_rank)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # measure accuracy and record loss
        # NOTE(review): these meters are per-rank only; they are never
        # all-reduced, so each process logs its own local statistics.
        prec1, prec5 = accuracy(outputs.data, targets.data, local_rank=local_rank, topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(prec1.item(), inputs.size(0))
        top5.update(prec5.item(), inputs.size(0))

        # timing
        batch_time.update(time.time() - end)
        end = time.time()

        # Only rank 0 renders the progress bar.
        if local_rank == 0:
            progress_bar(batch_idx, len(train_loader), 'Loss: {:.3f} | Acc1: {:.3f}% | Acc5: {:.3f}%'
                         .format(losses.avg, top1.avg, top5.avg))

    # Synchronize all ranks at the end of the epoch.
    dist.barrier()

    # FIX: only rank 0 writes TensorBoard scalars. Previously every rank
    # called writer.add_scalar, so N processes emitted duplicate events.
    if local_rank == 0:
        writer.add_scalar('loss/train', losses.avg, epoch)
        writer.add_scalar('acc/train_top1', top1.avg, epoch)
        writer.add_scalar('acc/train_top5', top5.avg, epoch)
def test(epoch, test_loader, net, save=True):
    """Evaluate `net` on `test_loader` for one epoch.

    Relies on the module-level globals `criterion`, `use_cuda`,
    `local_rank`, `writer`, `optimizer`, `args`, `log_dir` and
    `best_acc`. When `save` is True, rank 0 logs TensorBoard scalars
    and writes a checkpoint (flagged best when top-1 improves).
    """
    global best_acc
    net.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    end = time.time()
    total_loss = 0

    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(test_loader):
            if use_cuda:
                inputs, targets = inputs.to(local_rank), targets.to(local_rank)
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            # measure accuracy and record loss (per-rank meters; not reduced)
            prec1, prec5 = accuracy(outputs.data, targets.data, local_rank=local_rank, topk=(1, 5))
            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1.item(), inputs.size(0))
            top5.update(prec5.item(), inputs.size(0))

            # timing
            batch_time.update(time.time() - end)
            end = time.time()

            # FIX: render the progress bar only on rank 0, matching train().
            if local_rank == 0:
                progress_bar(batch_idx, len(test_loader), 'Loss: {:.3f} | Acc1: {:.3f}% | Acc5: {:.3f}%'
                             .format(losses.avg, top1.avg, top5.avg))
            total_loss += loss.item()

    dist.barrier()
    # Sum the per-rank eval losses so every rank holds the same total.
    # NOTE(review): the reduced value is currently not used further.
    total_loss = torch.tensor(total_loss, requires_grad=False).to(local_rank)
    dist.all_reduce(total_loss)

    if save:
        # FIX: only rank 0 logs and checkpoints; previously every rank wrote
        # the same event files and the same ckpt.pth.tar concurrently.
        if local_rank == 0:
            writer.add_scalar('loss/test', losses.avg, epoch)
            writer.add_scalar('acc/test_top1', top1.avg, epoch)
            writer.add_scalar('acc/test_top5', top5.avg, epoch)

            is_best = False
            if top1.avg > best_acc:
                best_acc = top1.avg
                is_best = True

            print('Current best acc: {}'.format(best_acc))
            save_checkpoint({
                'epoch': epoch,
                'model': args.model,
                'dataset': args.dataset,
                # FIX: DistributedDataParallel is NOT a subclass of
                # nn.DataParallel, so the old isinstance(net, nn.DataParallel)
                # check saved the DDP wrapper's state_dict with 'module.'-
                # prefixed keys. Unwrap whenever a .module attribute exists.
                'state_dict': net.module.state_dict() if hasattr(net, 'module') else net.state_dict(),
                'acc': top1.avg,
                'optimizer': optimizer.state_dict(),
            }, is_best, checkpoint_dir=log_dir)
def adjust_learning_rate(optimizer, epoch):
    """Set the learning rate for `epoch` on `optimizer` and return it.

    Schedules (selected by the global ``args.lr_type``):
      'cos'   -- cosine annealing without warm-up over args.n_epoch epochs
      'exp'   -- multiply by 0.96 every epoch
      'fixed' -- constant args.lr
    Any other value raises NotImplementedError.
    """
    schedule = args.lr_type
    if schedule == 'cos':  # cos without warm-up
        new_lr = 0.5 * args.lr * (1 + math.cos(math.pi * epoch / args.n_epoch))
    elif schedule == 'exp':
        decay_every, decay_rate = 1, 0.96
        new_lr = args.lr * (decay_rate ** (epoch // decay_every))
    elif schedule == 'fixed':
        new_lr = args.lr
    else:
        raise NotImplementedError
    print('=> lr: {}'.format(new_lr))
    for group in optimizer.param_groups:
        group['lr'] = new_lr
    return new_lr
def save_checkpoint(state, is_best, checkpoint_dir='.'):
    """Serialize `state` to <checkpoint_dir>/ckpt.pth.tar.

    When `is_best` is True, the file is additionally copied to
    ckpt.best.pth.tar in the same directory.
    """
    target = os.path.join(checkpoint_dir, 'ckpt.pth.tar')
    print('=> Saving checkpoint to {}'.format(target))
    torch.save(state, target)
    if not is_best:
        return
    shutil.copyfile(target, target.replace('.pth.tar', '.best.pth.tar'))
if __name__ == '__main__':
    args = parse_args()

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.backends.cudnn.benchmark = True

    def cleanup():
        # Tear down the process group started below.
        dist.destroy_process_group()

    # Initialize the process group. With torch.distributed.launch the
    # MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE env vars are already set,
    # so only the backend needs to be given.
    dist.init_process_group(backend="nccl")
    # This process's rank; with one process per GPU it doubles as the
    # CUDA device index.
    local_rank = torch.distributed.get_rank()
    world_size = dist.get_world_size()
    torch.cuda.set_device(local_rank)

    best_acc = 0  # best test accuracy
    start_epoch = 0  # start from epoch 0 or last checkpoint epoch

    # FIX: torch.manual_seed(args.seed) used to run unconditionally, which
    # raises when --seed is left at its default of None. Seed only when set.
    if args.seed is not None:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    print('=> Preparing data..')
    # NOTE(review): no DistributedSampler is used here, so every rank
    # iterates the full dataset -- see the note at the top of this file.
    train_loader, val_loader, n_class = get_dataset(args.dataset, args.batch_size, args.n_worker,
                                                    data_root=args.data_root)

    net = get_model()  # for measure
    IMAGE_SIZE = 224 if args.dataset == 'tiny_imagenet' else 32
    n_flops, n_params = measure_model(net, IMAGE_SIZE, IMAGE_SIZE, local_rank=local_rank)
    print('=> Model Parameter: {:.3f} M, FLOPs: {:.3f}M'.format(n_params / 1e6, n_flops / 1e6))
    del net

    # Build the real training model on this rank's GPU.
    net = get_model().to(local_rank)
    if args.ckpt_path is not None:  # assigned checkpoint path to resume from
        print('=> Resuming from checkpoint..')
        checkpoint = torch.load(args.ckpt_path)
        sd = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint
        net.load_state_dict(sd)

    if use_cuda and args.n_gpu > 1:
        # Wrap for distributed data-parallel training on this rank's device.
        net = DDP(net, device_ids=[local_rank], output_device=local_rank)

    criterion = nn.CrossEntropyLoss()
    print('Using SGD...')
    print('weight decay  = {}'.format(args.wd))
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.wd)

    if args.eval:  # just run eval
        print('=> Start evaluation...')
        # FIX: the original call omitted the required `net` argument
        # (test(0, val_loader, save=False)) and would raise TypeError.
        test(0, val_loader, net=net, save=False)
        # FIX: the eval path never destroyed the process group.
        cleanup()
    else:  # train
        print('=> Start training...')
        print('Training {} on {}...'.format(args.model, args.dataset))
        log_dir = get_output_folder('./logs', '{}_{}_train'.format(args.model, args.dataset))
        print('=> Saving logs to {}'.format(log_dir))
        # tf writer
        writer = SummaryWriter(logdir=log_dir)

        for epoch in range(start_epoch, start_epoch + args.n_epoch):
            lr = adjust_learning_rate(optimizer, epoch)
            train(epoch, train_loader, net=net)
            test(epoch, val_loader, net=net)

        cleanup()
        writer.close()
        print('=> Model Parameter: {:.3f} M, FLOPs: {:.3f}M, best top-1 acc: {}%'.format(n_params / 1e6, n_flops / 1e6, best_acc))
train.sh为
运行的时候直接 `bash train.sh` 即可。`-W ignore` 用于忽略 Python 警告信息;后面的 `-m torch.distributed.launch --nproc_per_node=8 xxx.py` 表示用 launch 模块启动训练脚本,在本节点上起 8 个进程
#!/bin/bash
# Launch DDP fine-tuning with one process per GPU via torch.distributed.launch.
# -W ignore silences Python warnings.
# FIX: CUDA_VISIBLE_DEVICES was on its own line, so it did not apply to the
# python command below -- it must prefix the command (or be exported).
# FIX: removed the dangling trailing '\' after the last argument.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
python -W ignore \
    -m torch.distributed.launch --nproc_per_node=8 \
    /remote-home/xt/AMC/amc-master/amc_train.py \
    --model=mobilenet \
    --dataset=tiny_imagenet \
    --lr=0.05 \
    --n_gpu=4 \
    --batch_size=128 \
    --n_worker=32 \
    --lr_type=cos \
    --n_epoch=150 \
    --wd=4e-5 \
    --seed=2018 \
    --data_root=/dataset/tiny_imagenet
白哥的例程
import os
import pandas as pd
import torch
import argparse
import torch.distributed as dist
import torch.multiprocessing as mp
from sklearn.model_selection import train_test_split
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.optim as optim
import torch.nn as nn
import torch.distributed as dist
from datetime import timedelta
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import WEIGHTS_NAME, CONFIG_NAME
# ---- fine-tuning hyper-parameters ----
num_epochs = 20
# Transformer block index from which weights become trainable
# (see load_pretrained_model below).
UNFREEZE_START = 9
SEED = 3407
# Where the fine-tuned weights / config / vocab are written.
output_dir = 'gpt2_fintune'
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

# GPT-2 ships without a pad token; reuse EOS so batched padding works.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# TensorBoard writer. NOTE(review): created on every rank against the same
# directory; run_demo guards add_scalar calls with local_rank == 0.
writer = SummaryWriter(log_dir=output_dir)

# Join the NCCL process group at import time; torch.distributed.launch /
# torchrun supplies the env:// rendezvous variables.
torch.distributed.init_process_group(backend="nccl")
class ImageDataset(Dataset):
    """Dataset of text prompts loaded from 'metadata_withcmt.parquet'.

    Prompts with more than `max_length` whitespace-separated words are
    dropped during preprocessing. __getitem__ returns the raw prompt
    string; tokenization happens later in the DataLoader collate_fn.
    """

    def __init__(self, max_length=70, tokenizer_pre=None):
        frame = pd.read_parquet('metadata_withcmt.parquet')
        self.prompt_list = list(frame['prompt'])
        self.max_length = max_length
        # Kept for interface parity with callers; not used inside the class.
        self.tokenizer_pre = tokenizer_pre
        self.prompt_list_clean = self.pre_process()

    def pre_process(self):
        """Return the prompts whose word count does not exceed max_length."""
        return [p for p in self.prompt_list
                if len(p.split()) <= self.max_length]

    def __len__(self):
        return len(self.prompt_list_clean)

    def __getitem__(self, idx):
        return self.prompt_list_clean[idx]
def load_pretrained_model():
    """Load GPT-2 and freeze the early transformer weights.

    Walks the 'transformer' child's parameters in order; parameters are
    frozen until one whose name contains str(UNFREEZE_START) is seen,
    after which that parameter and all following ones stay trainable.
    NOTE(review): this is a plain substring match on the parameter name --
    confirm it only ever matches the intended block index.
    """
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    unfrozen = False
    for child_name, child in model.named_children():
        if child_name != 'transformer':
            continue
        for param_name, param in child.named_parameters():
            if str(UNFREEZE_START) in param_name:
                # Start unfreezing here; everything from now on is trainable.
                unfrozen = True
            param.requires_grad = unfrozen
    return model
def cleanup():
    """Leave the NCCL process group that was joined at import time."""
    dist.destroy_process_group()
def run_demo():
    """Per-process DDP entry point: fine-tune GPT-2 on the prompt dataset."""
    # Rank of this process; with one process per GPU it doubles as the
    # CUDA device index.
    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    # Seed every rank identically.
    torch.manual_seed(SEED)

    # Build the model on this rank's GPU and wrap it for DDP.
    model = load_pretrained_model().to(local_rank)
    ddp_model = DDP(model, device_ids=[local_rank], output_device=local_rank)

    dataset_all = ImageDataset(tokenizer_pre=tokenizer)
    train_dataset, test_dataset = train_test_split(dataset_all, test_size=0.2, random_state=42)

    # Optimize only the parameters left trainable by load_pretrained_model.
    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

    # Tokenize a list of prompt strings into fixed-length padded batches.
    collate = lambda x: tokenizer(x, padding='max_length', truncation=True, max_length=71,
                                  return_tensors="pt")

    # Distributed samplers shard the data so each rank sees its own slice.
    train_sampler = DistributedSampler(train_dataset, rank=local_rank)
    train_loader = DataLoader(train_dataset, batch_size=96, sampler=train_sampler, num_workers=32,
                              collate_fn=collate)
    test_sampler = DistributedSampler(test_dataset, rank=local_rank)
    testloader = DataLoader(test_dataset, batch_size=96, sampler=test_sampler, num_workers=32,
                            collate_fn=collate)

    optimizer.zero_grad()
    step = 0
    min_loss = 10000
    num_training_steps = num_epochs * len(train_loader)
    # Cosine schedule with 1% linear warm-up, stepped once per batch.
    lr_scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=(0.01 * num_training_steps),
                                                   num_training_steps=num_training_steps)
    if local_rank == 0:
        print("Total Steps:", num_training_steps)
    progress_bar = tqdm(range(num_training_steps), disable=not local_rank == 0)

    for epoch in range(num_epochs):
        # Reshuffle the shard differently every epoch.
        train_sampler.set_epoch(epoch)

        for data in train_loader:
            input_ids, attention_mask = data['input_ids'].to(local_rank), data['attention_mask'].to(local_rank)
            # Causal LM objective: labels are the input ids; loss is output[0].
            outputs = ddp_model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            step = step + 1
            progress_bar.set_postfix(loss=loss.item(), lr=lr_scheduler.get_last_lr()[0])
            progress_bar.update(1)
            if local_rank == 0:
                writer.add_scalar('train_loss', loss, step)
            lr_scheduler.step()
        dist.barrier()

        # ---- per-epoch evaluation ----
        if local_rank == 0:
            print(f'\nStart {epoch + 1} eval')
        ddp_model.eval()
        total_loss = 0
        with torch.no_grad():
            for data in tqdm(testloader, disable=not local_rank == 0):
                input_ids, attention_mask = data['input_ids'].to(local_rank), data['attention_mask'].to(
                    local_rank)
                outputs = ddp_model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
                loss = outputs[0]
                total_loss += loss.item()
        dist.barrier()
        # Sum per-rank losses so every rank computes the same average.
        total_loss = torch.tensor(total_loss, requires_grad=False).to(local_rank)
        dist.all_reduce(total_loss)
        avg_loss = total_loss.item() / len(testloader)
        ddp_model.train()

        if local_rank == 0:
            print(f"step {step}, avg_loss {avg_loss}")
            writer.add_scalar('eval_loss', avg_loss, epoch + 1)
            if avg_loss < min_loss:
                min_loss = avg_loss
                # Save the best model so far (unwrap DDP if wrapped).
                model_to_save = model.module if hasattr(model, 'module') else model
                torch.save(model_to_save.state_dict(), output_model_file)
                model_to_save.config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(output_dir)
        dist.barrier()
    cleanup()
if __name__ == "__main__":
    # Invoked once per process by torch.distributed.launch / torchrun.
    run_demo()