import os
import random
import time
from pathlib import Path
from typing import Optional, Dict, Any
import fire
import numpy as np
import torch
from torch.cuda import amp
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration, TrainingArguments, T5Config, Adafactor,AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup
from transformers.utils.model_parallel_utils import get_device_map
from datetime import datetime
from args import ARGS, CONFIGS
from data import LargeCorpusDatasetFromServerV2, DataCollatorForSeq2Seq, NUM_EXTRA_IDS, LargeCorpusDatasetFromServer, DataCollatorForT5MLM
import jsonlines
import json
import torch.distributed as dist
import socket
import logging
def get_logger(filename=None):
logger = logging.getLogger('logger')
logger.setLevel(logging.DEBUG)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
if filename is not None:
handler = logging.FileHandler(filename)
handler.setLevel(logging.DEBUG)
handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
logging.getLogger().addHandler(handler)
return logger
def read_json(path):
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
return data
def read_jsonl(path):
items = []
with open(path, 'r', encoding='utf-8') as f:
for item in jsonlines.Reader(f):
items.append(item)
return items
def train(model_size="base", tokenizer_path="/common-data/zhangjing/xiaoyingzuo/pko-t5-main/pkot5/pretrained/T5-base-korean-pko copy", job_name: str = "pkot5-pretraining", resume_checkpoint: Optional[int] = None, version: Optional[int] = 1):
    """Pre-train a pko-T5 model with a span-corruption (T5 MLM) objective.

    Reads contexts from ``context_1.jsonl``, tokenizes and packs them into
    fixed 564-token chunks, then trains ``T5ForConditionalGeneration`` with
    AdamW + cosine warmup schedule under mixed precision (``torch.cuda.amp``),
    checkpointing every ``args.save_steps`` steps.

    NOTE(review): SOURCE indentation was lost; block structure below is a
    reconstruction — verify against the original repository before relying
    on exact control flow.

    :param model_size: key into the project's ``ARGS``/``CONFIGS`` dicts
        (e.g. ``"base"``); lowercased before lookup.
    :param tokenizer_path: path passed to ``AutoTokenizer.from_pretrained``.
    :param job_name: unused in this body (kept for CLI compatibility).
    :param resume_checkpoint: if given, the step counter to resume from;
        also triggers loading a hard-coded pretrained ``pytorch_model.bin``.
    :param version: unused in this body (kept for CLI compatibility).
    """
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M')
    # Timestamped log file; name encodes batch size / LR for bookkeeping only.
    logger = get_logger(os.path.join('./log_dir', f'{current_time}_bs=128_inverselr=1e-4_log.txt'))
    # logger.info(args)
    logger.info("-------------start training------------------------")
    # LOCAL_RANK is set by torchrun/launch; -1 means non-distributed.
    local_rank = int(os.getenv("LOCAL_RANK", "-1"))
    model_size = model_size.lower()
    default_args = ARGS[model_size]
    # Number of pipeline-parallel stages per process ("base" uses 1).
    n_pp = default_args.pop("pipeline_parallelism", 1)
    args = TrainingArguments(
        output_dir=f'./models/pko-t5/inverselr=1e-4_{model_size}',
        local_rank=local_rank,
        **default_args
    )
    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, unk_token='<pad>', extra_ids=NUM_EXTRA_IDS)
    config = T5Config(**CONFIGS[model_size])
    # Dropout disabled for pretraining; restored to 0.1 when saving checkpoints.
    config.dropout_rate = 0.0
    ## added code here, xiaoyingzuo, 0713
    ## changed here 0721
    # -----------------load_dataset-------------------------------------
    all_data = []
    manual_iterms = read_jsonl('context_1.jsonl')
    logger.info("-------------finished read jsonl: context_1.jsonl----------------")
    input_texts = []
    # Flatten every context string out of each page record.
    # NOTE(review): assumes each record has a 'context_list' of dicts with a
    # 'context' key — confirm against the jsonl producer.
    for page_iterm in manual_iterms:
        for context_dict in page_iterm['context_list']:
            input_texts.append(f"{context_dict['context']}".strip())
            # print('input text append \n', input_texts)
    input_texts_ids = tokenizer(input_texts, add_special_tokens=False).input_ids
    # added here 230721 xiaoying
    # -------------preprocess data -------------------------------------
    # Pack token ids into fixed-length chunks of 564 tokens. Full chunks are
    # emitted directly; remainders are accumulated across texts in `last_pass`
    # (with `start` as the fill cursor) and emitted once a chunk fills up.
    start = 0
    last_pass = np.array([0] * 564)
    try_pass = np.array([0] * 564)
    for i in range(len(input_texts_ids)):
        # Number of complete 564-token chunks in this text.
        turn_tmp = int(len(input_texts_ids[i]) / 564)
        if turn_tmp > 0:
            for j in range(turn_tmp):
                try_pass = np.array(input_texts_ids[i])[j * 564 : (j + 1) * 564]
                all_data.append({'input_ids': try_pass.copy()})
        # NOTE(review): remainder handling assumed to be at loop-body level
        # (so texts shorter than 564 tokens are also packed) — confirm.
        if len(input_texts_ids[i]) > turn_tmp * 564:
            middle_tmp = len(input_texts_ids[i]) - turn_tmp * 564
            if middle_tmp + start < 564:
                # Remainder fits in the accumulator; append and advance cursor.
                last_pass[start:start + middle_tmp] = np.array(input_texts_ids[i])[turn_tmp * 564 :]
                start = start + middle_tmp
            else:
                # Accumulator overflows: top it up to 564, emit it, then start
                # a new accumulator with the leftover tail of this remainder.
                last_pass[start:] = np.array(input_texts_ids[i])[turn_tmp * 564: turn_tmp * 564 + 564 - start]
                all_data.append({'input_ids': last_pass.copy()})
                last_pass[: len(input_texts_ids[i]) - (turn_tmp * 564 +564-start)] = np.array(input_texts_ids[i])[turn_tmp *564 +564-start :]
                start = len(input_texts_ids[i]) - (turn_tmp *564 + 564 -start)
    # Debug: convert the encoded ids back to the original sentences.
    # for i in range(len(all_data)):
    #     decoded_input = tokenizer.decode(all_data[i]['input_ids'], skip_special_tokens=False)
    #     logger.info(f"----------------------decodeed input + 170 {i}------------------------------")
    #     logger.info(decoded_input)
    # Print the original sentence.
    # print(decoded_input)
    logger.info(f'-------------total tokenizer number {len(all_data)} finished preprocess-----------------------')
    if n_pp > 1:
        # Pipeline parallelism: each process owns n_pp consecutive GPUs and
        # spreads the encoder/decoder blocks across them.
        torch.cuda.set_device(local_rank * n_pp)
        model = T5ForConditionalGeneration(config)
        devices = list(range(local_rank * n_pp, (local_rank + 1) * n_pp))
        model.parallelize(get_device_map(len(model.encoder.block), devices))
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl')
        # dist.init_process_group(backend='nccl')
        # ddp_model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        torch.cuda.set_device(local_rank)
        # model = T5ForConditionalGeneration(config).cuda()
        # NOTE(review): the model is never moved to the GPU here (.cuda() is
        # commented out) and batches are never moved either — confirm where
        # device placement actually happens.
        model = T5ForConditionalGeneration(config)
        print("----dist init---")
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl')
        # dist.init_process_group(backend='nccl')
        # ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
    initial_lr = 1e-4
    total_steps = args.max_steps
    warmup_steps = 72
    # optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False)
    optimizer = AdamW(model.parameters(), lr=initial_lr)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    scaler = amp.GradScaler()
    logger.info(f'--initial_lr:{initial_lr}, total_steps{total_steps}, warmup_steps{warmup_steps}, optimizer{optimizer}------')
    # def inverse_sqrt_schedule(step):
    #     return 1.0 / (step ** 0.5)
    # scheduler = LambdaLR(optimizer, lr_lambda=inverse_sqrt_schedule)
    logger.info("----optimiter finished adafactor----------")
    if resume_checkpoint is not None:
        step = resume_checkpoint
        # ckpt_dir = Path(default_args.start_ckpt_dir)
        # NOTE(review): resume path is hard-coded and only model weights are
        # restored (optimizer state load is commented out).
        model.load_state_dict(torch.load( "/common-data/zhangjing/xiaoyingzuo/pko-t5-main/pkot5/pretrained/T5-base-korean-pko/pytorch_model.bin", map_location='cpu'))
        logger.info('--------------loaded the T5-base-korean-pko/pytorch_model.bin-----------')
        # optimizer.load_state_dict(torch.load(ckpt_dir / "optimizer.pt", map_location='cpu'))
    epoch = 0
    max_epoch = 10
    # max_steps = len(all_data)
    # max_steps = 80
    for epoch in range(max_epoch):
        # Collator applies the T5 MLM span corruption with the "fill: " prefix.
        train_loader = DataLoader(all_data, batch_size=args.per_device_train_batch_size, collate_fn=DataCollatorForT5MLM(tokenizer, prefix="fill: "))
        logger.info(f"Start pretraining of t5-{model_size} epoch-{epoch}")
        step = 0
        # `step` counts optimizer updates, not micro-batches.
        while step < args.max_steps:
            total_loss = 0
            dt = time.time()
            gradient_accumulation_step = 0
            optimizer.zero_grad()
            for data in train_loader:
                # print("data in training...", data)
                if step >= args.max_steps:
                    break
                print("--------------data convert to tensor--------------")
                # data = data.convert_to_tensors('pt').to(device='cuda')
                data = data.convert_to_tensors('pt')
                # Debug prints of per-example sequence lengths.
                for i in range(len(data['input_ids'])):
                    # print(f"------------------data_input_ids['input_ids'][{i}]--------", data['input_ids'][i])
                    print(f"------------------len of data_input_ids['input_ids'][{i}]--------", len(data['input_ids'][i]))
                for i in range(len(data['labels'])):
                    print(f"------------------len of data_input_ids['labels'][{i}]--------", len(data['labels'][i]))
                print("-------------- finished data convert to tensor--------------", type(data))
                with amp.autocast():
                    # loss = ddp_model(**data).loss
                    # print('----------**data: ', **data)
                    loss = model(**data).loss
                    # Scale down so accumulated gradients average correctly.
                    loss = loss / args.gradient_accumulation_steps
                # NOTE(review): retain_graph=True should not be needed here
                # and wastes memory — confirm why it was added.
                scaler.scale(loss).backward(retain_graph=True)
                gradient_accumulation_step += 1
                if gradient_accumulation_step == args.gradient_accumulation_steps:
                    # Unscale before clipping so the clip threshold applies to
                    # true gradient magnitudes.
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    # NOTE(review): scaler.step() already invokes
                    # optimizer.step(); this second call double-applies the
                    # update — likely a bug, confirm before changing.
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    current_lr = optimizer.param_groups[0]['lr']
                    logger.info(f"lr scheduler: {current_lr}, step={step}--------")
                    dt = time.time() - dt
                    step += 1
                    total_loss = loss.detach().item() + total_loss
                    logger.info(f"epoch={epoch} step={step} loss={total_loss:.4f} time={dt:.4f}s")
                    if step % args.save_steps == 0:
                        logger.info(f"epoch={epoch} step={step} loss={total_loss:.4f} time={dt:.4f}s")
                        ckpt_dir = Path(args.output_dir) / f"epoch{epoch}-checkpoint{step}"
                        ckpt_dir.mkdir(exist_ok=True, parents=True)
                        # Saved config re-enables dropout for fine-tuning.
                        config.dropout_rate = 0.1
                        # Only rank 0 writes checkpoints to avoid clobbering.
                        if local_rank == 0:
                            config.save_pretrained(ckpt_dir)
                            torch.save(model.state_dict(), ckpt_dir / "pytorch_model.bin")
                            torch.save(optimizer.state_dict(), ckpt_dir / "optimizer.pt")
                            logger.info(f"{ckpt_dir}/pytorch_model, and {ckpt_dir}/optimizer.pt have been saved!")
                    # Reset accumulation window.
                    gradient_accumulation_step = 0
                    total_loss = 0
                    dt = time.time()
                else:
                    # Mid-accumulation micro-batch: just track its loss.
                    total_loss = loss.detach().item() + total_loss
    logger.info(f"End of pretraining t5-{model_size}")
if __name__ == '__main__':
    # Expose `train` as a command-line interface via python-fire, e.g.:
    #   python pretrain.py --model_size=base --resume_checkpoint=1000
    fire.Fire(train)
# pretrain code
# (scraped blog footer — "latest recommended article published 2024-10-08 12:37:10";
#  kept as a comment so the file remains valid Python)