import os
import random
import time
from pathlib import Path
from typing import Optional, Dict, Any
import fire
import numpy as np
import torch
from torch.cuda import amp
from torch.utils.data import DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration, TrainingArguments, T5Config, Adafactor
from transformers.utils.model_parallel_utils import get_device_map
from datasets import load_dataset
from pkot5.args import ARGS, CONFIGS
from pkot5.data import LargeCorpusDatasetFromServerV2, DataCollatorForSeq2Seq, NUM_EXTRA_IDS, \
LargeCorpusDatasetFromServer, DataCollatorForT5MLM
import jsonlines
def read_jsonl(path):
    """Read a jsonlines file into a Python list.

    Args:
        path: path to the jsonlines file
    Returns:
        list of records read from the jsonlines file
    """
items = []
with open(path, 'r', encoding='utf-8') as f:
for item in jsonlines.Reader(f):
items.append(item)
return items
def train(model_size: str = "base", tokenizer_path: str = "paust/pko-t5-base", job_name: str = "pkot5-pretraining",
resume_checkpoint: Optional[int] = None, version: Optional[str] = "1"):
local_rank = int(os.getenv("LOCAL_RANK", "-1"))
model_size = model_size.lower()
default_args = ARGS[model_size]
n_pp = default_args.pop("pipeline_parallelism", 1)
args = TrainingArguments(
output_dir=f'./models/pko-t5/{model_size}',
local_rank=local_rank,
**default_args
)
random.seed(args.seed)
torch.random.manual_seed(args.seed)
np.random.seed(args.seed)
tokenizer = T5TokenizerFast.from_pretrained(tokenizer_path, unk_token='<pad>', extra_ids=NUM_EXTRA_IDS)
config = T5Config(**CONFIGS[model_size])
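    # Dropout is turned off for pretraining (as in the T5.1.1 recipe); it is restored to 0.1
    # in the config that gets exported at checkpoint time further below.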
config.dropout_rate = 0.0
    ## added by me
all_data = []
# data_try = read_jsonl('./simplified-nq-test.jsonl')
qa_pairs = []
qaid2dataid = {}
# qaid = 0
# input_texts = []
# for data_dict in data_try:
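    # Build the pretraining corpus from KorQuAD v1 (squad_kor_v1 on the Hugging Face Hub):
    # every example is rendered as a "질의/제목/본문" (question/title/context) prompt string.
    # Note that label_texts/label_ids are tokenized but never used afterwards -- the MLM
    # collator below derives its own targets from input_ids.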
datasets = load_dataset('squad_kor_v1')
for data_split in ['train', 'validation']:
input_texts, label_texts = [], []
for data in datasets[data_split]:
input_texts.append(f"질의: {data['question']} 제목: {data['title']} 본문: {data['context']}".strip())
label_texts.append(data['answers']['text'][0].strip())
input_text_ids = tokenizer(input_texts, add_special_tokens=False).input_ids
label_ids = tokenizer(label_texts, add_special_tokens=False).input_ids
start = 0
shengyu_pass = np.array([0] * 564)
try_pass = np.array([0] * 564)
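    # Pack the tokenized prompts into fixed 564-token blocks:
    #   biaozhi      ("marker")   - number of complete 564-token chunks in the current example
    #   zhongjian    ("middle")   - number of tokens left over after the complete chunks
    #   shengyu_pass ("leftover") - a 564-token buffer that carries partial chunks across examples
    # The final partially-filled buffer is simply dropped when the loop ends.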
for i in range(len(input_text_ids)):
biaozhi = int(len(input_text_ids[i]) / 564)
if biaozhi > 0:
for j in range(biaozhi):
try_pass = np.array(input_text_ids[i])[j * 564:(j + 1) * 564]
all_data.append({
'input_ids': try_pass.copy(),
})
if len(input_text_ids[i]) > biaozhi * 564:
zhongjian = len(input_text_ids[i]) - biaozhi * 564
if zhongjian + start < 564:
shengyu_pass[start:start + zhongjian] = np.array(input_text_ids[i])[biaozhi * 564:]
start = start + zhongjian
else:
shengyu_pass[start:] = np.array(input_text_ids[i])[biaozhi * 564:biaozhi * 564 + 564 - start]
all_data.append({
'input_ids': shengyu_pass.copy(),
})
shengyu_pass[:len(input_text_ids[i]) - (biaozhi * 564 + 564 - start)] = np.array(input_text_ids[i])[
biaozhi * 564 + 564 - start:]
start = len(input_text_ids[i]) - (biaozhi * 564 + 564 - start)
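    # For example (illustrative numbers only): a 1,200-token prompt yields two full chunks
    # (tokens 0-563 and 564-1127); the remaining 72 tokens are copied into shengyu_pass and
    # are emitted later, once tokens from subsequent examples fill the buffer up to 564.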
if n_pp > 1:
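        # Pipeline parallelism: each DDP rank owns n_pp consecutive GPUs and spreads the
        # T5 blocks across them with model.parallelize(); DDP then replicates this pipeline
        # across ranks for data parallelism.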
torch.cuda.set_device(local_rank * n_pp)
model = T5ForConditionalGeneration(config)
devices = list(range(local_rank * n_pp, (local_rank + 1) * n_pp))
model.parallelize(get_device_map(len(model.encoder.block), devices))
ddp_model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # Single-device path: the DDP wrapper stays commented out, so alias the plain model
        # to ddp_model because the training loop below always calls ddp_model(...).
        # torch.cuda.set_device(local_rank)
        # ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
        model = T5ForConditionalGeneration(config).cuda()
        ddp_model = model
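    # Adafactor in its "T5 default" configuration: relative_step/warmup_init let the optimizer
    # schedule its own learning rate, so no external LR scheduler is needed.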
optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True)
scaler = amp.GradScaler()
step = 0
if resume_checkpoint is not None:
step = resume_checkpoint
ckpt_dir = Path(args.output_dir + f'/checkpoint-{step}')
model.load_state_dict(torch.load(ckpt_dir / "pytorch_model.bin", map_location='cpu'))
optimizer.load_state_dict(torch.load(ckpt_dir / "optimizer.pt", map_location='cpu'))
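    # Only the version "1" path is active: the in-memory chunks built above are fed through
    # DataCollatorForT5MLM, which builds the T5 MLM (span-corruption) inputs and labels and
    # prepends the "fill: " prefix. The server-backed datasets are left commented out, so any
    # other version value would leave train_loader undefined.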
if version == "1":
# train_data = LargeCorpusDatasetFromServer(job_name, grpc_endpoint, seed=args.data_seed)
train_loader = DataLoader(all_data, batch_size=args.per_device_train_batch_size,
collate_fn=DataCollatorForT5MLM(tokenizer, prefix="fill: "))
# else:
# train_data = LargeCorpusDatasetFromServerV2(job_name, tokenizer, grpc_endpoint, seed=args.data_seed)
# train_loader = DataLoader(train_data, batch_size=args.per_device_train_batch_size, collate_fn=DataCollatorForSeq2Seq(tokenizer))
print(f"Start pretraining of t5-{model_size}")
    epoch = 0
    max_epoch = 500
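    # Training loop: fp16 autocast + GradScaler, an optimizer step every
    # args.gradient_accumulation_steps micro-batches, gradient clipping at 1.0, and a
    # checkpoint (config, weights, optimizer state) every args.save_steps steps.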
for epoch in range(max_epoch):
while step < args.max_steps:
total_loss = 0
dt = time.time()
gradient_accumulation_step = 0
optimizer.zero_grad()
for data in train_loader:
if step >= args.max_steps:
break
data = data.convert_to_tensors('pt').to(device='cuda')
with amp.autocast():
loss = ddp_model(**data).loss
loss = loss / args.gradient_accumulation_steps
                scaler.scale(loss).backward()
gradient_accumulation_step += 1
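                # Step the optimizer only once enough micro-batches have been accumulated;
                # unscale first so the 1.0 grad-norm clip operates on the true gradients.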
if gradient_accumulation_step == args.gradient_accumulation_steps:
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(optimizer)
scaler.update()
optimizer.zero_grad()
dt = time.time() - dt
step += 1
total_loss = loss.detach().item() + total_loss
print(f"step={step} loss={total_loss:.4f} time={dt:.4f}s")
if step % args.save_steps == 0:
ckpt_dir = Path(args.output_dir) / f"checkpoint-{step}"
ckpt_dir.mkdir(exist_ok=True, parents=True)
                        config.dropout_rate = 0.1  # export the config with the fine-tuning dropout rate
                        if local_rank in (-1, 0):  # main process (or a single-process run where LOCAL_RANK is unset)
config.save_pretrained(ckpt_dir)
torch.save(model.state_dict(), ckpt_dir / "pytorch_model.bin")
torch.save(optimizer.state_dict(), ckpt_dir / "optimizer.pt")
gradient_accumulation_step = 0
total_loss = 0
dt = time.time()
else:
total_loss = loss.detach().item() + total_loss
print(f"End of pretraining t5-{model_size}")
if __name__ == '__main__':
fire.Fire(train)
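# Example launch (assumed environment -- adjust GPU count and arguments to your setup; the
# file name "pretrain_t5.py" below is only a placeholder for this script):
#   torchrun --nproc_per_node=1 pretrain_t5.py --model_size=base --tokenizer_path=paust/pko-t5-base
# or, for a single process without torchrun:
#   python pretrain_t5.py --model_size=base --version=1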