Pretraining code

import os
import random
import time
from pathlib import Path
from typing import Optional, Dict, Any

import fire
import numpy as np
import torch
from torch.cuda import amp
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader
from transformers import T5TokenizerFast, T5ForConditionalGeneration, TrainingArguments, T5Config, Adafactor,AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup
from transformers.utils.model_parallel_utils import get_device_map
from datetime import datetime
from args import ARGS, CONFIGS
from data import LargeCorpusDatasetFromServerV2, DataCollatorForSeq2Seq, NUM_EXTRA_IDS, LargeCorpusDatasetFromServer, DataCollatorForT5MLM
import jsonlines
import json
import torch.distributed as dist
import socket
import logging

def get_logger(filename=None):
    """Return the shared 'logger' Logger, optionally also logging to a file.

    Configures the root logger once via ``basicConfig`` (console output at
    INFO) and, when *filename* is given, attaches a DEBUG-level
    ``FileHandler`` for that path.

    Args:
        filename: optional path of a log file to write to.

    Returns:
        The ``logging.Logger`` named ``'logger'``.
    """
    logger = logging.getLogger('logger')
    logger.setLevel(logging.DEBUG)
    logging.basicConfig(format='%(asctime)s - %(levelname)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
    if filename is not None:
        # Bug fix: attach the handler to THIS logger (the original attached it
        # to the root logger), and skip adding a duplicate handler for the
        # same file when get_logger() is called more than once.
        target = os.path.abspath(filename)
        already_added = any(
            isinstance(h, logging.FileHandler) and getattr(h, 'baseFilename', None) == target
            for h in logger.handlers
        )
        if not already_added:
            handler = logging.FileHandler(filename)
            handler.setLevel(logging.DEBUG)
            handler.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s: %(message)s'))
            logger.addHandler(handler)
    return logger

def read_json(path):
    """Load and return the JSON document stored at *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def read_jsonl(path):
    """Read a JSON-Lines file and return its records as a list.

    Each non-blank line of *path* is parsed as one JSON document.
    Improvement: uses the stdlib ``json`` module instead of the third-party
    ``jsonlines`` package (one object per line is all this file needs) and
    tolerates blank lines.

    Args:
        path: path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.
    """
    items = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # skip blank lines rather than crashing on them
                items.append(json.loads(line))
    return items
        

def train(model_size="base", tokenizer_path="/common-data/zhangjing/xiaoyingzuo/pko-t5-main/pkot5/pretrained/T5-base-korean-pko copy", job_name: str = "pkot5-pretraining", resume_checkpoint: Optional[int] = None, version: Optional[str] = 1):
    """Pretrain a pko-T5 model with a T5 span-corruption (MLM) objective.

    Reads contexts from ``context_1.jsonl``, tokenizes them and packs the
    token ids into fixed-length 564-token sequences, then trains with AMP,
    gradient accumulation, AdamW and a cosine warmup schedule, saving a
    checkpoint every ``args.save_steps`` optimizer steps.

    Args:
        model_size: key into ARGS/CONFIGS selecting the model variant.
        tokenizer_path: directory of the pretrained tokenizer to load.
        job_name: job label (not used inside this function).
        resume_checkpoint: if not None, load weights from a hard-coded
            checkpoint file; the value is assigned to ``step`` but is later
            overwritten (see NOTE(review) below).
        version: annotated ``Optional[str]`` but defaults to the int ``1``;
            unused in this function.
    """
    current_time = datetime.now().strftime('%Y-%m-%d_%H-%M')
    logger = get_logger(os.path.join('./log_dir', f'{current_time}_bs=128_inverselr=1e-4_log.txt'))
    # logger.info(args)
    
    logger.info("-------------start training------------------------")
    # LOCAL_RANK is set by torchrun / torch.distributed.launch; -1 => not launched distributed.
    local_rank = int(os.getenv("LOCAL_RANK", "-1"))
    model_size = model_size.lower()
    default_args = ARGS[model_size]
    n_pp = default_args.pop("pipeline_parallelism", 1)      # base is 1 (no pipeline parallelism)
    args = TrainingArguments(
        output_dir=f'./models/pko-t5/inverselr=1e-4_{model_size}',
        local_rank=local_rank,
        **default_args
    )

    # Seed python / torch / numpy for reproducibility.
    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    np.random.seed(args.seed)

    # NOTE(review): unk_token='<pad>' maps unknown tokens onto the pad token —
    # confirm this is intentional for this tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, unk_token='<pad>', extra_ids=NUM_EXTRA_IDS)


    config = T5Config(**CONFIGS[model_size])
    # Dropout disabled for pretraining (restored to 0.1 in saved checkpoint configs below).
    config.dropout_rate = 0.0

    ## added code here, xiaoyingzuo, 0713
    ## changed here 0721
    #-----------------load_dataset-------------------------------------

    all_data = []
    manual_iterms = read_jsonl('context_1.jsonl')
    logger.info("-------------finished read jsonl: context_1.jsonl----------------")
    input_texts = []
    # Flatten every page's context list into one list of raw text strings.
    for page_iterm in manual_iterms:
        for context_dict in page_iterm['context_list']:

            input_texts.append(f"{context_dict['context']}".strip())
            # print('input text append \n', input_texts)
    
    input_texts_ids = tokenizer(input_texts, add_special_tokens=False).input_ids

    # added here 230721 xiaoying
    #-------------preprocess data -------------------------------------
    # Pack the tokenized texts into fixed-length 564-token sequences:
    # complete 564-token chunks are emitted directly; remainders are
    # accumulated across texts in `last_pass` and emitted when it fills.
    start = 0                           # current fill position inside last_pass
    last_pass = np.array([0] * 564)     # carry-over buffer for partial chunks
    try_pass = np.array([0] * 564)      # scratch buffer for full chunks
    
    for i in range(len(input_texts_ids)):
        
        turn_tmp = int(len(input_texts_ids[i]) / 564)
        if  turn_tmp > 0:
            # Emit each complete 564-token slice of this text.
            for j in range(turn_tmp):
                try_pass = np.array(input_texts_ids[i])[j * 564 : (j + 1) * 564]
                all_data.append({'input_ids': try_pass.copy()})
        if len(input_texts_ids[i]) > turn_tmp * 564:
            # Handle this text's remainder (< 564 tokens).
            middle_tmp = len(input_texts_ids[i]) - turn_tmp * 564
            if middle_tmp + start < 564:
                # Remainder fits: keep accumulating in the carry-over buffer.
                last_pass[start:start + middle_tmp] = np.array(input_texts_ids[i])[turn_tmp * 564 :]
                start = start + middle_tmp
            else:
                # Buffer would fill: top it up to exactly 564, emit it, then
                # restart it with the leftover tokens of this remainder.
                last_pass[start:] = np.array(input_texts_ids[i])[turn_tmp * 564: turn_tmp * 564 + 564 - start]
                all_data.append({'input_ids': last_pass.copy()})

                last_pass[: len(input_texts_ids[i]) - (turn_tmp * 564 +564-start)] = np.array(input_texts_ids[i])[turn_tmp *564 +564-start :]

                start = len(input_texts_ids[i]) - (turn_tmp *564 + 564 -start)
    # NOTE(review): tokens still sitting in last_pass when the loop ends are
    # silently dropped — the final partial buffer is never appended.
    
    # Convert the encoded ids back to the original sentences (debug helper).
    # for i in range(len(all_data)):

    #     decoded_input = tokenizer.decode(all_data[i]['input_ids'], skip_special_tokens=False)
    #     logger.info(f"----------------------decodeed input + 170 {i}------------------------------")
    #     logger.info(decoded_input)
    

# Print the original sentence.
    # print(decoded_input)
    
    logger.info(f'-------------total tokenizer number {len(all_data)} finished preprocess-----------------------')

    if n_pp > 1:
        # Pipeline parallelism: shard the model's layers across the n_pp GPUs
        # owned by this rank.
        torch.cuda.set_device(local_rank * n_pp)
        model = T5ForConditionalGeneration(config)
        devices = list(range(local_rank * n_pp, (local_rank + 1) * n_pp))
        model.parallelize(get_device_map(len(model.encoder.block), devices))
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl')
        # dist.init_process_group(backend='nccl')
        # ddp_model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        torch.cuda.set_device(local_rank)
        # model = T5ForConditionalGeneration(config).cuda()
        # NOTE(review): the model is never moved to the GPU (.cuda() is
        # commented out) and never wrapped in DistributedDataParallel, so each
        # process trains an independent CPU copy — confirm this is intended.
        model = T5ForConditionalGeneration(config)
        print("----dist init---")
        if not dist.is_initialized():
            dist.init_process_group(backend='nccl')
        # dist.init_process_group(backend='nccl')
        # ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])

    initial_lr = 1e-4
    total_steps = args.max_steps
    warmup_steps = 72
    # optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False)
    optimizer = AdamW(model.parameters(), lr=initial_lr)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    scaler = amp.GradScaler()
    logger.info(f'--initial_lr:{initial_lr}, total_steps{total_steps}, warmup_steps{warmup_steps}, optimizer{optimizer}------')

    # def inverse_sqrt_schedule(step):
    #     return 1.0 / (step ** 0.5)
    # scheduler = LambdaLR(optimizer, lr_lambda=inverse_sqrt_schedule)
    
    logger.info("----optimiter finished  adafactor----------")

    if resume_checkpoint is not None:
        # NOTE(review): `step` is overwritten by `step = 0` inside the epoch
        # loop below, so the resume step number is effectively ignored.
        step = resume_checkpoint
        # ckpt_dir = Path(default_args.start_ckpt_dir)
        model.load_state_dict(torch.load( "/common-data/zhangjing/xiaoyingzuo/pko-t5-main/pkot5/pretrained/T5-base-korean-pko/pytorch_model.bin", map_location='cpu'))
        logger.info('--------------loaded the T5-base-korean-pko/pytorch_model.bin-----------')
        # optimizer.load_state_dict(torch.load(ckpt_dir / "optimizer.pt", map_location='cpu'))

    epoch = 0
    max_epoch = 10
    # max_steps = len(all_data)
    # max_steps = 80
    for epoch in range(max_epoch):
        # The collator applies T5-style span corruption (MLM) per batch.
        train_loader = DataLoader(all_data, batch_size=args.per_device_train_batch_size, collate_fn=DataCollatorForT5MLM(tokenizer, prefix="fill: "))
        logger.info(f"Start pretraining of t5-{model_size} epoch-{epoch}")
        step = 0
        
        while step < args.max_steps:
            total_loss = 0
            dt = time.time()
            gradient_accumulation_step = 0
            optimizer.zero_grad()
            for data in train_loader:
                # print("data in training...", data)
                if step >= args.max_steps:
                    break
                
                print("--------------data convert to tensor--------------")
                # data = data.convert_to_tensors('pt').to(device='cuda')
                data = data.convert_to_tensors('pt')
                # Debug: dump the per-sample input/label lengths of this batch.
                for i in range(len(data['input_ids'])):
                    # print(f"------------------data_input_ids['input_ids'][{i}]--------", data['input_ids'][i])
                    print(f"------------------len of data_input_ids['input_ids'][{i}]--------", len(data['input_ids'][i]))
                for i in range(len(data['labels'])):
                    print(f"------------------len of data_input_ids['labels'][{i}]--------", len(data['labels'][i]))
                print("-------------- finished  data convert to tensor--------------", type(data))
                
                with amp.autocast():
                    # loss = ddp_model(**data).loss
                    # print('----------**data: ', **data)
                    
                    # Scale the loss so gradients average over the accumulation window.
                    loss = model(**data).loss
                    loss = loss / args.gradient_accumulation_steps

                # NOTE(review): retain_graph=True should not be needed here —
                # each micro-batch builds a fresh autograd graph.
                scaler.scale(loss).backward(retain_graph=True)
                gradient_accumulation_step += 1
                
                if gradient_accumulation_step == args.gradient_accumulation_steps:
                   
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    scaler.step(optimizer)
                    scaler.update()
                    # NOTE(review): scaler.step(optimizer) already performs the
                    # optimizer step; this extra optimizer.step() applies the
                    # same gradients a second time — likely a bug.
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    current_lr = optimizer.param_groups[0]['lr']
                    logger.info(f"lr scheduler: {current_lr}, step={step}--------")
                    dt = time.time() - dt

                    step += 1
                    total_loss = loss.detach().item() + total_loss

                    
                    logger.info(f"epoch={epoch} step={step} loss={total_loss:.4f} time={dt:.4f}s")

                    if step % args.save_steps == 0:
                        logger.info(f"epoch={epoch} step={step} loss={total_loss:.4f} time={dt:.4f}s")
                        ckpt_dir = Path(args.output_dir) / f"epoch{epoch}-checkpoint{step}"
                        ckpt_dir.mkdir(exist_ok=True, parents=True)
                        # Saved config advertises the normal fine-tuning dropout
                        # (0.1) even though training runs with dropout 0.0.
                        config.dropout_rate = 0.1


                        # Only rank 0 writes checkpoints to disk.
                        if local_rank == 0:
                            config.save_pretrained(ckpt_dir)
                            torch.save(model.state_dict(), ckpt_dir / "pytorch_model.bin")
                            torch.save(optimizer.state_dict(), ckpt_dir / "optimizer.pt")
                            logger.info(f"{ckpt_dir}/pytorch_model, and {ckpt_dir}/optimizer.pt have been saved!")

                    gradient_accumulation_step = 0
                    total_loss = 0
                    dt = time.time()
                else:
                    # Mid-accumulation micro-batch: just accumulate the loss.
                    total_loss = loss.detach().item() + total_loss
            
            

    logger.info(f"End of pretraining t5-{model_size}")


# CLI entry point: expose train() via python-fire so every parameter can be
# passed as a command-line flag (e.g. --model_size=large).
if __name__ == '__main__':
    fire.Fire(train)

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值