#!/bin/bash
# Keep command tracing switched off for everything that follows.
set +o xtrace
# NCCL settings; exported so that child processes started later inherit them.
export NCCL_DEBUG=WARN NCCL_IB_QPS_PER_CONNECTION=32
# ^(=.=)^
# Path of the python executable found on PATH (empty if none is found).
PYTHON=$(command -v python)

# Locate the installed torch package directory by asking the interpreter.
# $PYTHON is the executable itself, so "$PYTHON/site-packages/..." is never a
# valid path; find_spec() resolves the package location without importing torch.
TORCH_PKG_DIR=""
if [[ -n "$PYTHON" ]]; then
  TORCH_PKG_DIR=$("$PYTHON" -c 'import importlib.util as u
s = u.find_spec("torch")
print(s.submodule_search_locations[0] if s and s.submodule_search_locations else "")')
fi
ELASTIC_STORE_PY="$TORCH_PKG_DIR/distributed/elastic/utils/store.py"

# Raise the default barrier_timeout in torch's elastic store helper from 300
# to 1800 by rewriting the installed source file in place.
if [[ -n "$TORCH_PKG_DIR" && -f "$ELASTIC_STORE_PY" ]]; then
  sed -i "s|barrier_timeout: float = 300|barrier_timeout: float = 1800|g" "$ELASTIC_STORE_PY"
else
  printf 'warning: torch elastic store.py not found (looked for "%s"); barrier_timeout not patched\n' \
    "$ELASTIC_STORE_PY" >&2
fi

# Pin transformers to 4.37.0, installed from the Douban PyPI mirror.
pip install transformers==4.37.0 -i https://pypi.doubanio.com/simple/
#
# ---- Run configuration ------------------------------------------------------
# Plain (non-exported) shell variables; whatever consumes them lies outside
# this excerpt.

# Model and parallel layout.
# NOTE(review): TP/PP presumably mean tensor-/pipeline-parallel degree — confirm.
MODEL_SIZE='72B'
TP_SIZE=8
PP_SIZE=4

# Schedule and batching.
EPOCH=2
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=1024
WARMUP_TOKENS=0
SAVE_INTERVAL=1000

# Sequence handling: PAD_LEN is set to the same value as SEQUENCE_LENGTH.
SEQUENCE_LENGTH=4096
PAD_LEN="${SEQUENCE_LENGTH}"

# Naming and output locations: checkpoints go under the run's output directory.
DIR_NAME='qwen2_72B_ct'
DATA_OUTPUT_PATH='/mnt/nas/pretrain/runs/qwen2-72B-ct-06181747'
CHECKPOINT_PATH="${DATA_OUTPUT_PATH}/checkpoints"
CURRENT_TIME=&#
[论文笔记] Qwen2 CT 提交脚本
于 2024-06-18 19:52:22 首次发布