数据汇总
to be continued
下载脚本
下载 parquet 文件:
——————————————— Pretrain—————————————————————
download_pretrain_data.sh
#!/bin/bash
# Download the pretraining dataset for every language in `langs` by running
# download_pretrain_data.py once per language code.
#
# `set -e` is deliberately not used: a failed download for one language must
# not prevent the remaining languages from being attempted.

# 21 languages; English (en) is not in the list.
langs=(zh ko ar fr es pt tr de ur bn it ja uk pl ru vi he id ms nl th)

# Language codes whose download command exited with a non-zero status.
failed_langs=()

for lang in "${langs[@]}"; do
  echo "download $lang dataset"
  if ! python download_pretrain_data.py "$lang"; then
    echo "download failed for $lang" >&2
    failed_langs+=("$lang")
  fi
done

if (( ${#failed_langs[@]} > 0 )); then
  echo "failed languages: ${failed_langs[*]}" >&2
fi

# Keep this as the last command: its status becomes the script's exit status
# (0 only when every language was downloaded successfully).
(( ${#failed_langs[@]} == 0 ))
————————————————参考 SFT 脚本(aya)————————————————
download_pretrain_data.py
import os
import sys

# Route Hub traffic through the hf-mirror.com mirror. This must be set BEFORE
# huggingface_hub is imported: the library reads HF_ENDPOINT when it is
# imported, so assigning it afterwards has no effect on snapshot_download.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download  # noqa: E402  (must follow HF_ENDPOINT)

# Language code to download; download_pretrain_data.sh passes it as the first
# command-line argument.
lang = sys.argv[1]
print(lang)
for i in range(1000):
try:
# token="<redacted>" is not needed here and can be removed (never commit a real access token).
# allow_patterns=f"{lang}/{lang}_part_000[0-2][0-9].parquet"
snapshot_download