Fine-Tuning ChatGLM
Model Download
A script for batch-downloading Hugging Face model files:
import os
import time

import requests
from lxml import etree
from tqdm import tqdm
from colorama import init, Fore


class HuggingfaceDownUrl_model:
    def __init__(self):
        # Uncomment to route traffic through a local proxy:
        # os.environ['http_proxy'] = '127.0.0.1:58591'
        # os.environ['https_proxy'] = '127.0.0.1:58591'
        init()
        # self.root_url = 'https://huggingface.co'
        self.root_url = 'https://hf-mirror.com'
        self.head = None
        self.block_size = 10240  # 10 KiB per download chunk

    # Entry point: crawl the model's file-listing page
    def start_run(self, url, head, root_dir_name):
        self.head = head
        response = requests.get(url, headers=head)
        print(response)
        # Proceed only if the status code is in the 2xx range
        if 200 <= response.status_code < 300:
            html = etree.HTML(response.text)
            model_dir_list = html.xpath('/html/body/div[1]/main/div[2]/section/div[3]/ul/li')
            for model_dir in model_dir_list:
                test = len(model_dir.xpath('div[1]/@class')[0])
                # A longer class attribute marks a file entry
                if test > 24:
                    model_down_url = model_dir.xpath('a[1]/@href')[0]
                    model_url = self.root_url + model_down_url
                    model_name = model_down_url.split('/')[-1].split('?')[0]
                    self.down_url(model_url, model_name, dir_name=root_dir_name)
                # Otherwise the entry is a subdirectory
                else:
                    dir_name = model_dir.xpath('a[1]/span/text()')[0]
                    dir_name = root_dir_name + '/' + dir_name
                    # Create the subdirectory if it does not exist yet
                    if not os.path.exists(dir_name):
                        os.makedirs(dir_name)
                        print(f"{Fore.GREEN}\nSubdirectory {dir_name} created{Fore.RESET}")
                    else:
                        print(f"{Fore.YELLOW}\nSubdirectory {dir_name} already exists{Fore.RESET}")
                    model_url = model_dir.xpath('a[1]/@href')[0]
                    next_url = self.root_url + model_url
                    self.next_rep(next_url, dir_name)

    # Recurse into a subdirectory listing
    def next_rep(self, next_url, dir_name):
        next_response = requests.get(url=next_url, headers=self.head)
        html = etree.HTML(next_response.text)
        model_down_url_list = html.xpath('/html/body/div[1]/main/div[2]/section/div[3]/ul/li')
        for value in model_down_url_list:
            model_down_url = value.xpath('a[1]/@href')[0]
            model_url = self.root_url + model_down_url
            model_name = model_down_url.split('/')[-1].split('?')[0]
            self.down_url(model_url, model_name, dir_name)

    # Stream one file to disk with a progress bar
    def down_url(self, model_url, model_name, dir_name=''):
        if dir_name:
            model_name = os.path.join(dir_name, model_name)
        try:
            response = requests.get(model_url, headers=self.head, stream=True)
            response.raise_for_status()
            # Total file size, taken from the response headers
            total_size_in_bytes = int(response.headers.get('content-length', 0))
            # Show download progress with tqdm
            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
            with open(model_name, 'wb') as file:
                for data in response.iter_content(self.block_size):
                    progress_bar.update(len(data))
                    file.write(data)
            progress_bar.close()
            if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                print("ERROR, something went wrong")
        except Exception as e:
            print(f"An error occurred: {e}")


if __name__ == '__main__':
    start_time = time.time()
    # Pages to crawl (to download other models, change only this list)
    url_list = [
        'https://hf-mirror.com/THUDM/chatglm2-6b/tree/main'
    ]
    # Request headers
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
    }
    for url in url_list:
        root_dir_name = url.split('/')[-3]
        if not os.path.exists(root_dir_name):
            # Create the root folder if it does not exist
            os.makedirs(root_dir_name)
            print(f"{Fore.GREEN}Root folder {root_dir_name} created{Fore.RESET}")
        else:
            print(f"{Fore.YELLOW}Root folder {root_dir_name} already exists\n{Fore.RESET}")
        hdum = HuggingfaceDownUrl_model()
        hdum.start_run(url, head, root_dir_name)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total time: {elapsed_time:.2f} s")
P-Tuning-v2
git clone https://github.com/THUDM/ChatGLM2-6B
cd ChatGLM2-6B
pip install -r requirements.txt
cd ptuning/
pip install rouge_chinese nltk jieba datasets
Dataset download:
AdvertiseGen_Dataset
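Each line of the AdvertiseGen train/dev files is a JSON object with a content field (the prompt) and a summary field (the target text); these names match the --prompt_column and --response_column arguments in train.sh below. An illustrative record in that format (the field contents here are representative, not quoted verbatim from the dataset):

{"content": "类型#上衣*版型#宽松*图案#线条*衣样式#衬衫*衣袖型#泡泡袖*衣款式#抽绳", "summary": "这件衬衫的款式非常宽松,利落的线条可以很好地修饰身材,领口装饰了一个可爱的抽绳,配合时尚的泡泡袖型,尽显甜美气息。"}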
Training script: train.sh
PRE_SEQ_LEN=128  # prefix (soft prompt) length
LR=2e-2          # learning rate, 0.02
NUM_GPUS=1       # single GPU

# Argument notes (bash treats text after a trailing backslash as arguments,
# so the explanations are collected here instead of inline):
#   --train_file / --validation_file  paths to the training / validation JSON files
#   --prompt_column                   prompt field name in the dataset ("content")
#   --response_column                 response field name in the dataset ("summary")
#   --overwrite_cache                 rebuild the preprocessing cache when re-training
#                                     on the same dataset
#   --model_name_or_path              model to load; can be a local path
#                                     (Chapter 5 explains how to locate it)
#   --output_dir                      where trained checkpoints are saved
#   --max_source_length               maximum input length in tokens
#   --max_target_length               maximum output length in tokens
#   --per_device_train_batch_size     batch size per GPU; tune to fit GPU memory
#   --gradient_accumulation_steps     effective batch size = 1 * 16 = 16
#   --max_steps                       total number of training steps
#   --logging_steps                   print a log line every N steps
#   --save_steps                      save a checkpoint every N steps
#   --quantization_bit                4-bit quantization; can be changed to 8

torchrun --standalone --nnodes=1 --nproc-per-node=$NUM_GPUS main.py \
    --do_train \
    --train_file di/train.json \
    --validation_file di/fval.json \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path THUDM/chatglm2-6b \
    --output_dir output/adgen-chatglm2-6b-pt-$PRE_SEQ_LEN-$LR \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 128 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 2000 \
    --logging_steps 10 \
    --save_steps 500 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
    --quantization_bit 4
bash train.sh
Output fields:
loss — training loss
learning_rate — current learning rate
epoch — current epoch (progress through the dataset)
{'loss': 4.7301, 'learning_rate': 0.019933333333333334, 'epoch': 0.0}
{'loss': 4.003, 'learning_rate': 0.019866666666666668, 'epoch': 0.0}
{'loss': 3.9701, 'learning_rate': 0.0198, 'epoch': 0.0}
{'loss': 3.9172, 'learning_rate': 0.019733333333333335, 'epoch': 0.01}
{'loss': 3.8953, 'learning_rate': 0.019666666666666666, 'epoch': 0.01}
{'loss': 3.8656, 'learning_rate': 0.0196, 'epoch': 0.01}
{'loss': 3.8305, 'learning_rate': 0.019533333333333333, 'epoch': 0.01}
{'loss': 3.7731, 'learning_rate': 0.019466666666666667, 'epoch': 0.01}
{'loss': 3.7685, 'learning_rate': 0.0194, 'epoch': 0.01}
{'loss': 3.7229, 'learning_rate': 0.019333333333333334, 'epoch': 0.01}
{'loss': 3.7493, 'learning_rate': 0.019266666666666668, 'epoch': 0.02}
{'loss': 3.6988, 'learning_rate': 0.0192, 'epoch': 0.02}
{'loss': 3.5724, 'learning_rate': 0.019133333333333332, 'epoch': 0.02}
{'loss': 3.6526, 'learning_rate': 0.01906666666666667, 'epoch': 0.02}
{'loss': 3.652, 'learning_rate': 0.019, 'epoch': 0.02}
{'loss': 3.6141, 'learning_rate': 0.018933333333333333, 'epoch': 0.02}
{'loss': 3.6451, 'learning_rate': 0.018866666666666667, 'epoch': 0.02}
{'loss': 3.6273, 'learning_rate': 0.0188, 'epoch': 0.03}
{'loss': 3.6051, 'learning_rate': 0.018733333333333334, 'epoch': 0.03}
{'loss': 3.5769, 'learning_rate': 0.018666666666666668, 'epoch': 0.03}
{'loss': 3.6073, 'learning_rate': 0.018600000000000002, 'epoch': 0.03}
{'loss': 3.657, 'learning_rate': 0.018533333333333332, 'epoch': 0.03}
{'loss': 3.587, 'learning_rate': 0.018466666666666666, 'epoch': 0.03}
{'loss': 3.5704, 'learning_rate': 0.0184, 'epoch': 0.03}
{'loss': 3.5417, 'learning_rate': 0.018333333333333333, 'epoch': 0.03}
{'loss': 3.5782, 'learning_rate': 0.018266666666666667, 'epoch': 0.04}
{'loss': 3.5561, 'learning_rate': 0.0182, 'epoch': 0.04}
{'loss': 3.616, 'learning_rate': 0.01813333333333333, 'epoch': 0.04}
{'loss': 3.5763, 'learning_rate': 0.01806666666666667, 'epoch': 0.04}
{'loss': 3.4752, 'learning_rate': 0.018000000000000002, 'epoch': 0.04}
{'loss': 3.4914, 'learning_rate': 0.017933333333333332, 'epoch': 0.04}
{'loss': 3.6215, 'learning_rate': 0.017866666666666666, 'epoch': 0.04}
{'loss': 3.5179, 'learning_rate': 0.0178, 'epoch': 0.05}
{'loss': 3.5763, 'learning_rate': 0.017733333333333334, 'epoch': 0.05}
{'loss': 3.534, 'learning_rate': 0.017666666666666667, 'epoch': 0.05}
12%|████████▉ | 356/3000 [29:20<3:37:57, 4.95s/it]
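After training, the checkpoint under output/ contains only the prefix-encoder weights, so inference loads them on top of the base model. A sketch following the loading pattern from the ptuning README; the checkpoint path and quantization level below are assumptions that must match your own run:

import os
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Illustrative checkpoint path; use the directory written by train.sh
CHECKPOINT_PATH = "output/adgen-chatglm2-6b-pt-128-2e-2/checkpoint-2000"

tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
# pre_seq_len must match the PRE_SEQ_LEN used during training
config = AutoConfig.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True, pre_seq_len=128)
model = AutoModel.from_pretrained("THUDM/chatglm2-6b", config=config, trust_remote_code=True)

# Load only the trained prefix-encoder weights from the checkpoint
prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
new_prefix_state_dict = {}
for k, v in prefix_state_dict.items():
    if k.startswith("transformer.prefix_encoder."):
        new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)

# Match the training quantization (4-bit here), then move to GPU and chat
model = model.quantize(4).cuda().eval()
response, history = model.chat(tokenizer, "你好", history=[])
print(response)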