环境
Ubuntu 20.04、CUDA 11.8、Python 3.10(conda)、GPU RTX 4090 24G、内存 128G、
CPU 13900K
1、第三方库 requirements.txt
# int8
bitsandbytes==0.37.1
accelerate==0.17.1
# chatglm
protobuf>=3.19.5,<3.20.1
transformers==4.27.1
icetk
cpm_kernels==1.0.11
#torch>=1.13.1
tensorboard
datasets==2.10.1
2、安装 peft(0.3.0.dev 开发版,从源码安装)
git clone https://github.com/huggingface/peft.git
cd peft/
pip install .
3、下载 ChatGLM-Tuning 代码
git clone https://github.com/mymusise/ChatGLM-Tuning.git
cd ChatGLM-Tuning
4、 新数据集处理
格式
[{
"instruction": "Give three tips for staying healthy.",
"input": "",
"output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
}
]
处理
python cover_alpaca2jsonl.py \
--data_path alpaca_data_cleaned.json \
--save_path alpaca_data_cleaned.jsonl
创建分词目录 分词
mkdir -p alpaca_tokenize
python tokenize_dataset_rows.py \
--jsonl_path alpaca_data_cleaned.jsonl \
--save_path alpaca_tokenize \
--max_seq_length 200 \
--skip_overlength True
5、单卡训练
#修改分词
tokenizer = AutoTokenizer.from_pretrained("/home/chatglm-6b", trust_remote_code=True)
...
def main():
#修改模型预训练库
model = AutoModel.from_pretrained(
"/home/chatglm-6b", load_in_8bit=True, trust_remote_code=True, device_map="auto"
)
运行命令 50条语句的训练大概需要24小时
python finetune.py \
--dataset_path alpaca_tokenize \
--lora_rank 8 \
--per_device_train_batch_size 32 \
--gradient_accumulation_steps 4 \
--max_steps 5200 \
--save_steps 1000 \
--save_total_limit 2 \
--learning_rate 1e-4 \
--fp16 \
--remove_unused_columns false \
--logging_steps 50 \
--output_dir chatglm-6b-lora
推理代码inference.py
from transformers import AutoModel,AutoTokenizer
import torch
from peft import PeftModel
import json
from cover_alpaca2jsonl import format_example
# Inference script body: load the 8-bit ChatGLM-6B base model, attach the
# fine-tuned LoRA adapter, and greedily decode the first three prompts of
# the Alpaca dataset, printing the raw generation next to the reference.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = AutoModel.from_pretrained("/home/chatglm-6b", trust_remote_code=True, load_in_8bit=True, device_map='auto', revision="")
tokenizer = AutoTokenizer.from_pretrained("/home/chatglm-6b", trust_remote_code=True, revision="")
# Wrap the base model with the trained LoRA weights.
model = PeftModel.from_pretrained(model, "/home/tuning/chatglm-6b-lora")

# Read the dataset through a context manager so the file handle is closed
# deterministically instead of being leaked by a bare open().
with open("/home/tuning/alpaca_data_cleaned.json", encoding="utf-8") as f:
    instructions = json.load(f)

# TODO: `answers` is collected below but not yet written out anywhere.
answers = []
with torch.no_grad():
    for idx, item in enumerate(instructions[:3]):
        # format_example builds the prompt; its 'context' entry is the model input.
        feature = format_example(item)
        input_text = feature['context']
        ids = tokenizer.encode(input_text)
        input_ids = torch.LongTensor([ids])
        input_ids = input_ids.to(device)
        # Greedy decoding: do_sample=False does not use a temperature, and
        # newer transformers releases reject temperature=0, so it is omitted.
        out = model.generate(
            input_ids=input_ids,
            max_length=150,
            do_sample=False,
        )
        out_text = tokenizer.decode(out[0])
        # Strip the echoed prompt and the "\nEND" marker to keep only the answer.
        answer = out_text.replace(input_text, "").replace("\nEND", "").strip()
        item['infer_answer'] = answer
        print(out_text)
        # Show the dataset's reference output for side-by-side comparison.
        print(f"### {idx+1}.Answer:\n", item.get('output'), '\n\n')
        answers.append({'index': idx, **item})
运行命令
CUDA_VISIBLE_DEVICES=0 python inference.py
参考文档