# pip install modelscope transformers peft datasets bitsandbytes
import json
import os

import torch
from modelscope import snapshot_download
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
                          DataCollatorForSeq2Seq, Trainer, TrainingArguments)
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
# pick a device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# download model
model_dir_1 = snapshot_download("qwen/Qwen2-0.5B-Instruct")
model_dir_2 = snapshot_download("qwen/Qwen2-0.5B")
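# The snapshots are cached locally (typically under ~/.cache/modelscope/hub/);
# print the resolved paths for reference.
print(model_dir_1, model_dir_2)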
# load the model with 4-bit quantization (QLoRA-style)
_bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # load the weights in 4-bit
    bnb_4bit_use_double_quant=True,        # double quantization: the quantization constants are quantized as well
    bnb_4bit_quant_type="nf4",             # NormalFloat4 data type
    bnb_4bit_compute_dtype=torch.float32)  # run the compute in 32-bit precision during training
# load model & tokenizer
model_path = "YOUR_MODEL_PATH"
_model = AutoModelForCausalLM.from_pretrained(model_path,
quantization_config=_bnb_config,
device_map="auto",
torch_dtype="auto")
_tokenizer = AutoTokenizer.from_pretrained(model_path)
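# Optional check: report roughly how much memory the 4-bit model occupies.
print(f"memory footprint: {_model.get_memory_footprint() / 1024**2:.1f} MB")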
messages = [
{"role":"system", "content":"you are a helpful assistant"},
{"role":"user", "content":"你是谁?"}
]
# inspect the chat template (Qwen2 uses the ChatML format)
print(_tokenizer.apply_chat_template(messages, tokenize=False))
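# Expected output, roughly (assumption, based on the standard Qwen2 ChatML template):
# <|im_start|>system
# you are a helpful assistant<|im_end|>
# <|im_start|>user
# 你是谁?<|im_end|>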
# prepare a JSON file of training data (assumed to exist already), of the form
# [{"instruction": "question", "output": "answer"}]
_datasets = load_dataset("json", data_files="json_file_name.json", split="train")
# data preprocessing: tokenize prompt + response and mask the prompt in the labels
def preprocess_dataset(example):
    MAX_LENGTH = 256
    _input_ids, _attention_mask, _labels = [], [], []
    # note: add_special_tokens=False is needed because the special tokens are
    # already written into the template strings; also, do not return tensors
    # here, otherwise the map() call below will error
    _instruction = _tokenizer(
        f'<|im_start|>system\nyou are a helpful assistant<|im_end|>\n'
        f'<|im_start|>user\n{example["instruction"]}<|im_end|>\n',
        add_special_tokens=False)
    # append _tokenizer.eos_token to the output so the model learns to stop
    _response = _tokenizer(
        f'<|im_start|>assistant\n{example["output"] + _tokenizer.eos_token}<|im_end|>',
        add_special_tokens=False)
    _input_ids = _instruction["input_ids"] + _response["input_ids"]
    _attention_mask = _instruction["attention_mask"] + _response["attention_mask"]
    # -100 is the ignore_index of the loss, so the prompt tokens are not trained on
    _labels = [-100] * len(_instruction["input_ids"]) + _response["input_ids"]
    if len(_input_ids) > MAX_LENGTH:
        _input_ids = _input_ids[:MAX_LENGTH]
        _attention_mask = _attention_mask[:MAX_LENGTH]
        _labels = _labels[:MAX_LENGTH]
    return {
        "input_ids": _input_ids,
        "attention_mask": _attention_mask,
        "labels": _labels
    }
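# Quick sanity check on one hypothetical record: decode the tokens back and
# count how many label positions are actually unmasked.
_sample = preprocess_dataset({"instruction": "1+1等于几?", "output": "1+1等于2。"})
print(_tokenizer.decode(_sample["input_ids"]))
print(sum(l != -100 for l in _sample["labels"]), "label tokens contribute to the loss")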
# apply the preprocessing to the whole dataset
_datasets = _datasets.map(preprocess_dataset, remove_columns=_datasets.column_names)
_datasets = _datasets.shuffle()
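# Verify the result: number of examples and length of the first tokenized sample.
print(len(_datasets), len(_datasets[0]["input_ids"]))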
# fine-tune
# training config (LoRA)
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules="all-linear",
    task_type=TaskType.CAUSAL_LM,  # causal language modeling
)
# wrap the quantized base model with the LoRA adapters
_model = get_peft_model(_model, lora_config)
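# Show how small the trainable adapter is compared with the full model.
# (For k-bit/QLoRA training, peft also provides prepare_model_for_kbit_training();
# it is not used here so the original flow stays unchanged.)
_model.print_trainable_parameters()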
# training
# training arguments
_training_args = TrainingArguments(
    output_dir='checkpoints/lora',
    gradient_accumulation_steps=2,   # effective batch size = 16 * 2 = 32 per device
    per_device_train_batch_size=16,  # batch size per device
    save_steps=300,
    logging_steps=100,
    num_train_epochs=300
)
_trainer = Trainer(
    model=_model,
    args=_training_args,
    train_dataset=_datasets,
    data_collator=DataCollatorForSeq2Seq(tokenizer=_tokenizer, padding=True)
)
_trainer.train()
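# Also save the final adapter weights explicitly ("checkpoints/lora/final" is an
# illustrative path, not from the original write-up).
_model.save_pretrained("checkpoints/lora/final")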
# load the model after fine-tuning
# the fine-tuned (not yet merged) adapter is loaded via PeftModel
from peft import PeftModel
from modelscope import AutoModelForCausalLM, AutoTokenizer  # re-imported in case this part runs in a fresh session
_model = AutoModelForCausalLM.from_pretrained("/root/.cache/modelscope/hub/qwen/Qwen2-0___5B",
torch_dtype="auto",
device_map="auto")
_tokenizer = AutoTokenizer.from_pretrained("/root/.cache/modelscope/hub/qwen/Qwen2-0___5B")
peft_model = PeftModel.from_pretrained(model=_model, model_id="your_path/checkpoint-3750")
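# Inference only from here on, so switch the adapter-wrapped model to eval mode.
peft_model = peft_model.eval()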
# ask a question
# use the model to generate an answer
def ask(question, model, tokenizer):
    messages = [
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": question}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    generated_ids = model.generate(**model_inputs, max_new_tokens=128)
    # drop the prompt tokens, keep only the newly generated part
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return answer
ask("你是谁?", model, tokenizer)
# merge the LoRA weights back into the base model
ckpt_list = ["checkpoint-3750"]  # checkpoint folder names under the training output_dir
for checkpoint in ckpt_list:
    print('Merge checkpoint: {}'.format(checkpoint))
    model = PeftModel.from_pretrained(_model, os.path.join("your_path", checkpoint))
    model = model.merge_and_unload()
    print('merge config =', model.config)
# save the merged model
model.save_pretrained("fine-tuned-model_path")
# note: the tokenizer itself is unchanged; save it alongside so the folder is self-contained
_tokenizer.save_pretrained("fine-tuned-model_path")
# load new model
model = AutoModelForCausalLM.from_pretrained("fine-tuned-model_path")
tokenizer = AutoTokenizer.from_pretrained("fine-tuned-model_path")