前提:任务要求使用 3 万篇 PDF 文献进行预训练。先采用 MinerU 将 PDF 转换为 Markdown 格式,再对 Markdown 做初步清洗,用于预训练测试。
基座模型选用 Qwen,采用 LoRA 方式进行预训练。以下是预训练代码。
from modelscope import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import TaskType, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
import torch
import os
import markdown
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt
# Ensure the NLTK "punkt" sentence tokenizer data is available.
# (nltk.download() with no args opens the interactive downloader; avoid it.)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', download_dir=os.path.expanduser('~/.nltk_data'))

# Restrict visible CUDA devices if needed.
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Download the base model from ModelScope and load tokenizer + model.
model_dir = snapshot_download('qwen/Qwen2-0.5B-Instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype="auto",
    device_map="auto",  # accelerate dispatches the weights onto available devices
    trust_remote_code=True
).train()
# Fix: removed `model.to("cuda")` -- the model is already placed by
# device_map="auto", and calling .to() on a dispatched model is rejected
# by recent accelerate versions.

# LoRA configuration: adapt every attention and MLP projection matrix.
peft_config = LoraConfig(
    r=8,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "gate_proj",
        "down_proj",
        "up_proj",
    ],
    task_type=TaskType.CAUSAL_LM,
    lora_alpha=16,
    lora_dropout=0.05,
)
model = get_peft_model(model, peft_config)

# Number of passes over the corpus.
num_epochs = 1
# Deliberately low learning rate: higher rates made training collapse.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Per-chunk loss values, collected for plotting after training.
loss_values = []
# TODO(review): revisit the effective batch size (currently one chunk per step).
def read_complete_sentences(file_path, chunk_size=512):
    """Read a Markdown file and yield chunks made of complete sentences.

    Sentences are packed greedily: a sentence is appended to the current
    chunk as long as the total stays within ``chunk_size`` characters, so
    no sentence is ever split across chunks.

    Args:
        file_path: Path to a UTF-8 encoded Markdown file.
        chunk_size: Soft upper bound, in characters, for each chunk.

    Yields:
        str: A whitespace-stripped group of one or more sentences. A single
        sentence longer than ``chunk_size`` is yielded on its own.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    sentences = nltk.sent_tokenize(text)  # split into sentences
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= chunk_size:
            chunk += sentence + " "
        else:
            # Fix: only yield non-empty chunks. Previously, when the very
            # first sentence of a chunk exceeded chunk_size, the empty
            # string was yielded (and later fed to the tokenizer).
            if chunk:
                yield chunk.strip()
            chunk = sentence + " "  # start a new chunk
    if chunk:  # flush the final chunk
        yield chunk.strip()
# Process the Markdown files one by one for training.
def process_markdown_files(directory, accumulation_steps=4):
    """Train the LoRA-wrapped model on every ``.md`` file in *directory*.

    Each file is split into sentence chunks and every chunk becomes one
    causal-LM example (labels are a copy of input_ids). Gradients are
    accumulated over ``accumulation_steps`` chunks before each optimizer
    step: per the training notes, stepping on every single chunk made
    training collapse.

    Args:
        directory: Folder containing the cleaned ``.md`` files.
        accumulation_steps: Number of chunks per optimizer update
            (default 4, as suggested in the notes).
    """
    step = 0
    for epoch in range(num_epochs):
        print(f"第 {epoch + 1} 轮训练开始")
        for filename in os.listdir(directory):
            if not filename.endswith('.md'):
                continue
            file_path = os.path.join(directory, filename)
            # Fix: the f-string previously had no placeholder, so the
            # filename being processed was never shown.
            print(f"处理文件: {filename}")
            for chunk in read_complete_sentences(file_path):
                inputs = tokenizer(chunk, return_tensors="pt")
                inputs = {k: v.to("cuda").long() if k == "input_ids" else v.to("cuda") for k, v in inputs.items()}
                inputs["labels"] = inputs["input_ids"].clone()
                output = model(**inputs)
                # Record the *unscaled* loss for plotting.
                loss_values.append(output.loss.item())
                # Scale so the accumulated gradient equals the mean over the
                # accumulation window.
                (output.loss / accumulation_steps).backward()
                step += 1
                if step % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
        # Flush any leftover accumulated gradients at the end of the epoch.
        if step % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()
# Run training over the directory of cleaned Markdown files.
process_markdown_files('/home/yunfei/Qwen/clean_MD')
# Save the trained weights and tokenizer.
# NOTE(review): since get_peft_model wrapped the model, this saves only the
# LoRA adapter (adapter weights + config), not a merged full model -- confirm
# the loading side handles adapter checkpoints.
model.save_pretrained("output_model_1", is_main_process=True)
tokenizer.save_pretrained("output_model_1")
# Plot the per-chunk training loss curve.
plt.figure(figsize=(10, 5))
plt.plot(loss_values, label='Loss', color='blue')
plt.title('Training Loss')
plt.xlabel('Batch Number')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()
遇到的问题:训练崩掉了(模型输出不连贯、"不讲人话")。原因之一是学习率太高,需要调低,lr=1e-5 即可;同时要降低参数更新的频率,不能每个 chunk 都更新一次,大约每 4 个 chunk 做一次梯度累积更新比较合适。
加载与测试代码采用 Qwen 官方示例。
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import snapshot_download
import torch

# Base model download (kept to warm the cache; the fine-tuned checkpoint
# below is what actually gets loaded).
model_name = snapshot_download('qwen/Qwen2-0.5B-Instruct')
# Directory written by the training script's save_pretrained call.
# NOTE(review): that directory holds a LoRA *adapter*; loading it through
# AutoModelForCausalLM relies on transformers' PEFT integration -- verify
# the adapter is actually applied (otherwise use AutoPeftModelForCausalLM).
model_pretrain = '/home/yunfei/Qwen/output_model_1'
model = AutoModelForCausalLM.from_pretrained(
    model_pretrain,
    torch_dtype="auto",
    device_map="auto",  # accelerate places the model; no manual .to() needed
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_pretrain)
# Fix: removed `model.to(device)` -- moving a device_map="auto" dispatched
# model is rejected by accelerate. Inputs are placed via model.device below.

prompt = "你好"
messages = [
    {"role": "system", "content": "你是一个智能助理,必须帮助用户解决问题."},
    {"role": "user", "content": prompt}
]
# Render the chat messages into the model's prompt format, appending the
# assistant generation prefix.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
# Fix: the original passed temperature=0.00001 without enabling sampling,
# so it was ignored (and near-zero temperature with sampling is numerically
# fragile). Explicit greedy decoding is the intended deterministic behavior.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
    do_sample=False
)
# Strip the prompt tokens, keeping only the newly generated continuation.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)