大模型预训练记录

前提:任务要求使用 3 万篇 PDF 文献进行预训练。先用 MinerU 将其转换为 Markdown 格式,再对 Markdown 做初步清洗,得到用于预训练测试的语料。

选用qwen为基座模型,采用lora预训练。以下是预训练代码。

from modelscope import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import TaskType, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
import torch
import os
import markdown
from bs4 import BeautifulSoup
import nltk
import matplotlib.pyplot as plt



# 下载句子分割器
# nltk.download()


# 下载punkt数据,如果未下载过
# Ensure the punkt sentence tokenizer data is available, downloading it once
# if missing.
# FIX: download_dir must be a directory NLTK actually searches. The default
# search path includes ~/nltk_data (no leading dot); the original used
# '~/.nltk_data', so nltk.data.find() would never see the downloaded data
# and the download would repeat on every run.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', download_dir=os.path.expanduser('~/nltk_data'))
    
# Restrict visible CUDA devices if needed.
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Download the base model snapshot and load its tokenizer.
model_dir = snapshot_download('qwen/Qwen2-0.5B-Instruct')
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# device_map="auto" lets accelerate place the weights on the available
# device(s); .train() enables dropout etc. for training.
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
).train()

# FIX: removed the explicit model.to("cuda") that followed. With
# device_map="auto" the model is already dispatched by accelerate; calling
# .to() on a dispatched model is redundant and raises an error in recent
# transformers versions.

# LoRA configuration: low-rank adapters on the attention and MLP projection
# matrices of the Qwen2 architecture.
peft_config = LoraConfig(
        r=8,
        target_modules=["q_proj",
                        "v_proj",
                        "k_proj",
                        "o_proj",
                        "gate_proj",
                        "down_proj",
                        "up_proj"
                        ],
        task_type=TaskType.CAUSAL_LM,
        lora_alpha=16,
        lora_dropout=0.05
    )
model = get_peft_model(model, peft_config)

# Number of training epochs.
num_epochs = 1
# Small learning rate — a larger one made training diverge (see the note
# at the end of the write-up).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Per-chunk loss values, collected for plotting after training.
loss_values = []


# NOTE: the effective batch size (chunk granularity) may need tuning later.


def read_complete_sentences(file_path, chunk_size=512):
    """Read a Markdown file and yield chunks made of complete sentences.

    Sentences (split with ``nltk.sent_tokenize``) are greedily packed into
    chunks of at most ``chunk_size`` characters; a sentence is never cut in
    the middle.

    Args:
        file_path: path of the UTF-8 text/Markdown file to read.
        chunk_size: soft upper bound on chunk length, in characters. A
            single sentence longer than ``chunk_size`` is yielded on its own.

    Yields:
        Non-empty, stripped strings containing one or more whole sentences.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    sentences = nltk.sent_tokenize(text)  # split into sentences

    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= chunk_size:
            chunk += sentence + " "
        else:
            # FIX: only yield when the chunk actually holds text. The
            # original yielded an empty string whenever the very first
            # sentence already exceeded chunk_size (chunk was still "").
            if chunk.strip():
                yield chunk.strip()
            chunk = sentence + " "  # start a new chunk

    if chunk.strip():  # flush the final chunk
        yield chunk.strip()


# Train over every Markdown file in a directory, one chunk at a time.
def process_markdown_files(directory, accumulation_steps=4):
    """Run LoRA training over all ``.md`` files in ``directory``.

    Uses the module-level ``model``, ``tokenizer``, ``optimizer``,
    ``num_epochs`` and ``loss_values``. Gradients are accumulated over
    ``accumulation_steps`` chunks before each optimizer step — stepping on
    every single chunk destabilized training (see the note at the end of
    the write-up: ~4 chunks per update works well here).

    Args:
        directory: folder containing the cleaned Markdown files.
        accumulation_steps: number of chunks whose gradients are
            accumulated before one ``optimizer.step()``.
    """
    for epoch in range(num_epochs):
        print(f"第 {epoch + 1} 轮训练开始")

        step = 0
        optimizer.zero_grad()
        for filename in os.listdir(directory):
            if filename.endswith('.md'):
                file_path = os.path.join(directory, filename)
                # FIX: the original printed a literal placeholder instead of
                # the name of the file being processed.
                print(f"处理文件: {filename}")

                for chunk in read_complete_sentences(file_path):
                    inputs = tokenizer(chunk, return_tensors="pt")
                    inputs = {k: v.to("cuda").long() if k == "input_ids" else v.to("cuda") for k, v in inputs.items()}

                    # Causal-LM objective: labels are the inputs themselves.
                    inputs["labels"] = inputs["input_ids"].clone()

                    output = model(**inputs)
                    # Scale so the accumulated gradient equals the mean loss
                    # over the accumulation window.
                    loss = output.loss / accumulation_steps

                    # Record the unscaled loss for the plot.
                    loss_values.append(output.loss.item())

                    loss.backward()
                    step += 1
                    if step % accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()

        # Flush any remaining accumulated gradients at epoch end.
        if step % accumulation_steps != 0:
            optimizer.step()
            optimizer.zero_grad()

# Kick off training over the cleaned Markdown corpus.
process_markdown_files('/home/yunfei/Qwen/clean_MD')

# Persist the trained adapter and the tokenizer side by side.
save_dir = "output_model_1"
model.save_pretrained(save_dir, is_main_process=True)
tokenizer.save_pretrained(save_dir)

# Visualize the per-chunk training loss curve.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_subplot(1, 1, 1)
ax.plot(loss_values, label='Loss', color='blue')
ax.set_title('Training Loss')
ax.set_xlabel('Batch Number')
ax.set_ylabel('Loss')
ax.legend()
ax.grid()
plt.show()

遇到的问题:训练发散了(模型输出不连贯、"不讲人话")。原因之一是学习率过高,调低到 lr=1e-5 即可;同时还要降低参数更新(optimizer.step)的频率——不能每个 chunk 都更新一次,大约每累积 4 个 chunk 的梯度再更新一次比较合适。

加载测试代码采用qwen官方的

from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import snapshot_download
import torch

# Resolve the base model snapshot (kept to warm the local cache) and the
# fine-tuned adapter directory produced by the training script above.
model_name = snapshot_download('qwen/Qwen2-0.5B-Instruct')
model_pretrain = '/home/yunfei/Qwen/output_model_1'

# device_map="auto" already places the model on GPU/CPU, so no explicit
# .to(device) call is needed (and calling .to() on a dispatched model can
# raise an error in recent transformers versions).
model = AutoModelForCausalLM.from_pretrained(
    model_pretrain,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_pretrain)

# Send inputs to wherever the model (first shard) actually lives.
device = model.device

prompt = "你好"
messages = [
    {"role": "system", "content": "你是一个智能助理,必须帮助用户解决问题."},
    {"role": "user", "content": prompt}
]

# Render the chat template to plain text and tokenize it.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

# FIX: the original passed temperature=0.00001 to approximate greedy
# decoding; do_sample=False requests greedy decoding directly and avoids
# invalid-temperature warnings.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
    do_sample=False
)
# Strip the prompt tokens, keeping only the newly generated ones.
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值