Manual pip install walkthrough - AI超元域 超哥 [Zero-code fine-tuning of the Llama 3.1 8B model! Chinese text chunking + dataset creation! Axolotl + qLoRA ten-minute lightning fine-tuning to build a legal LLM]


This article is a manual pip install implementation of the AI超元域 (超哥) video tutorial.

I have already built the corresponding AutoDL image; it is currently under review, so it may not be accessible yet.

01 Installing CUDA + cuDNN

For reference, see https://blog.csdn.net/fengxiaoyangfeng/article/details/134315290

### Version requirement: either 11.8 or 12.1 works
cuda: 12.1
cudnn: cudnn-linux-x86_64-8.8.1.3_cuda12-archive.tar

02 Installing the CUDA build of PyTorch

Note: do not use a Python version higher than 3.10 here, otherwise some of the Axolotl-related packages later on will be incompatible.

# Create the virtual environment
conda create -n chat310 python=3.10
# Install torch
(chat310) E:\share\github\08yue\axolotl> pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
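Once the wheel is installed, a quick sanity check (a minimal sketch; the exact version strings will differ on your machine) confirms that the CUDA 12.1 build was actually picked up:

import torch

print(torch.__version__)          # should end in +cu121
print(torch.version.cuda)         # should report 12.1
print(torch.cuda.is_available())  # True when the driver and GPU are visible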

03 Installing the axolotl package

git clone https://github.com/axolotl-ai-cloud/axolotl
cd axolotl

pip3 install packaging ninja
pip3 install -e '.[flash-attn,deepspeed]'
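To verify that the editable install and its optional extras are importable, a minimal check (assuming a GPU machine; flash_attn will not import on a CPU-only box):

import axolotl, deepspeed, flash_attn

print(axolotl.__file__)       # should point into the cloned repo (editable install)
print(deepspeed.__version__)
print(flash_attn.__version__)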

The command above may fail with an error related to the fschat package.

The reason is that pip3 install -e '.[flash-attn,deepspeed]' installs the packages listed in requirements.txt, which contains this line:
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
The package pinned by that line has since been removed by the FastChat maintainers, so the only workaround is to delete everything after the @ on that line of requirements.txt and install fschat without the pin.
(Note, however, that this later causes the AttributeError: LLAMA3. Did you mean: 'LLAMA2'? error during training; see below.)
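If you would rather not edit requirements.txt by hand, a small script can strip the dead pin before re-running the install. This is only a sketch of the edit described above: it rewrites the fschat line to a plain "fschat" requirement and leaves everything else untouched.

from pathlib import Path

req = Path("requirements.txt")  # run from the axolotl repo root
lines = req.read_text().splitlines()
# Replace the pinned git requirement with a plain "fschat" requirement
patched = ["fschat" if line.startswith("fschat @ git+") else line for line in lines]
req.write_text("\n".join(patched) + "\n")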

04 Training

1. Preprocess the dataset
rm examples/llama-3/qlora.yml
wget -P examples/llama-3/ https://raw.githubusercontent.com/win4r/mytest/main/qlora.yml

CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/llama-3/qlora.yml
2. qLoRA fine-tuning
accelerate launch -m axolotl.cli.train examples/llama-3/qlora.yml

3. Inference after fine-tuning


accelerate launch -m axolotl.cli.inference examples/llama-3/qlora.yml \
    --lora_model_dir="./outputs/qlora-out"

# gradio
accelerate launch -m axolotl.cli.inference examples/llama-3/qlora.yml \
    --lora_model_dir="./outputs/qlora-out" --gradio
If you get this error:

  File "/root/axolotl/src/axolotl/prompt_strategies/sharegpt.py", line 50, in register_llama3_template
    sep_style=SeparatorStyle.LLAMA3,
  File "/root/miniconda3/lib/python3.10/enum.py", line 437, in __getattr__
    raise AttributeError(name) from None
AttributeError: LLAMA3. Did you mean: 'LLAMA2'?

then edit the following file:
axolotl/src/axolotl/prompt_strategies/sharegpt.py
def register_llama3_template(system_message=None):
    system_message = system_message or "You are a helpful assistant."
    register_conv_template(
        Conversation(
            name="llama3",
            system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
            system_message=system_message,
            roles=("user", "assistant"),
            # change this line
            sep_style=SeparatorStyle.LLAMA2,
            sep="",
            stop_str="<|eot_id|>",
            stop_token_ids=[128001, 128009],
        )
    )
That is, change sep_style=SeparatorStyle.LLAMA3 to sep_style=SeparatorStyle.LLAMA2.
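Whether this edit is needed depends on which fschat build you ended up with; you can check directly whether its SeparatorStyle enum already includes LLAMA3:

from fastchat.conversation import SeparatorStyle

# False means the installed fschat predates the LLAMA3 style,
# so the sharegpt.py edit above is required.
print(hasattr(SeparatorStyle, "LLAMA3"))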

05 Merging the model and pushing it to Hugging Face

# Merge the model
python3 -m axolotl.cli.merge_lora examples/llama-3/qlora.yml --lora_model_dir="./outputs/qlora-out"

# After the merge, push to Hugging Face
huggingface-cli login
# then enter a token with write permission
huggingface-cli upload leo009/merged-llama3.1-8b  outputs/qlora-out/merged
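If you prefer to push from Python instead of the CLI, the huggingface_hub library exposes the same operation. A minimal sketch, assuming a write token is already configured via huggingface-cli login or the HF_TOKEN environment variable:

from huggingface_hub import HfApi

api = HfApi()
# Create the repo if it does not exist yet, then upload the merged weights
api.create_repo("leo009/merged-llama3.1-8b", exist_ok=True)
api.upload_folder(
    folder_path="outputs/qlora-out/merged",
    repo_id="leo009/merged-llama3.1-8b",
    repo_type="model",
)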

06 Dataset chunking

In 超哥's original code, the part that uses bert-base-chinese for text chunking to generate the dataset is written in a simple way: it does not use CUDA or a batch_size, so it can be a bit slow. I modified it slightly here to speed it up:

import torch
from transformers import BertTokenizer, BertModel
import re
import os
from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader, Dataset

# Check whether CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SentenceDataset(Dataset):
    def __init__(self, sentences, tokenizer, max_length):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.tokenizer(
            self.sentences[idx],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )


def collate_fn(batch):
    input_ids = torch.cat([item["input_ids"] for item in batch], dim=0)
    attention_mask = torch.cat([item["attention_mask"] for item in batch], dim=0)
    return {"input_ids": input_ids, "attention_mask": attention_mask}


def get_sentence_embeddings(sentences, model, tokenizer, batch_size=8):
    dataset = SentenceDataset(sentences, tokenizer, max_length=512)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    all_embeddings = []
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch onto the GPU (or CPU if no GPU is available)
            batch = {key: value.to(device) for key, value in batch.items()}
            outputs = model(**batch)
            embeddings = outputs.last_hidden_state.mean(dim=1)
            all_embeddings.append(embeddings.cpu())  # move the embeddings back to the CPU

    return torch.cat(all_embeddings).numpy()


def split_text_by_semantic(text, max_length, similarity_threshold=0.5, batch_size=8):
    tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    model = BertModel.from_pretrained("bert-base-chinese").to(device)
    model.eval()

    # Chinese punctuation marks used as split points
    chinese_punctuation = "。!?;“”()……"

    # Split each line on the Chinese punctuation (kept via the capture group)
    sentences = []
    for part in text:
        part = part.strip()
        sub_parts = re.split(f"([{chinese_punctuation}])", part)
        sentences.extend([sub for sub in sub_parts if sub])  # drop empty strings

    # sentences = re.split(r"(。|!|?|;)", text)
    # Re-attach each punctuation mark to the sentence fragment that precedes it
    sentences = [s + p for s, p in zip(sentences[::2], sentences[1::2]) if s]

    chunks = []
    current_chunk = sentences[0]
    embeddings = get_sentence_embeddings(sentences, model, tokenizer, batch_size)
    current_embedding = embeddings[0]

    for idx in range(1, len(sentences)):
        sentence_embedding = embeddings[idx]
        similarity = 1 - cosine(current_embedding, sentence_embedding)

        if (
            similarity > similarity_threshold
            and len(tokenizer.tokenize(current_chunk + sentences[idx])) <= max_length
        ):
            current_chunk += sentences[idx]
            current_embedding = (current_embedding + sentence_embedding) / 2
        else:
            chunks.append(current_chunk)
            current_chunk = sentences[idx]
            current_embedding = sentence_embedding

    if current_chunk:
        chunks.append(current_chunk)

    return chunks


def read_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.readlines()


def save_chunks_to_files(chunks, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for i, chunk in enumerate(chunks):
        chunk_file_path = os.path.join(output_dir, f"chunk_{i + 1}.txt")
        with open(chunk_file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        print(f"已保存第 {i + 1} 个文本块到 {chunk_file_path}")


# Main program


input_file_path = r"./book/1.txt"
output_dir = "./红楼梦chunk/"

long_text = read_text_file(input_file_path)

max_length = 512
similarity_threshold = 0.7
batch_size = 64

text_chunks = split_text_by_semantic(
    long_text, max_length, similarity_threshold, batch_size
)

save_chunks_to_files(text_chunks, output_dir)
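To turn the saved chunks into training data, one option is to wrap each chunk in a ShareGPT-style conversation record, since that is the prompt format the sharegpt.py strategy above handles. The sketch below is only an illustration under that assumption: the human-turn prompt is a placeholder, and the exact fields your qlora.yml expects depend on its datasets section.

import json
import os

def chunks_to_sharegpt_jsonl(chunk_dir, output_path, prompt="Placeholder instruction for this chunk"):
    # Each chunk becomes one two-turn conversation; replace the placeholder
    # prompt with real instructions (or generated Q&A) for your actual task.
    names = [n for n in os.listdir(chunk_dir) if n.startswith("chunk_") and n.endswith(".txt")]
    # File names are chunk_<n>.txt, so sort numerically rather than lexically
    names.sort(key=lambda n: int(n[len("chunk_"):-len(".txt")]))
    with open(output_path, "w", encoding="utf-8") as out:
        for name in names:
            with open(os.path.join(chunk_dir, name), "r", encoding="utf-8") as f:
                chunk = f.read().strip()
            record = {
                "conversations": [
                    {"from": "human", "value": prompt},
                    {"from": "gpt", "value": chunk},
                ]
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

chunks_to_sharegpt_jsonl(output_dir, "./dataset.jsonl")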
