Personal Copilot: Train Your Own Coding Assistant with LLMs

Table of contents

Data collection

Fine-tuning the model

Code chat assistant

Loading multiple adapters into one model

You can also set a single adapter

What is a Tuner (adapter)? Notes from the Hugging Face docs

Tuners

BaseTuner

Class methods

inject_adapter

merge_adapter

unmerge_adapter

BaseTunerLayer

Class methods

delete_adapter

enable_adapters

get_base_layer

set_adapter

Adapter inference example

In the ever-evolving field of programming and software development, the pursuit of efficiency and productivity has driven many remarkable innovations. One notable innovation is the emergence of code generation models such as Codex, StarCoder, and Code Llama. These models show an impressive ability to generate human-like code snippets, and therefore great potential as coding assistants.

You can tailor a code generation model to your own specific needs, and such a personalized coding assistant can be deployed at enterprise scale.

Data collection

The dataset is conceptually simple; each record has the following three fields:

repository name | file path within the repository | file content
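To make the schema concrete, a single record could look like the dictionary below (the values are made up purely for illustration):

example_record = {
    "repo_id": "transformers",                    # repository name
    "file_path": "src/transformers/trainer.py",   # path of the file inside the repository
    "content": "class Trainer:\n    ...",         # raw text of the file
}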

Scraping code through the GitHub API can run into rate limits, so instead we clone all of the organization's public repositories locally. The download script is shown below.

import os
import subprocess
from multiprocessing import Pool
from github import Github

ORG = "huggingface"
MIRROR_DIRECTORY = "hf_public_repos"


def get_repos(username, access_token=None, include_fork=False):
    """Fetches repositories for a particular GitHub user.

    Courtesy: Chansung Park.
    """
    g = Github(access_token)
    user = g.get_user(username)

    results = []
    for repo in user.get_repos():
        if repo.fork is False:
            results.append(repo.name)
        else:
            if include_fork is True:
                results.append(repo.name)

    return results


def mirror_repository(repository):
    """Locally clones a repository."""
    repository_url = f"https://github.com/{ORG}/{repository}.git"
    repository_path = os.path.join(MIRROR_DIRECTORY, repository)

    # Clone the repository
    subprocess.run(["git", "clone", repository_url, repository_path])


def mirror_repositories():
    # Create the local mirror directory if it does not exist yet.
    if not os.path.exists(MIRROR_DIRECTORY):
        os.makedirs(MIRROR_DIRECTORY)

    # Get the list of repositories in the organization.
    # GH_ACCESS_TOKEN is a GitHub personal access token used to authenticate
    # against the GitHub API and for other operations that require auth; you
    # can create one on the GitHub settings page (Personal Access Tokens) or
    # via an OAuth app.
    if not os.environ["GH_ACCESS_TOKEN"]:
        raise ValueError("You must set `GH_ACCESS_TOKEN` as an env variable.")
    # ORG is the GitHub user/organization to mirror.
    repositories = get_repos(ORG, os.environ["GH_ACCESS_TOKEN"])
    print(f"Total repositories found: {len(repositories)}.")

    # Clone the repositories in parallel with a process pool.
    print("Cloning repositories.")
    with Pool() as pool:
        pool.map(mirror_repository, repositories)


if __name__ == "__main__":
    mirror_repositories()

To keep serialization memory-friendly (i.e., to avoid holding too much data in memory while processing the code), we process the files in chunks and store them in the Feather format. The full implementation is in the script below.

import os
import pandas as pd
from nbformat import reads, NO_CONVERT
from tqdm import tqdm
from datasets import Dataset
from typing import Dict
from huggingface_hub import HfApi, create_repo
import tempfile
import subprocess

MIRROR_DIRECTORY = "hf_public_repos"
DATASET_ID = "hf-codegen"
SERIALIZE_IN_CHUNKS = 10000
FEATHER_FORMAT = "ftr"

# Block the following formats.
IMAGE = ["png", "jpg", "jpeg", "gif"]
VIDEO = ["mp4", "jfif"]
DOC = [
    "key",
    "PDF",
    "pdf",
    "docx",
    "xlsx",
    "pptx",
]
AUDIO = ["flac", "ogg", "mid", "webm", "wav", "mp3"]
ARCHIVE = ["jar", "aar", "gz", "zip", "bz2"]
MODEL = ["onnx", "pickle", "model", "neuron"]
OTHERS = [
    "npy",
    "index",
    "inv",
    "DS_Store",
    "rdb",
    "pack",
    "idx",
    "glb",
    "gltf",
    "len",
    "otf",
    "unitypackage",
    "ttf",
    "xz",
    "pcm",
    "opus",
]
# Pack all blocked extensions into a single tuple for `str.endswith`.
ANTI_FOMATS = tuple(IMAGE + VIDEO + DOC + AUDIO + ARCHIVE + OTHERS)


def upload_to_hub(file_format: str, repo_id: str):
    """Moves all the files matching `file_format` to a folder and
    uploads the folder to the Hugging Face Hub."""
    api = HfApi()
    repo_id = create_repo(repo_id=repo_id, exist_ok=True, repo_type="dataset").repo_id

    with tempfile.TemporaryDirectory() as tmpdirname:
        # `TemporaryDirectory` already creates the directory, and the `mv`
        # glob needs a shell in order to be expanded.
        command = f"mv *.{file_format} {tmpdirname}"
        _ = subprocess.run(command, shell=True)
        api.upload_folder(repo_id=repo_id, folder_path=tmpdirname, repo_type="dataset")


def filter_code_cell(cell) -> bool:
    """Filters a code cell w.r.t shell commands, etc."""
    only_shell = cell["source"].startswith("!")
    only_magic = "%%capture" in cell["source"]
    if only_shell or only_magic:
        return False
    else:
        return True


def process_file(directory_name: str, file_path: str) -> Dict[str, str]:
    """Processes a single file."""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            if file_path.endswith("ipynb"):
                # For notebooks, keep only the code cells.
                code_cell_str = ""
                # Parse the notebook JSON with nbformat.
                notebook = reads(content, NO_CONVERT)
                code_cells = [
                    c
                    for c in notebook["cells"]
                    if c["cell_type"] == "code"
                    # Drop shell commands ("!...") and %%capture cells.
                    if filter_code_cell(c)
                ]
                # Concatenate the "source" of the remaining code cells.
                for cell in code_cells:
                    code_cell_str += cell["source"]
                content = code_cell_str
    except Exception:
        content = ""

    return {
        "repo_id": directory_name,
        "file_path": file_path,
        "content": content,
    }


def read_repository_files(directory) -> pd.DataFrame:
    """Reads the files from the locally cloned repositories."""
    file_paths = []
    # Three columns: repository name, file path, file content.
    df = pd.DataFrame(columns=["repo_id", "file_path", "content"])
    chunk_flag = 0

    # Recursively find all files within the directory.
    for root, _, files in os.walk(directory):
        for file in files:
            file_path = os.path.join(root, file)
            if not file_path.endswith(ANTI_FOMATS) and all(
                k not in file_path for k in [".git", "__pycache__", "xcodeproj"]
            ):
                # Collect (repository directory, file path) pairs.
                file_paths.append((os.path.dirname(root), file_path))

    # Process files sequentially.
    print(f"Total file paths: {len(file_paths)}.")
    print("Reading file contents...")

    for i, (directory_name, file_path) in enumerate(tqdm(file_paths)):
        # Each processed file becomes a dict:
        # {"repo_id": ..., "file_path": ..., "content": ...}
        file_content = process_file(directory_name, file_path)
        # Skip empty/unreadable files; otherwise append to the dataframe.
        if file_content["content"] != "":
            temp_df = pd.DataFrame.from_dict([file_content])
            df = pd.concat([df, temp_df])

            if (
                # Serialize every SERIALIZE_IN_CHUNKS (10,000) rows.
                SERIALIZE_IN_CHUNKS
                and len(df) != 0
                and (len(df) % SERIALIZE_IN_CHUNKS == 0)
            ):
                # FEATHER_FORMAT = "ftr" is the file extension used.
                df_path = f"df_chunk_{chunk_flag}_{len(df)}.{FEATHER_FORMAT}"
                print(f"Serializing dataframe to {df_path}...")
                # reset_index() re-adds the index as a column before saving.
                df.reset_index().to_feather(df_path)
                del df
                df = pd.DataFrame(columns=["repo_id", "file_path", "content"])
                chunk_flag += 1

    return df


if __name__ == "__main__":
    # Read files from the locally mirrored repositories.
    df = read_repository_files(MIRROR_DIRECTORY)
    print("DataFrame created, creating dataset...")
    # Upload the serialized chunks to the Hub.
    upload_to_hub(file_format=FEATHER_FORMAT, repo_id=DATASET_ID)
    print(f"{FEATHER_FORMAT} files uploaded to the Hub.")
    # If chunked serialization is disabled, push the in-memory dataframe instead.
    if not SERIALIZE_IN_CHUNKS:
        dataset = Dataset.from_pandas(df)
        dataset.push_to_hub(DATASET_ID, private=True)

The final dataset is available here: sayakpaul/hf-codegen-v2 · Datasets at Hugging Face

smangrul/hf-stack-v1 at main (huggingface.co)

The data used for fine-tuning is restricted to the top 10 Hugging Face repositories ranked by stars:

['transformers', 'pytorch-image-models', 'datasets', 'diffusers', 'peft', 'tokenizers', 'accelerate', 'text-generation-inference', 'chat-ui', 'deep-rl-class']
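As a quick sanity check, the collected dataset can be loaded and inspected with the 🤗 datasets library. A minimal sketch, assuming the dataset id above exposes the repo_id / file_path / content columns described earlier:

from datasets import load_dataset

# Stream the dataset so we do not download every file's content at once.
ds = load_dataset("smangrul/hf-stack-v1", split="train", streaming=True)

for example in ds.take(1):
    print(example["repo_id"], example["file_path"])
    print(example["content"][:200])  # first 200 characters of the file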

Fine-tuning the model

Fine-tuning on Hugging Face

bigcode/starcoder · Hugging Face

StarCoder is a 15.5B-parameter model trained on 80+ programming languages from The Stack (v1.2), with opt-out requests excluded. The model uses multi-query attention, has a context window of 8192 tokens, and was trained with the fill-in-the-middle objective on 1 trillion tokens.

It is not an instruction model: it was trained on GitHub code, so prompts like "write a function that computes the square root" do not work well. However, by using a tech-assistant prompt you can turn it into a capable technical assistant.
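Because the model was trained with the fill-in-the-middle objective, it can complete code given both a prefix and a suffix, not just left to right. The snippet below is a minimal sketch of plain completion and FIM prompting; it assumes StarCoder's FIM special tokens (<fim_prefix>, <fim_suffix>, <fim_middle>), which you should double-check against the tokenizer, and enough GPU memory to load the 15.5B model.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoder"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, torch_dtype=torch.bfloat16, device_map="auto"
)

# Plain left-to-right completion.
prompt = "def quicksort(arr):"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0]))

# Fill-in-the-middle: the model predicts the code between the prefix and the suffix.
fim_prompt = "<fim_prefix>def print_hello(name):\n    <fim_suffix>\n    return greeting\n<fim_middle>"
inputs = tokenizer(fim_prompt, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))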

Fine-tuning

The training code, together with an already fine-tuned model, is available here:

DHS-LLM-Workshop/personal_copilot/training at main · pacman100/DHS-LLM-Workshop · GitHub

  1. Model: bigcode/starcoder

  2. Dataset: smangrul/hf-stack-v1

    The complete fine-tuning script (reproduced in the repository linked above) is a standard 🤗 Accelerate causal-language-modeling training script: it parses the training arguments, loads and tokenizes the dataset, groups the texts into fixed-size blocks, creates a DeepSpeed-aware optimizer and learning-rate scheduler, runs the training loop with gradient accumulation, checkpointing and best-model tracking, evaluates perplexity on the validation split, and optionally pushes the final model to the Hub.

    The whole collected codebase is simply fed into training as-is.

  3. QLoRA fine-tuned model: smangrul/peft-lora-starcoder15B-v2-personal-copilot-A100-40GB-colab (a minimal QLoRA setup sketch follows below)
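To make item 3 concrete, here is a minimal sketch of what a QLoRA setup for StarCoder could look like with PEFT and bitsandbytes. The target module names and hyperparameters are illustrative assumptions, not the exact values used for the published checkpoint.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model_id = "bigcode/starcoder"

# Load the base model in 4-bit (NF4) so it fits on a single 40GB GPU.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)

# Cast norms/embeddings appropriately and enable gradient checkpointing.
model = prepare_model_for_kbit_training(model)

# Attach LoRA adapters; the target modules below are assumptions for
# StarCoder's attention/MLP projection layers.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj", "c_fc"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# From here, train with your favourite loop or the HF Trainer on the
# tokenized smangrul/hf-stack-v1 data.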

Code chat assistant

So far, the models we trained were trained specifically as personal copilots for code completion. They were not trained to hold conversations or answer questions. Octocoder and StarChat are great examples of such conversational models. This section briefly describes how to achieve that.

Resources

  1. Codebase: link. It uses the recently added Flash Attention V2 support in Transformers.

  2. Colab notebook: link. Make sure to choose an A100 GPU with the High RAM setting.

  3. Model: bigcode/starcoderplus

  4. Dataset: smangrul/code-chat-assistant-v1. A mix of LIMA and GUANACO, properly formatted for training.

  5. Trained PEFT chat model: smangrul/peft-lora-starcoderplus-chat-asst-A100-40GB-colab

So with LoRA we can train multiple adapters and mix their capabilities into a single base model, although hallucinations can still occur (and remain a challenging problem).

Loading multiple adapters into one model

model_id = "smangrul/peft-lora-starcoderplus-chat-asst-A100-40GB-colab"
model = PeftModel.from_pretrained(model, model_id, adapter_name="assistant") #
​
model_id = "smangrul/peft-lora-starcoder15B-v2-personal-copilot-A100-40GB-colab"
_ = model.load_adapter(model_id, adapter_name="copilot")
​
model.add_weighted_adapter(["copilot","assistant"], [1.0, 1.0], "code_buddy", combination_type="cat")
model.set_adapter("code_buddy")

You can also set a single adapter

model.set_adapter("copilot")

The following is optional background material.

What is a Tuner (adapter)? Notes from the Hugging Face docs

Tuners

A tuner (adapter) is a module that can be plugged into a torch.nn.Module. BaseTuner is the base class for the other tuners; it provides shared methods and attributes for preparing an adapter configuration and for replacing a target module with the adapter module.

BaseTunerLayer is the base class for all adapter layers. It provides methods and attributes for managing adapters, such as activating and disabling them.

BaseTuner

class peft.tuners.tuners_utils.BaseTuner

( model, peft_config: Union[PeftConfig, dict[str, PeftConfig]], adapter_name: str )
  • model: the original model to which the adapter tuner layers will be attached.

  • forward (Callable): the forward method of the model.

  • peft_config (Union[PeftConfig, dict[str, PeftConfig]]): the adapter configuration object; it should be a dictionary mapping adapter names to PeftConfig objects.

  • config: the model configuration object.

The base tuner model provides common methods and attributes for all tuners and can be plugged into a torch.nn.Module. To add a new tuner class, you need to override the following methods:

_prepare_adapter_config: a private method that finalizes the adapter config, for example when the target_modules field is missing.
_create_and_replace: a private method that creates the adapter module and replaces the target module with it.

_check_target_module_exists: a private helper method that checks whether the passed module's key name matches any of the target modules in the adapter config.

See how the peft.tuners.lora.LoraModel class is written for a concrete example.

Class methods

inject_adapter

( model: nn.Module, adapter_name: str )

model (nn.Module) — The model to be tuned.
adapter_name (str) — The adapter name.

Creates adapter layers and replaces the target modules with them. This method is called under the hood by peft.mapping.get_peft_model when a non-prompt-tuning adapter class is passed.

The corresponding PEFT config is retrieved directly from the peft_config attribute of the BaseTuner class.

merge_adapter

( adapter_names: Optional[list[str]] = None )

Parameters:
safe_merge (bool, optional): if True, the merge is performed on a copy of the original weights and checked for NaNs before merging. This is useful if you want to verify that the merge will not produce NaNs. Defaults to False.
adapter_names (list[str], optional): the list of adapter names that should be merged. If None, all active adapters are merged. Defaults to None.

This method merges the adapter layers into the base model.

Merging adapters can speed up the forward pass. A copy of the adapter weights is still kept in memory, which is needed to unmerge the adapters later. To merge the adapter weights without keeping them in memory, call merge_and_unload instead.

unmerge_adapter

This method unmerges all merged adapter layers from the base model weights.
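As a small illustration of the two methods above, on a LoRA model you can temporarily fold the adapter weights into the base weights for faster generation and then restore them. A sketch, assuming model is a PeftModel with a LoRA adapter loaded and input_ids prepared as in the examples in this post:

# Fold the LoRA weights into the base weights for faster generation.
model.merge_adapter()
merged_output = model.generate(input_ids=input_ids, max_new_tokens=64)

# Restore the original (unmerged) base weights so adapters can be swapped again.
model.unmerge_adapter()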

BaseTunerLayer

class peft.tuners.tuners_utils.BaseTunerLayer
Parameters:
is_pluggable (bool, optional): whether the adapter layer can be plugged into any PyTorch module.
active_adapters (Union[List[str], str], optional): the name(s) of the active adapter(s).

A tuner-layer mixin that provides common methods and attributes for all tuners.

Class methods

delete_adapter
Deletes an adapter from the layer.

This should be called on all adapter layers, otherwise we end up with an inconsistent state.

If the deleted adapter was the active adapter, this method also sets a new active adapter. It is important that the new adapter is chosen in a deterministic way so that the same adapter is selected on all layers.

enable_adapters
enabled (bool) — True to enable adapters, False to disable adapters.
Toggles adapters on or off, and takes care of setting the requires_grad flag on the adapter weights.

get_base_layer
(Recursively) gets the base_layer.
This is needed for the case where a tuner layer wraps another tuner layer.

set_adapter
( adapter_names: str | list[str] )

adapter_names (str or List[str]) — the name(s) of the adapter(s) to activate.

Sets the active adapter(s).

Adapter inference example

Dance of LoRAs

%%time
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

from dataclasses import dataclass, field
from typing import Optional
import contextlib

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
)

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel,
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="bfloat16",
#     bnb_4bit_use_double_quant=False,
# )

# device_map = {"": 0}
model = "bigcode/starcoder"
# Load the tokenizer and the base model.
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=None,    # no quantization here
    device_map=None,             # default device placement
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # load in bf16
)
# Load the adapters with PeftModel.from_pretrained / load_adapter.
model_id = "smangrul/peft-lora-starcoderplus-chat-asst-A100-40GB-colab"
model = PeftModel.from_pretrained(model, model_id, adapter_name="assistant")

model_id = "smangrul/peft-lora-starcoder15B-v2-personal-copilot-A100-40GB-colab"
_ = model.load_adapter(model_id, adapter_name="copilot")

# If the model has no hf_device_map, move it to the GPU manually.
if not hasattr(model, "hf_device_map"):
    model.cuda()

system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully \
as possible, while being safe. Your answers should not include any harmful, \
unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that \
your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why \
instead of answering something not correct. If you don’t know the answer to a \
question, please don’t share false information."""

def get_model_pred(query, disable=False):
    # Use a no-op context by default; optionally disable the adapters so the
    # prediction comes from the base model only.
    context = contextlib.nullcontext
    if disable:
        context = model.disable_adapter
    text = prompt = f"<|system|> {system_prompt} <|endoftext|> <|prompter|> {query} <|endoftext|> <|assistant|>"
    # Inference.
    model.eval()
    with context():
        outputs = model.generate(
            input_ids=tokenizer(text, return_tensors="pt").input_ids.cuda(),
            max_new_tokens=1024,
            temperature=0.2,
            top_k=50,
            top_p=0.95,
            do_sample=True,
            repetition_penalty=1.1,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]

# model.delete_adapter("code_buddy")
# Combine the two adapters into "code_buddy" and activate it.
model.add_weighted_adapter(["copilot", "assistant"], [1.0, 1.0], "code_buddy", combination_type="cat")
model.set_adapter("code_buddy")

References

https://huggingface.co/blog/zh/personal-copilot

https://huggingface.co/docs/peft/package_reference/tuners
