sunjr_基于Python搭建经营风险预警平台_dayuyanmx1

import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    GemmaForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score


# 这个Python脚本导入了一系列用于构建、训练和评估机器学习模型的库和模块,特别是在使用大型语言模型(如Gemma模型)进行序列分类任务时。下面是对脚本中每个导入项的解释:
#
# os: 用于与操作系统交互,例如获取环境变量或列出目录内容。
#
# copy: 提供对对象进行复制操作的功能。
#
# dataclasses: 从Python 3.7开始,dataclasses模块提供了一个装饰器和相关功能,可以自动为类生成特殊方法,如__init__()和__repr__(),简化数据存储代码的编写。
#
# numpy: 一个用于科学计算的库,提供了大量的数学函数和对多维数组的支持。
#
# torch: PyTorch库,一个流行的开源机器学习库,用于深度学习,提供了张量操作、自动微分和神经网络构建的功能。
#
# datasets: Hugging Face的datasets库,用于加载和处理各种公共数据集。
#
# transformers: 另一个Hugging Face库,包含了预训练模型、 tokenizers 和其他用于NLP任务的工具。这里导入了:
#
# BitsAndBytesConfig:配置类,用于设置量化训练的参数。
# GemmaForSequenceClassification:Gemma模型的特定变体,用于序列分类任务。
# GemmaTokenizerFast:Gemma模型的快速tokenizer。
# Gemma2Config:Gemma 2模型的配置类。
# PreTrainedTokenizerBase:tokenizer基类。
# EvalPrediction:用于存储评估结果的类。
# Trainer:用于训练模型的类。
# TrainingArguments:用于设置训练参数的类。
# DataCollatorWithPadding:数据整理器,用于将同一批次内的样本动态填充到相同长度(默认填充到该批次中最长的序列)。
# peft: 一个库,提供参数高效微调(Parameter-Efficient Fine-Tuning)技术,用于高效地微调大型预训练模型。这里导入了:
#
# LoraConfig:配置类,用于设置LoRA(Low-Rank Adaptation)微调的参数。
# get_peft_model:函数,用于根据PEFT配置(如LoraConfig)包装基础模型,返回可进行参数高效微调的模型。
# prepare_model_for_kbit_training:函数,用于准备模型进行k-bit量化训练。
# TaskType:枚举,定义了不同的任务类型。
# sklearn.metrics: 从scikit-learn库中导入的两个评估指标:
#
# log_loss:对数损失,用于评估概率估计的质量。
# accuracy_score:准确率,用于评估分类模型的性能。
# 这个脚本设置了一个机器学习工作流,从数据加载、模型配置、训练、评估到微调,为使用Gemma模型进行序列分类任务提供了一套完整的工具和函数。

@dataclass
class Config:
    """Paths and hyper-parameters for the LoRA fine-tuning run.

    Field order defines the generated ``__init__`` signature and is kept
    unchanged.
    """

    # --- paths / model ---------------------------------------------------
    output_dir: str = "output"
    # NOTE(review): an older remark here described a 4-bit quantized
    # gemma-2-9b-instruct checkpoint, but the value names gemma-2b - confirm.
    checkpoint: str = "AI-ModelScope/gemma-2b"
    # Token cap handed to the tokenizer (165 was tried previously).
    max_length: int = 1024

    # --- cross-validation ------------------------------------------------
    n_splits: int = 5
    fold_idx: int = 0

    # --- optimisation ----------------------------------------------------
    # Optimizer name; an "_8bit" variant was noted as an alternative.
    optim_type: str = "adamw_torch"
    per_device_train_batch_size: int = 12
    # NOTE(review): an older remark claimed a global batch size of 8; with
    # the values here it is 12 per device - confirm.
    gradient_accumulation_steps: int = 1
    per_device_eval_batch_size: int = 8
    n_epochs: int = 5  # number of training epochs
    # Layers with an index below this value get no LoRA adapters.
    # NOTE(review): an older remark mentioned 42 layers in total and 16
    # frozen ones, which does not match the value 6 - confirm.
    freeze_layers: int = 6
    lr: float = 2e-4  # learning rate
    warmup_steps: int = 20

    # --- LoRA ------------------------------------------------------------
    lora_r: int = 16  # 32 was tried previously
    # Evaluated once in the class body (16 * 2 == 32); it does not follow a
    # per-instance ``lora_r`` override.
    lora_alpha: float = 2 * lora_r
    lora_dropout: float = 0.05
    lora_bias: str = "none"


config = Config()

# Trainer settings. Everything tunable is read from ``config`` so that the
# dataclass is the single place to change a run; previously ``output_dir``
# was a hard-coded literal that silently ignored ``config.output_dir``.
training_args = TrainingArguments(
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    report_to="none",  # no external experiment tracker
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=100,
    # max_grad_norm =1,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because the best checkpoint is reloaded at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    # "acc" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="acc",
    greater_is_better=True,
    # save_steps=200,
    load_best_model_at_end=True,
    optim=config.optim_type,
    # NOTE(review): bf16/tf32 presumably need hardware that supports them -
    # confirm on the target GPU.
    bf16=True,
    tf32=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
)

# LoRA adapter definition for sequence classification.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # Tunable list of projection modules that receive adapters.
    # NOTE(review): "gate_proj" is presumably an MLP projection, so this is
    # not attention-only - confirm against the model's module names.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
    # Layer indices from config.freeze_layers up to 41; lower indices get no
    # adapters. The upper bound 42 is hard-coded.
    layers_to_transform=list(range(max(config.freeze_layers, 0), 42)),
)

# Tokenizer: pad on the right and append <eos> to every encoded sequence.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True

# Base classifier with a 2-label head, loaded in bfloat16 and placed
# automatically across the available devices.
model = GemmaForSequenceClassification.from_pretrained(
    config.checkpoint,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    num_labels=2,
)
model.config.use_cache = False
# k-bit preparation is intentionally left disabled:
# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)  # wrap with the LoRA adapters
# Bare expression: shows the model repr when run as a notebook cell and has
# no effect when run as a script.
model

ds = Dataset.from_parquet("input/mrpc/train-00000-of-00001.parquet")  # load the training split (from_csv is the CSV counterpart)
#ds = ds.select(torch.arange(100))  # We only use the first 100 data for demo purpose


# Data augmentation (disabled: the author could not get it to run).
# NOTE(review): nlpaug's word-level synonym augmenter is presumably named
# SynonymAug rather than SynonymReplacer, which may be why this failed - confirm.
# from nlpaug.augmenter.word import SynonymReplacer  # import the synonym replacer
# # Initialise the synonym replacer
# synonym_replacer = SynonymReplacer()
# # Custom augmentation function
# def data_augmentation(batch):
#     augmented_texts = []
#     for text in batch["text"]:
#         # Only synonym replacement is applied to the text
#         augmented_text = synonym_replacer.augment(text)
#         augmented_texts.append(augmented_text)
#     # Return the augmented batch
#     return {key: value if key != 'text' else augmented_texts for key, value in batch.items()}
# # Apply the augmentation
# ds = ds.map(data_augmentation, batched=True)



test_ds = Dataset.from_parquet("input/mrpc/test-00000-of-00001.parquet")


class CustomTokenizer:
    """Batched encoder that turns sentence pairs into tokenized model inputs.

    Each pair is rendered as one prompt string (instruction, sentence A,
    sentence B), tokenized with truncation to ``max_length``, and returned
    together with the batch labels.
    """

    def __init__(
            self,
            tokenizer: PreTrainedTokenizerBase,
            max_length: int
    ) -> None:
        """Store the tokenizer and the truncation length used by ``__call__``."""
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __call__(self, batch: dict) -> dict:
        """Encode one batch.

        Args:
            batch: Column-oriented mapping with ``"sentence1"``,
                ``"sentence2"`` and ``"label"`` sequences.

        Returns:
            The tokenizer output extended with a ``"labels"`` list.
        """
        instruction = "<prompt>: " + 'Determining whether sentence A and sentence B are semantically equivalent.'
        texts = []
        for first, second in zip(batch["sentence1"], batch["sentence2"]):
            part_a = "\n\n<sentence_A>: " + self.process_text(first)
            part_b = "\n\n<sentence_B>: " + self.process_text(second)
            texts.append(instruction + part_a + part_b)

        encoded = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        # Labels are copied through unchanged.
        return {**encoded, "labels": list(batch["label"])}

    @staticmethod
    def process_text(text: str) -> str:
        """Return ``text`` unchanged; hook for per-sentence preprocessing."""
        return text

# Tokenize both splits in batches with the same encoder (train first, then
# test, as before).
encode = CustomTokenizer(tokenizer=tokenizer, max_length=config.max_length)
ds, test_ds = (split.map(encode, batched=True) for split in (ds, test_ds))

def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Score one evaluation pass with accuracy and log loss.

    Args:
        eval_preds: Object exposing ``predictions`` (a NumPy array of raw
            scores with classes on the last axis, as used by the softmax and
            argmax below) and ``label_ids`` (the reference class ids).

    Returns:
        ``{"acc": <accuracy>, "log_loss": <log loss>}``.
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    # Convert raw scores to probabilities in float32.
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Name every class column explicitly. Without ``labels=`` the classes are
    # inferred from y_true, which raises when an evaluation set happens to
    # miss a class even though ``probs`` still has a column for it.
    class_ids = np.arange(probs.shape[-1])
    loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

# Round-robin K-fold split: row i belongs to validation fold i % n_splits.
# Each entry is (train_indices, validation_indices).
folds = [
    (
        [row for row in range(len(ds)) if row % config.n_splits != fold_idx],
        list(range(fold_idx, len(ds), config.n_splits)),
    )
    for fold_idx in range(config.n_splits)
]

# Fold selection is currently disabled; training uses the full train split.
# train_idx, eval_idx = folds[config.fold_idx]

# Train on the whole training split and evaluate on the test split; the
# fold-based alternatives are kept as trailing notes.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,  # .select(train_idx)
    eval_dataset=test_ds,  # .select(eval_idx)
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)
trainer.train()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值