import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    GemmaForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

# This script imports the libraries and modules needed to build, train, and evaluate a
# machine learning model, specifically for fine-tuning a large language model (Gemma)
# on a sequence-classification task. The imports, briefly:
#
# os: interact with the operating system, e.g. read environment variables or list directories.
#
# copy: utilities for copying objects.
#
# dataclasses: since Python 3.7, this module provides a decorator and helpers that
# auto-generate special methods such as __init__() and __repr__(), simplifying data-holder classes.
#
# numpy: scientific-computing library with a large set of math functions and multi-dimensional arrays.
#
# torch: PyTorch, a popular open-source deep-learning library providing tensor operations,
# automatic differentiation, and neural-network building blocks.
#
# datasets: Hugging Face's datasets library for loading and processing public datasets.
#
# transformers: another Hugging Face library containing pretrained models, tokenizers, and other
# NLP tooling. Imported here:
#   BitsAndBytesConfig: configuration class for quantized (k-bit) loading and training.
#   GemmaForSequenceClassification: the Gemma variant with a sequence-classification head.
#   GemmaTokenizerFast: the fast tokenizer for Gemma.
#   Gemma2Config: configuration class for Gemma 2 (imported but not used below).
#   PreTrainedTokenizerBase: tokenizer base class.
#   EvalPrediction: container holding predictions and labels during evaluation.
#   Trainer: the training-loop class.
#   TrainingArguments: the training-hyperparameter class.
#   DataCollatorWithPadding: data collator that pads each batch to a common length.
#
# peft: the Parameter-Efficient Fine-Tuning library for efficiently fine-tuning large
# pretrained models. Imported here:
#   LoraConfig: configuration class for LoRA (Low-Rank Adaptation) fine-tuning.
#   get_peft_model: wraps a model with the configured adapters.
#   prepare_model_for_kbit_training: prepares a quantized model for k-bit training.
#   TaskType: enum of supported task types.
#
# sklearn.metrics: two evaluation metrics from scikit-learn:
#   log_loss: log loss, measures the quality of probability estimates.
#   accuracy_score: accuracy, measures classification performance.
#
# Together these cover the full workflow, from data loading and model configuration through
# training, evaluation, and fine-tuning, for sequence classification with a Gemma model.


@dataclass
class Config:
    output_dir: str = "output"
    checkpoint: str = "AI-ModelScope/gemma-2b"  # Gemma-2B checkpoint from ModelScope
    max_length: int = 1024  # 165
    n_splits: int = 5
    # fold_idx: int = 0
    optim_type: str = "adamw_torch"  # optimizer (an 8-bit variant such as "adamw_bnb_8bit" can be used instead)
    per_device_train_batch_size: int = 12
    gradient_accumulation_steps: int = 1  # effective batch size = per_device_train_batch_size * gradient_accumulation_steps
    per_device_eval_batch_size: int = 8
    n_epochs: int = 5  # number of training epochs
    freeze_layers: int = 6  # don't add adapters to the first `freeze_layers` layers (layers_to_transform below assumes up to 42 layers)
    lr: float = 2e-4  # learning rate
    warmup_steps: int = 20
    lora_r: int = 16  # 32
    lora_alpha: float = lora_r * 2
    lora_dropout: float = 0.05
    lora_bias: str = "none"


config = Config()

training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    report_to="none",
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=100,
    # max_grad_norm=1,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="acc",
    greater_is_better=True,
    # save_steps=200,
    load_best_model_at_end=True,
    optim=config.optim_type,
    bf16=True,
    tf32=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
)

lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # target the attention projections plus gate_proj (tunable)
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
    layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.SEQ_CLS,
)

tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # we'll add <eos> at the end of each sequence
tokenizer.padding_side = "right"
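# BitsAndBytesConfig and prepare_model_for_kbit_training are imported but unused below,
# since this run loads the gemma-2b checkpoint in bfloat16. As a minimal sketch (not part
# of the original run), a 4-bit QLoRA-style load could be wired in instead; the specific
# settings (nf4, double quantization, bfloat16 compute) are illustrative assumptions:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )
# model = GemmaForSequenceClassification.from_pretrained(
#     config.checkpoint,
#     num_labels=2,
#     quantization_config=bnb_config,
#     device_map="auto",
# )
# model = prepare_model_for_kbit_training(model)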
model = GemmaForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=2,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.config.use_cache = False
# model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model  # display the PEFT-wrapped model (notebook-style)

ds = Dataset.from_parquet("input/mrpc/train-00000-of-00001.parquet")  # read the training file (use from_csv for CSV files)
# ds = ds.select(torch.arange(100))  # only use the first 100 rows for demo purposes

# Data augmentation (the author could not get this block to run)
# (note: nlpaug's word-level synonym augmenter is SynonymAug, and MRPC has "sentence1"/"sentence2"
# rather than a "text" column, which may be why this block failed)
# from nlpaug.augmenter.word import SynonymReplacer  # import the synonym replacer
# # initialize the synonym replacer
# synonym_replacer = SynonymReplacer()
# # custom data-augmentation function
# def data_augmentation(batch):
#     augmented_texts = []
#     for text in batch["text"]:
#         # here we only augment the text with synonym replacement
#         augmented_text = synonym_replacer.augment(text)
#         augmented_texts.append(augmented_text)
#     # return the augmented batch
#     return {key: value if key != 'text' else augmented_texts for key, value in batch.items()}
# # apply the data augmentation
# ds = ds.map(data_augmentation, batched=True)

test_ds = Dataset.from_parquet("input/mrpc/test-00000-of-00001.parquet")


class CustomTokenizer:
    def __init__(
        self, tokenizer: PreTrainedTokenizerBase, max_length: int
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        # prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
        prompt = ["<prompt>: " + 'Determining whether sentence A and sentence B are semantically equivalent.'] * len(
            batch["sentence1"])
        response_a = ["\n\n<sentence_A>: " + self.process_text(t) for t in batch["sentence1"]]
        response_b = ["\n\n<sentence_B>: " + self.process_text(t) for t in batch["sentence2"]]
        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        labels = [i for i in batch["label"]]
        # for a_win, b_win in zip(batch["winner_model_a"], batch["winner_model_b"]):
        #     if a_win:
        #         label = 0
        #     elif b_win:
        #         label = 1
        #     else:
        #         label = 2
        #     labels.append(label)
        # print(batch)
        return {**tokenized, "labels": labels}

    @staticmethod
    def process_text(text: str) -> str:
        return text
        # return " ".join(eval(text, {"null": ""}))


encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)
test_ds = test_ds.map(encode, batched=True)


def compute_metrics(eval_preds: EvalPrediction) -> dict:  # evaluate model quality
    preds = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(preds).float().softmax(-1).numpy()
    # print(labels, eval_preds)
    loss = log_loss(y_true=labels, y_pred=probs)
    acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
    return {"acc": acc, "log_loss": loss}


folds = [
    (
        [i for i in range(len(ds)) if i % config.n_splits != fold_idx],
        [i for i in range(len(ds)) if i % config.n_splits == fold_idx]
    )
    for fold_idx in range(config.n_splits)
]
# train_idx, eval_idx = folds[config.fold_idx]

trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds,  # .select(train_idx),
    eval_dataset=test_ds,  # .select(eval_idx),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()
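# The run above evaluates on test_ds once per epoch. If per-example probabilities are also
# wanted after training, a minimal sketch (not part of the original script) is to call
# trainer.predict and apply the same softmax used in compute_metrics:
preds = trainer.predict(test_ds)
probs = torch.from_numpy(preds.predictions).float().softmax(-1).numpy()
print(probs[:5])  # predicted class probabilities for the first five test pairs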