1. Import the required packages
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
AutoTokenizer,
AutoConfig,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
TrainingArguments,
Trainer)
# PEFT stands for Parameter-Efficient Fine-Tuning; it is a library developed by Hugging Face for parameter-efficient fine-tuning
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np
# Load the dataset (train / validation / test); datasets and models can be downloaded from the Hugging Face Hub: https://huggingface.co/
dataset = load_dataset('shawhin/imdb-truncated')
dataset
# Mean of the training-set labels (i.e. the fraction of positive examples)
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])
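A quick sanity check (a minimal sketch, assuming every split of this DatasetDict carries a 'label' column): the positive-label fraction per split shows whether the data is balanced.
# Fraction of positive (label == 1) examples in each split;
# a value near 0.5 means the split is roughly balanced.
for split in dataset:
    labels = np.array(dataset[split]['label'])
    print(f"{split}: {labels.mean():.3f} positive ({len(labels)} examples)")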
2. Build the model
model_checkpoint = 'distilbert-base-uncased'
# Mapping between class ids and label names
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
# Load the pretrained weights; num_labels=2 specifies a binary classification task, and model_checkpoint is the name of the pretrained model
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
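It can be useful to note the size of the base model now, to compare against the trainable-parameter count printed once LoRA is attached. A small sketch using the model created above:
# Count all parameters, and those that currently require gradients, in the base model.
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total params: {total_params:,} | trainable: {trainable_params:,}")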
3. Preprocess the data
# Create the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
# If the tokenizer has no pad token, add one, then call resize_token_embeddings so the model's token embeddings cover the newly added pad token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
# Tokenization function
def tokenize_function(examples):
    # Extract the raw text
    text = examples["text"]
    # Truncate on the left: if the text exceeds max_length, tokens are dropped from
    # the beginning so that the important content at the end is kept.
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        # Return NumPy arrays
        return_tensors="np",
        # Truncate texts that exceed max_length
        truncation=True,
        max_length=512
    )
    return tokenized_inputs
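To see what the function returns, we can call it on a single made-up example before mapping it over the dataset (the sentence below is purely illustrative):
sample = tokenize_function({"text": ["This movie was surprisingly good."]})
print(sample.keys())               # typically input_ids and attention_mask
print(sample["input_ids"].shape)   # (1, number of tokens)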
# Run the tokenization function over every split of the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset
# Create the data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
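A brief sketch of what the collator does: given a couple of tokenized examples of different lengths, it pads them to the longest sequence in the batch and returns PyTorch tensors (only input_ids and attention_mask are passed in here, for illustration):
# Collate the first two training examples into one padded batch.
features = [
    {k: tokenized_dataset["train"][i][k] for k in ("input_ids", "attention_mask")}
    for i in range(2)
]
batch = data_collator(features)
print(batch["input_ids"].shape)  # (2, length of the longer example)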
4. Define the evaluation function
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # Compare the predictions with the true labels and return the accuracy
    return accuracy.compute(predictions=predictions, references=labels)
5. Run some text through the untrained model
text_list = ["I'm sorry.", "You are a despicable person.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]
print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # Encode the text into token ids and return a PyTorch tensor
    inputs = tokenizer.encode(text, return_tensors="pt")
    # Compute the logits (the model is still on the CPU at this point)
    logits = model(inputs).logits
    # Convert logits to a predicted label
    predictions = torch.argmax(logits)
    print(text + " - " + id2label[predictions.tolist()])
6. Train the model
peft_config = LoraConfig(task_type="SEQ_CLS",  # sequence classification task
                         r=4,  # rank of the low-rank LoRA update matrices
                         lora_alpha=32,  # scaling factor; a larger alpha gives the LoRA updates more influence
                         lora_dropout=0.01,
                         target_modules=['q_lin'])  # apply LoRA only to DistilBERT's query projection layers
# Wrap the model with the PEFT/LoRA configuration
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
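To confirm exactly which weights will be updated, we can list the parameters that still require gradients; with this config they should be the LoRA matrices attached to q_lin, plus the classification head, which the SEQ_CLS task type keeps trainable:
# Only the LoRA A/B matrices and the classifier head should appear here.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, tuple(param.shape))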
# Hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10
# Define the training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,  # weight decay, a regularization technique that keeps the parameter magnitudes in check
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # reload the best checkpoint at the end of training
)
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,  # dynamically pads the examples in each batch to equal length
    compute_metrics=compute_metrics,
)
# Train the model
trainer.train()
Check the loss and accuracy reported after each epoch.
7. Generate predictions with the fine-tuned model
model.to('cuda')
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cuda")
    logits = model(inputs).logits
    predictions = torch.max(logits, 1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])
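Since only the LoRA adapter was trained, only the adapter needs to be saved, which is also what the PeftModel / PeftConfig imports at the top are for. A hedged sketch (the directory name distilbert-lora-imdb is arbitrary):
# Save just the adapter weights (a few MB) rather than the full model.
model.save_pretrained("distilbert-lora-imdb")
# To reload later: restore the base model, then attach the trained adapter.
config = PeftConfig.from_pretrained("distilbert-lora-imdb")
base_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id)
loaded_model = PeftModel.from_pretrained(base_model, "distilbert-lora-imdb")
# (If a new pad token was added earlier, resize the reloaded base model's embeddings the same way.)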
And that's a wrap!