1. Import packages
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
2. Load the data
dataset = load_dataset("csv", data_files="ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda x: x["review"] is not None)  # drop rows whose review text is missing
dataset
3. Split the dataset
datasets = dataset.train_test_split(test_size=0.1)  # hold out 10% of the data for evaluation
datasets
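If you want the split to be reproducible across runs, train_test_split also accepts a seed (an optional tweak, not in the original code):
datasets = dataset.train_test_split(test_size=0.1, seed=42)  # fixed seed for a reproducible split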
4. Data preprocessing
import torch
tokenizer = AutoTokenizer.from_pretrained("chinese-macbert-base")
def process_function(examples):
    tokenized_examples = tokenizer(examples["review"], max_length=32, truncation=True, padding="max_length")
    tokenized_examples["labels"] = examples["label"]
    return tokenized_examples
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets
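As a quick sanity check (an optional snippet that reuses the variables defined above), you can decode one processed example to confirm that the 32-token truncation and max-length padding were applied:
sample = tokenized_datasets["train"][0]
print(len(sample["input_ids"]))               # 32, because of max_length=32 with padding="max_length"
print(tokenizer.decode(sample["input_ids"]))  # review text followed by [PAD] tokens if it was shorter than 32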
5. Create the model
model = AutoModelForSequenceClassification.from_pretrained("chinese-macbert-base")
Print the model configuration:
model.config
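As an optional variant (a sketch, not part of the original post), the label mapping can be supplied when loading the model so that config.id2label is human-readable from the start; the label names below are assumptions for this binary sentiment task:
id2_label = {0: "negative", 1: "positive"}    # assumed names for the two ChnSentiCorp classes
label2_id = {"negative": 0, "positive": 1}
model = AutoModelForSequenceClassification.from_pretrained("chinese-macbert-base",
                                                           num_labels=2,
                                                           id2label=id2_label,
                                                           label2id=label2_id)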
6. Create the evaluation function
import evaluate
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc
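To verify the metric function before training, it can be called directly on a pair of toy logits and labels (the numbers below are made up):
import numpy as np
dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])  # fake model outputs for 3 samples
dummy_labels = np.array([1, 0, 0])                             # fake ground-truth labels
print(eval_metric((dummy_logits, dummy_labels)))               # expect accuracy and f1 around 0.67 for these toy inputs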
7. Create TrainingArguments
7.1 GPU memory optimization
| Optimization strategy | Memory it targets |
| --- | --- |
| 1. Gradient accumulation | forward activations |
| 2. Gradient checkpointing | forward activations |
| 3. Adafactor optimizer | optimizer states |
| 4. Freezing model parameters | forward activations |
| 5. Reducing data length | forward activations |
- Gradient accumulation: gradient_accumulation_steps=32 (see the sketch after this list)
- Gradient checkpointing: gradient_checkpointing=True
- Optimizer choice: optim='adafactor'
- Freeze model parameters (shown, commented out, in section 8)
- Reduce data length: max_length=32 at tokenization time; per_device_eval_batch_size=1 further lowers evaluation memory
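For intuition, here is what gradient accumulation does conceptually. This is a plain-PyTorch sketch with hypothetical optimizer and train_dataloader objects, not part of the original walkthrough; the Trainer does this internally once gradient_accumulation_steps is set:
accumulation_steps = 32
optimizer.zero_grad()
for step, batch in enumerate(train_dataloader):          # micro-batches of size 1
    loss = model(**batch).loss / accumulation_steps      # scale so the accumulated gradient matches one large batch
    loss.backward()                                       # gradients add up in param.grad; activations are freed after each micro-batch
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                                  # one optimizer update per 32 micro-batches
        optimizer.zero_grad()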
7.2 Configuration code
train_args = TrainingArguments(output_dir="./checkpoints",      # output directory
                               per_device_train_batch_size=1,   # per-device batch size for training
                               gradient_accumulation_steps=32,  # gradient accumulation: less GPU memory, longer training time
                               gradient_checkpointing=True,     # gradient checkpointing
                               optim="adafactor",               # use the Adafactor optimizer
                               per_device_eval_batch_size=1,    # per-device batch size for evaluation
                               num_train_epochs=1,              # number of training epochs
                               logging_steps=10,                # logging frequency
                               evaluation_strategy="epoch",     # evaluate at the end of each epoch
                               save_strategy="epoch",           # save a checkpoint at the end of each epoch
                               save_total_limit=3,              # keep at most 3 checkpoints
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True)     # load the best model when training finishes
train_args
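Note that the effective training batch size is per_device_train_batch_size × gradient_accumulation_steps = 1 × 32 = 32 per device, so optimization behaves roughly as if the batch size were 32 while only one sample's activations live in GPU memory at any moment.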
8. Create the Trainer
from transformers import DataCollatorWithPadding
'''
# BERT backbone + fully-connected classification head
# Freeze BERT and fine-tune only the classification head
for name, params in model.bert.named_parameters():
    params.requires_grad = False
'''
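If you do uncomment the freezing block, a quick way to confirm what will actually be trained is to count the trainable parameters (a small optional check):
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable parameters: {trainable:,} / {total:,}")  # after freezing, only the classification head remains trainable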
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)
9. Train the model
# without the memory optimizations above, this step can easily run out of GPU memory (OOM)
trainer.train()
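To see whether the memory optimizations are paying off, peak GPU memory can be checked after training (a minimal sketch, assuming a single CUDA device):
print(f"peak GPU memory: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")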
10. Evaluate the model
trainer.evaluate(tokenized_datasets["test"])
11. Model prediction
from transformers import pipeline
id2_label = {0: "negative", 1: "positive"}   # readable label names (assumed; pick names that suit your task)
model.config.id2label = id2_label            # so the pipeline reports label names instead of LABEL_0 / LABEL_1
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
sen = "我觉得不错!"
pipe(sen)
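Finally, if you want to reuse the fine-tuned model later, it can be saved together with the tokenizer (an optional step; the directory name here is arbitrary):
trainer.save_model("./sentiment-model")         # saves the model weights and config
tokenizer.save_pretrained("./sentiment-model")  # save the tokenizer alongside the model
# later: pipeline("text-classification", model="./sentiment-model", device=0)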