# 使用bert-base-chinese预训练模型对二分类问题进行微调
# (Fine-tune the bert-base-chinese pretrained model on a binary classification task.)
import pandas as pd
from transformers import BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
# Local path to the pretrained bert-base-chinese checkpoint.
model_name = "./bert-base-chinese"
# CSV with at least two columns: "content" (text) and "punish_result" (0/1 label).
path = "./abuse_22.csv"
df = pd.read_csv(path, encoding="utf-8")
# Only the first 1000 rows are used for this quick fine-tuning run.
texts = df["content"][:1000].tolist()
labels = df["punish_result"][:1000].tolist()
# Coerce every entry to str (NaN cells become the string "nan") so the
# tokenizer never sees a float/None.  Comprehension instead of map+lambda.
texts = [str(x) for x in texts]
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    BUG FIX: the original body had lost all indentation and was a
    SyntaxError; structure restored here.
    """

    def __init__(self, encodings, labels):
        # encodings: dict of parallel lists (e.g. input_ids, attention_mask)
        # as returned by a HuggingFace tokenizer called on a list of texts.
        self.encodings = encodings
        # labels: sequence of class ids aligned with the encodings.
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
# Load tokenizer and classification head (2 output labels) from the local
# checkpoint directory.  The two loads are independent of each other.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Reference: https://blog.csdn.net/weixin_42924890/article/details/139269528
# Tokenize the whole corpus up front: pad/truncate to BERT's 512-token limit.
train_encodings = tokenizer(
    texts,
    truncation=True,
    padding=True,
    max_length=512,
)
encodings = Dataset(train_encodings, labels)
# Training configuration: CPU-only, 2 epochs, evaluate once per epoch.
args = TrainingArguments(
    output_dir='./output_dir',
    evaluation_strategy='epoch',
    no_cuda=True,                     # force CPU training
    num_train_epochs=2,
    learning_rate=1e-4,
    weight_decay=1e-2,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
)
# BUG FIX: evaluation_strategy='epoch' requires an eval_dataset; without one
# Trainer raises a ValueError before training starts.  No held-out split is
# built in this script, so the training set is reused for evaluation —
# TODO: replace with a proper validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encodings,
    eval_dataset=encodings,
)
# 开始训练 (start training)
trainer.train()