Fine-tuning a BERT model for binary classification

Fine-tune the bert-base-chinese pretrained model on a binary classification task. The full script is below:

```python
import pandas as pd
import torch
from transformers import BertTokenizerFast, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_name = "./bert-base-chinese"
path = "./abuse_22.csv"

# Load the data and keep the first 1000 rows for training
df = pd.read_csv(path, encoding="utf-8")
texts = df["content"][:1000].tolist()
labels = df["punish_result"][:1000].tolist()
# Cast every text to str to guard against NaN / non-string values
texts = list(map(lambda x: str(x), texts))

# Wrap the tokenizer encodings and labels as a torch Dataset
class Dataset(torch.utils.data.Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Reference: https://blog.csdn.net/weixin_42924890/article/details/139269528
train_encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
train_dataset = Dataset(train_encodings, labels)


args = TrainingArguments(output_dir='./output_dir',
                         # No eval_dataset is passed to the Trainer below, so evaluation is disabled
                         # (the original 'epoch' setting would raise an error without an eval set)
                         evaluation_strategy='no',
                         no_cuda=True,  # train on CPU
                         num_train_epochs=2,
                         learning_rate=1e-4,
                         weight_decay=1e-2,
                         per_device_eval_batch_size=32,
                         per_device_train_batch_size=32)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()
```
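
The script above only supplies a train_dataset, so the Trainer reports nothing beyond the training loss. A minimal sketch of adding per-epoch evaluation, assuming a hypothetical 90/10 split of the same 1000 rows and a simple accuracy metric (neither is part of the original script):

```python
import numpy as np
from sklearn.model_selection import train_test_split

# Hypothetical 90/10 train/validation split of the same data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.1, random_state=42)

train_dataset = Dataset(tokenizer(train_texts, truncation=True, padding=True, max_length=512), train_labels)
val_dataset = Dataset(tokenizer(val_texts, truncation=True, padding=True, max_length=512), val_labels)

def compute_metrics(eval_pred):
    # The Trainer passes an EvalPrediction with .predictions (logits) and .label_ids
    preds = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": float((preds == eval_pred.label_ids).mean())}

args = TrainingArguments(output_dir='./output_dir',
                         evaluation_strategy='epoch',  # now valid: an eval_dataset is supplied
                         no_cuda=True,
                         num_train_epochs=2,
                         learning_rate=1e-4,
                         weight_decay=1e-2,
                         per_device_train_batch_size=32,
                         per_device_eval_batch_size=32)

trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset,
                  eval_dataset=val_dataset,
                  compute_metrics=compute_metrics)
trainer.train()
trainer.save_model('./output_dir/best')  # save the fine-tuned weights for later inference
```

With this setup the Trainer prints the eval loss and accuracy at the end of every epoch.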
Below is a code example of fine-tuning BERT for binary classification with a plain PyTorch training loop (without the Trainer API):

```python
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Basic hyperparameters
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 4
LEARNING_RATE = 2e-5

# Load the data
train_texts = [...]   # training texts
train_labels = [...]  # training labels
test_texts = [...]    # test texts
test_labels = [...]   # test labels

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode the training and test sets
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LEN)

# Convert the encodings into PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              train_labels)
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             test_labels)

# Create the data loaders
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

# Load the BERT model and move it to the GPU (the batches below are sent to 'cuda')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to('cuda')

# Optimizer and learning-rate scheduler
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-8)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def accuracy(logits, label_ids):
    # Number of correct predictions in a batch (the original snippet left this helper undefined)
    return np.sum(np.argmax(logits, axis=1) == label_ids)

# Train the BERT model
for epoch in range(EPOCHS):
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the GPU
        batch = tuple(t.to('cuda') for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Evaluate the model on the test set
    model.eval()
    test_accuracy = 0
    nb_test_steps, nb_test_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to('cuda') for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs[1]
        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        test_accuracy += accuracy(logits, label_ids)
        nb_test_examples += inputs['input_ids'].size(0)
        nb_test_steps += 1
    test_accuracy = test_accuracy / nb_test_examples
    print('Epoch: {}, Test Accuracy: {}'.format(epoch, test_accuracy))
```

Here `train_texts`/`train_labels` are the training texts and labels, and `test_texts`/`test_labels` are the test texts and labels. The `tokenizer` converts text into BERT's input format, `BertForSequenceClassification` is the BERT model with a classification head, `AdamW` is the optimizer, and `get_linear_schedule_with_warmup` adjusts the learning rate over training. Gradient clipping is applied during training to avoid exploding gradients, and the model is evaluated on the test set at the end of every epoch.
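
Either route leaves you with a fine-tuned checkpoint. As a minimal inference sketch (the checkpoint path `./output_dir/best` and the sample sentence are assumptions, not from the original post), predictions for new text can be obtained like this:

```python
import torch
from transformers import BertTokenizerFast, AutoModelForSequenceClassification

# Hypothetical paths: tokenizer from the original base model, weights from the saved checkpoint
tokenizer = BertTokenizerFast.from_pretrained("./bert-base-chinese")
model = AutoModelForSequenceClassification.from_pretrained("./output_dir/best")
model.eval()

def predict(text):
    # Tokenize a single sentence and return the predicted class id (0 or 1)
    inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    return int(torch.argmax(logits, dim=-1).item())

print(predict("这是一条测试文本"))  # -> 0 or 1
```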
