This is adapted from several tutorials and refactored into functions, so it should be a bit easier to follow.
Adjust the Dataset class to match your own dataset's format. Mine reads a csv file whose first column is text and second column is label; if you'd rather not change the code, convert your dataset to this layout instead (a sketch follows below).
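For reference, a minimal sketch of that layout; the texts and labels here are made-up examples, and train.csv is a hypothetical file name:

import pandas as pd

# Hypothetical two-class toy data; the column names must match the Dataset class below.
df = pd.DataFrame({
    'text': ['这家餐厅的菜很好吃', '服务态度太差了'],
    'label': [1, 0],
})
df.to_csv('train.csv', index=False)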
Download link for bert-base-chinese
Training code
import os

import pandas as pd
import torch
from sklearn.metrics import f1_score
from torch.optim import AdamW  # the AdamW in transformers is deprecated; use the torch.optim one
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
class Dataset(torch.utils.data.Dataset):
    def __init__(self, path):
        self.dataset = pd.read_csv(path)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset.iloc[i]['text']
        label = self.dataset.iloc[i]['label']
        return text, label
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A single fully connected layer: 768 is BERT's hidden size, 2 is the number of classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # Use the frozen pretrained model purely as a feature extractor
        # ('pretrained' is the BERT model loaded in the main block below).
        with torch.no_grad():
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        # Feed the [CLS] vector from BERT's last hidden layer into the classifier head.
        out = self.fc(out.last_hidden_state[:, 0])
        # Normalize over the class dimension. Note that CrossEntropyLoss already applies
        # log-softmax internally, so feeding it probabilities (as this tutorial does)
        # still trains, just with dampened gradients.
        out = out.softmax(dim=1)
        return out
# Encode a batch of raw (text, label) pairs with the tokenizer.
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    # Encode
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=500,
                                       return_tensors='pt',
                                       return_length=True)
    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    return input_ids, attention_mask, token_type_ids, labels
def save_model(save_name):
    save_path = './bert_checkpoint'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    torch.save(model.state_dict(), os.path.join(save_path, save_name))
def test(data_test_path):
    correct = 0
    loss = 0
    f1 = 0
    total = 0
    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset(data_test_path),
        batch_size=16,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True
    )
    for index, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        with torch.no_grad():
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss += criterion(out, labels)
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
        f1 += f1_score(labels.cpu().numpy(), out.cpu().numpy(), average='macro')
    accuracy = correct / total
    f1_av = f1 / (index + 1)
    loss_av = loss / (index + 1)  # average of per-batch mean losses
    print(f"correct: {correct}, total: {total}, test accuracy: {accuracy}, F1 score: {f1_av}, loss: {loss_av}")
    return accuracy
if __name__ == "__main__":
    data_train_path = ''  # path to the training set
    data_test_path = ''   # path to the validation set
    model_path = ''       # path to the pretrained model
    epoch = 10
    device = "cuda:0"  # change to match your GPU setup
    dataset = Dataset(data_train_path)
    # Every model ships with its own tokenizer.
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
    # Data loader
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=16,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)
    # Load the pretrained model
    pretrained = BertModel.from_pretrained(model_path)
    pretrained.to(device)
    # Freeze the pretrained BERT's parameters, i.e. do not train them.
    for param in pretrained.parameters():
        param.requires_grad_(False)
    model = Model().to(device)
    # Training
    optimizer = AdamW(model.parameters(), lr=5e-4)  # AdamW optimizer
    criterion = torch.nn.CrossEntropyLoss()  # cross-entropy loss, used for classification
    model.train()
    best_accuracy = 0
    print("Evaluation before training:")
    accuracy = test(data_test_path)
    for now_epoch in range(epoch):
        for (input_ids, attention_mask, token_type_ids, labels) in tqdm(loader):
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # model prediction
            loss = criterion(out, labels)  # loss between prediction and ground-truth labels
            loss.backward()        # backpropagation
            optimizer.step()       # gradient descent step
            optimizer.zero_grad()  # reset gradients
        # Accuracy on the test set after each epoch
        accuracy = test(data_test_path)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(f'best_{now_epoch}.pt')
        save_model('last.pt')
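If you are unsure what collate_fn actually hands to the model, here is a quick standalone check of the tokenizer output; it is a sketch that assumes bert-base-chinese is available (swap in your local model_path), and the two sentences are made up:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # or your local model_path
batch = tokenizer.batch_encode_plus(['今天天气不错', '我不喜欢吃鸡蛋'],
                                    truncation=True,
                                    padding='max_length',
                                    max_length=16,
                                    return_tensors='pt')
print(batch['input_ids'].shape)    # torch.Size([2, 16])
print(batch['attention_mask'][0])  # 1 for real tokens, 0 for padding
print(batch['token_type_ids'][0])  # all 0 for single-sentence input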
Inference code
import torch
from transformers import BertModel, BertTokenizer
class Dataset(torch.utils.data.Dataset):
    # Wraps a single (text, label) pair so it can go through the DataLoader.
    def __init__(self, text, label):
        self.text = text
        self.label = label

    def __len__(self):
        return 1

    def __getitem__(self, i):
        return self.text, self.label
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A single fully connected layer: 768 is BERT's hidden size, 2 is the number of classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # Use the frozen pretrained model purely as a feature extractor
        # ('pretrained' is the BERT model loaded in the main block below).
        with torch.no_grad():
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        # Feed the [CLS] vector from BERT's last hidden layer into the classifier head.
        out = self.fc(out.last_hidden_state[:, 0])
        # Normalize over the class dimension.
        out = out.softmax(dim=1)
        return out
# Encode a batch of raw (text, label) pairs with the tokenizer.
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    # Encode
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=500,
                                       return_tensors='pt',
                                       return_length=True)
    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    return input_ids, attention_mask, token_type_ids, labels
def infer_single_data(text):
    # Use 0 as a dummy label: collate_fn calls torch.LongTensor on the labels,
    # which would fail on a string, and the label is never used for prediction.
    data = Dataset(text, 0)
    loader = torch.utils.data.DataLoader(data, batch_size=1, collate_fn=collate_fn)
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        with torch.no_grad():
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        out = out.argmax(dim=1)
        predicted_label = out.item()
    return predicted_label
if __name__ == "__main__":
    model_path = ''  # path to the pretrained model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Every model ships with its own tokenizer.
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
    # Load the pretrained model
    pretrained = BertModel.from_pretrained(model_path)
    pretrained.to(device)
    # Freeze the pretrained BERT's parameters, i.e. do not train them.
    for param in pretrained.parameters():
        param.requires_grad_(False)
    model = Model()
    # Load a checkpoint saved by the training script; adjust the file name to
    # whichever one you want (it saves best_<epoch>.pt and last.pt).
    model.load_state_dict(torch.load('bert_checkpoint/last.pt'))
    model.eval()
    model.to(device)
    test_text = "嗯,谢谢你介绍的做法很详细,但我不喜欢吃鸡蛋,有没有其他菜做法能介绍一下?"
    predicted_label = infer_single_data(test_text)
    if predicted_label == 0:
        print("label 0")  # category 1: replace with your own class name
    else:
        print("label 1")  # category 2: replace with your own class name