Fine-tuning BERT for binary text classification
The data used in this experiment can be downloaded here.
Full code: GitHub or Gitee
1. Preparation before training
Specify the GPUs for training and prediction
from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import torch
device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for training
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for evaluation
Read and inspect the data
data = pd.read_table('./data/train.txt', header=None)  # columns: text \t label
data.columns = ['text', 'label']
text = [i for i in data['text']]
label = [i for i in data['label']]
# a column can be accessed as df.colname; value_counts() counts the occurrences of each label
df2 = data.label.value_counts()
print(df2)
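Before fixing a max_length for the tokenizer below, it can also help to look at the text length distribution; a quick sketch using only pandas:
# Character-length distribution of the texts (helps justify max_length=100 below)
print(data['text'].str.len().describe())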
Build the training data
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label, "input_ids": input_ids, "attention_mask": attention_mask}
        # print(sample)
        return sample
print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def text2token(text, tokenizer, max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids = text2id["input_ids"].tolist()
    attention_mask = text2id["attention_mask"].tolist()
    return input_ids, attention_mask
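To see what text2token returns, a quick check on a single sentence (the output is padded/truncated to max_length):
ids, mask = text2token(['今天天气不错'], tokenizer, max_length=100)
print(len(ids[0]), len(mask[0]))  # 100 100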
input_ids,attention_mask=text2token(text,tokenizer,max_length=100)
data['input_ids']=input_ids
data['attention_mask']=attention_mask
train_data = data.sample(frac=0.8)
test_data=data[~data.index.isin(train_data.index)]
print(len(train_data),len(test_data))
train_data=train_data.reset_index(drop=True)
test_data=test_data.reset_index(drop=True)
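A quick sanity check that indexing works after the split (the input_ids and attention_mask columns were added above):
sample = SentimentDataset(train_data)[0]
print(sample['label'], len(sample['input_ids']))  # one label plus 100 token ids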
print('DataLoader')
# split the data into batches of size batch_size
batch_size=16
train_loader = DataLoader(
    SentimentDataset(train_data),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(test_data),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)
import pickle
with open('train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)
with open('test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)
If you saved the loaders earlier, you can load them back directly:
import pickle
with open("train_loader.pkl", 'rb') as f:
    train_loader = pickle.load(f)
with open("test_loader.pkl", 'rb') as f:
    test_loader = pickle.load(f)
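Note that because the Dataset stores input_ids as a plain Python list, the default collate function returns it as a list of 100 per-position tensors, each of length batch_size; this is why the training and test code below call torch.stack(...).t() to recover a batch_size * 100 tensor. A quick check:
batch = next(iter(train_loader))
print(len(batch['input_ids']))                    # 100 per-position tensors
print(torch.stack(batch['input_ids']).t().shape)  # torch.Size([16, 100]) for a full batch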
2. Model definition, training, and testing code
Define the model
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
class fn_cls(nn.Module):
    def __init__(self, device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-chinese")
        self.model.resize_token_embeddings(len(tokenizer))  # needed if special tokens were added to the tokenizer
        self.model.to(device)
        # self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 1)
    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        # outputs[0]: last hidden states, e.g. torch.Size([8, 100, 768])
        # outputs[1]: pooled [CLS] output, e.g. torch.Size([8, 768])
        # outputs[0][:, 0, :] would also give the [CLS] vectors, torch.Size([8, 768])
        x = outputs[1]
        # x = self.dropout(x)
        x = self.l1(x)
        return x
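A minimal shape check on CPU (a sketch; cls_check is just a throwaway instance, and it assumes the pretrained weights can be downloaded):
cls_check = fn_cls(torch.device('cpu'))
enc = tokenizer(['测试'], max_length=100, padding='max_length', truncation=True, return_tensors='pt')
print(cls_check(enc['input_ids'], attention_mask=enc['attention_mask']).shape)  # torch.Size([1, 1]): one raw logit per sample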
# cls = fn_cls(device0)
# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
sigmoid = nn.Sigmoid()
criterion = nn.BCELoss()  # a weight tensor could be passed here to handle class imbalance
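As a side note, nn.BCEWithLogitsLoss folds the sigmoid into the loss and is more numerically stable; a sketch of the equivalent setup (not used in the code below):
# criterion = nn.BCEWithLogitsLoss()
# loss = criterion(raw_output, label)  # raw_output = cls(...) with no sigmoid applied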
Testing code
from sklearn import metrics
import numpy as np
from tqdm import tqdm
def test(device_test):
    cls.to(device_test)
    cls.eval()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(test_loader):
        with torch.no_grad():
            label = batch['label'].to(device_test).float().view(-1, 1)  # batch_size * 1
            label_all.append(label)
            input_ids = torch.stack(batch['input_ids']).t().to(device_test)  # batch_size * 100
            attention_mask = torch.stack(batch['attention_mask']).t().to(device_test)  # batch_size * 100
            # compute the output
            output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
            output = sigmoid(output)  # batch_size * 1
            total += len(output)
            # compute the loss
            loss = criterion(output, label)
            epoch_loss += loss.item()
            ave_loss = epoch_loss / total
            # round to 0/1
            output = output.round()
            output_all.append(output)
            # compute the accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrectly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc,
                    ave_loss
                ), end="\r")
    # final summary:
    print('correctly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc,
        ave_loss))
    # GPU tensors must be moved to CPU with Tensor.cpu() before converting to numpy
    output_all = torch.cat(output_all, 0)
    label_all = torch.cat(label_all, 0)
    output_all = np.array(output_all.cpu())
    label_all = np.array(label_all.cpu())
    acc_score = metrics.accuracy_score(label_all, output_all)
    print(metrics.classification_report(label_all, output_all))
    print("accuracy:", acc_score)
    return acc, epoch_loss
# test(device1)
Training code
train_acc_l=[]
train_epoch_loss_l=[]
test_acc_l=[]
test_epoch_loss_l=[]
def train_one_epoch(device_train, epoch_num):
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(train_loader):
        label = batch['label'].to(device_train).float().view(-1, 1)  # batch_size * 1
        input_ids = torch.stack(batch['input_ids']).t().to(device_train)  # batch_size * 100
        attention_mask = torch.stack(batch['attention_mask']).t().to(device_train)  # batch_size * 100
        # compute the output
        output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
        output = sigmoid(output)  # batch_size * 1
        # compute the loss and update the weights
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        with torch.no_grad():
            # round to 0/1
            output = output.round()
            output_all.append(output)
            label_all.append(label)
            total += len(output)
            # accumulate the epoch loss
            epoch_loss += loss.item()
            ave_loss = epoch_loss / total
            # compute the accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrectly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                    correct, total, 100. * acc,
                    ave_loss
                ), end="\r")
    # final summary:
    print('correctly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc,
        ave_loss))
    # GPU tensors must be moved to CPU with Tensor.cpu() before converting to numpy
    with torch.no_grad():
        output_all = torch.cat(output_all, 0)
        label_all = torch.cat(label_all, 0)
        output_all = np.array(output_all.cpu())
        label_all = np.array(label_all.cpu())
        acc_score = metrics.accuracy_score(label_all, output_all)
        # print(metrics.classification_report(label_all, output_all))
        # print("accuracy:", acc_score)
        test_acc, test_epoch_loss = test(device1)
        print('train_acc:', acc, 'train_epoch_loss:', epoch_loss, 'test_acc:', test_acc, 'test_epoch_loss:', test_epoch_loss)
        train_acc_l.append(acc)
        train_epoch_loss_l.append(epoch_loss)
        test_acc_l.append(test_acc)
        test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss
# train_one_epoch(device0,0)
3. Fine-tuning
import time
cls = fn_cls(device0)
from torch import optim
# cls=torch.load("./data/yxl_best.model",map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)
test(device1)
now_loss = 999
pre_epoch_loss = 9999
epoch = 0
# keep training while the test loss is still decreasing; save the best model each round
while now_loss < pre_epoch_loss:
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1
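The lists collected during training (train_acc_l, train_epoch_loss_l, test_acc_l, test_epoch_loss_l) can be plotted to inspect convergence; a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
plt.plot(train_epoch_loss_l, label='train loss')
plt.plot(test_epoch_loss_l, label='test loss')
plt.xlabel('epoch')
plt.ylabel('epoch loss')
plt.legend()
plt.show()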
4. Single and batch prediction
def predict(device, s_l, cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
        )
        input_ids = text2id["input_ids"].to(device)
        mask = text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        output1 = sigmoid(output)   # probabilities
        output2 = output1.round()   # 0/1 predictions
        return output1, output2
from tqdm import tqdm
def run(device, s_l, cls, bs):
    # bs is the batch size
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        len_ = len(s_l)
        all_end_lgs = []
        all_end = []
        for start in tqdm(range(0, len_, bs)):
            li_i = s_l[start:min(start + bs, len_)]
            text2id = tokenizer(
                li_i, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
            )
            input_ids = text2id["input_ids"].to(device)
            mask = text2id["attention_mask"].to(device)
            output = cls(input_ids, attention_mask=mask)
            output1 = sigmoid(output)   # probabilities
            output2 = output1.round()   # 0/1 predictions
            all_end_lgs = all_end_lgs + output1.tolist()
            all_end = all_end + output2.tolist()
        return all_end, all_end_lgs
Prediction example:
s = ['好好好好好好好',      # a clearly positive string ("good" repeated)
     '坏坏坏坏坏坏坏坏',]   # a clearly negative string ("bad" repeated)
print(predict(device1, s, cls)[1])
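Batch prediction over the same list with run() (bs=2 here is just an illustrative batch size):
labels, probs = run(device1, s, cls, bs=2)
print(labels)  # rounded 0/1 predictions
print(probs)   # sigmoid probabilities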