[Hands-on] Fine-tuning BERT for binary text classification

The data used in this experiment can be downloaded here.
Full code: GitHub / Gitee

1. Preparation before training

Specify the GPUs used for training and evaluation.

from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import torch

device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # device for training
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # device for evaluation

Read and inspect the data.


data = pd.read_table('./data/train.txt', header=None)  # two columns: text, label
data.columns = ['text', 'label']
text = [i for i in data['text']]
label = [i for i in data['label']]

# df.colname selects a column; value_counts() counts how many samples each label has
df2 = data.label.value_counts()
print(df2)

Build the training data.

class SentimentDataset(Dataset):
    def __init__(self, df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        # input_ids / attention_mask are stored as Python lists, so the default collate
        # function returns them as 100 tensors of shape [batch_size]; the training code
        # later stacks and transposes them back to [batch_size, 100]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label, "input_ids": input_ids, "attention_mask": attention_mask}
        # print(sample)
        return sample
        
print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def text2token(text,tokenizer,max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids=text2id["input_ids"].tolist()
    attention_mask=text2id["attention_mask"].tolist()
    return input_ids,attention_mask

input_ids,attention_mask=text2token(text,tokenizer,max_length=100)

data['input_ids']=input_ids
data['attention_mask']=attention_mask
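
Optionally, decoding one row back to text is a quick way to confirm the tokenization looks right (a sanity check, not part of the original pipeline):

print(tokenizer.decode(data.loc[0, 'input_ids'], skip_special_tokens=True))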

train_data = data.sample(frac=0.8)
test_data=data[~data.index.isin(train_data.index)]
print(len(train_data),len(test_data))
train_data=train_data.reset_index(drop=True)
test_data=test_data.reset_index(drop=True)

print('DataLoader')
# wrap the data in DataLoaders, batched by batch_size


batch_size=16
train_loader = DataLoader(
    SentimentDataset(train_data), 
    batch_size=batch_size, 
    shuffle=True, 
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(test_data), 
    batch_size=batch_size, 
    shuffle=False, 
    num_workers=0
)
import pickle
with open('train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)
with open('test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)

If the DataLoaders were saved earlier, they can be loaded back directly:

import pickle
with open("train_loader.pkl", 'rb') as f:
    train_loader = pickle.load(f)
with open("test_loader.pkl", 'rb') as f:
    test_loader = pickle.load(f)
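
Note that unpickling the DataLoader only works if SentimentDataset is importable in the session that loads it. A more robust alternative (just a sketch; the file names train_data.pkl / test_data.pkl are placeholders, not from the original code) is to save the preprocessed DataFrames and rebuild the loaders:

train_data.to_pickle('train_data.pkl')
test_data.to_pickle('test_data.pkl')

train_data = pd.read_pickle('train_data.pkl')
test_data = pd.read_pickle('test_data.pkl')
train_loader = DataLoader(SentimentDataset(train_data), batch_size=batch_size, shuffle=True, num_workers=0)
test_loader = DataLoader(SentimentDataset(test_data), batch_size=batch_size, shuffle=False, num_workers=0)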

2. Model definition, training, and evaluation code

Define the model.

from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
class fn_cls(nn.Module):
    def __init__(self, device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-chinese")
        # resize the embedding matrix in case special tokens were added to the tokenizer
        self.model.resize_token_embeddings(len(tokenizer))
        self.model.to(device)
        # self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 1)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        # outputs[0]: last_hidden_state, shape [batch_size, 100, 768]
        # outputs[1]: pooler_output, shape [batch_size, 768]
        # outputs[0][:, 0, :] would be the [CLS] hidden state, also [batch_size, 768]
        x = outputs[1]  # pooler_output
        # x = self.dropout(x)
        x = self.l1(x)
        return x
# cls = fn_cls(device0)

# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
sigmoid = nn.Sigmoid()
criterion = nn.BCELoss()#weight=weight
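
Optionally, a quick forward-pass sanity check (a sketch, not part of the original code; it assumes train_loader from step 1 is available and just verifies the output shape):

cls = fn_cls(device0)
cls.to(device0)  # make sure the linear head is on the same device as the inputs
batch = next(iter(train_loader))
input_ids = torch.stack(batch['input_ids']).t().to(device0)            # [batch_size, 100]
attention_mask = torch.stack(batch['attention_mask']).t().to(device0)  # [batch_size, 100]
with torch.no_grad():
    logits = cls(input_ids, attention_mask=attention_mask)
print(logits.shape)  # expected: torch.Size([batch_size, 1])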

Evaluation code.

from sklearn import metrics
import numpy as np
from tqdm import tqdm

def test(device_test):
    cls.to(device_test)
    cls.eval()

    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(test_loader):
        with torch.no_grad():
            label = batch['label'].to(device_test).float().view(-1, 1)  # batch_size * 1
            label_all.append(label)
            # collated input_ids / attention_mask are 100 tensors of shape [batch_size],
            # so stack + transpose gives [batch_size, 100]
            input_ids = torch.stack(batch['input_ids']).t().to(device_test)
            attention_mask = torch.stack(batch['attention_mask']).t().to(device_test)

            # forward pass
            output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
            output = sigmoid(output)  # batch_size * 1
            total += len(output)

            # compute the loss
            loss = criterion(output, label)
            epoch_loss += loss
            ave_loss = epoch_loss / total

            # round probabilities to 0/1
            output = output.round()
            output_all.append(output)

            # running accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total

            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc,
                    ave_loss
                    ), end="\r")

    # end-of-epoch summary
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    correct, total, 100. * acc,
                    ave_loss))

    # move tensors to CPU before converting to numpy
    # (cuda tensors cannot be converted to numpy directly)
    output_all = torch.cat(output_all, 0)
    label_all = torch.cat(label_all, 0)

    output_all = np.array(output_all.cpu())
    label_all = np.array(label_all.cpu())
    acc_score = metrics.accuracy_score(label_all, output_all)
    print(metrics.classification_report(label_all, output_all))
    print("accuracy:", acc_score)

    return acc, epoch_loss.item()

# test(device1)

Training code.

train_acc_l=[]
train_epoch_loss_l=[]
test_acc_l=[]
test_epoch_loss_l=[]

def train_one_epoch(device_train, epoch_num):
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()

    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(train_loader):
        label = batch['label'].to(device_train).float().view(-1, 1)  # batch_size * 1
        input_ids = torch.stack(batch['input_ids']).t().to(device_train)  # batch_size * 100
        attention_mask = torch.stack(batch['attention_mask']).t().to(device_train)  # batch_size * 100

        # forward pass
        output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
        output = sigmoid(output)  # batch_size * 1

        # compute the loss and update the parameters
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        with torch.no_grad():
            # round probabilities to 0/1
            output = output.round()
            output_all.append(output)
            label_all.append(label)
            total += len(output)

            # accumulate the epoch loss
            epoch_loss += loss
            ave_loss = epoch_loss / total

            # running accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total

            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrect: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                    correct, total, 100. * acc,
                    ave_loss
                    ), end="\r")

    # end-of-epoch summary
    print('correct: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    correct, total, 100. * acc,
                    ave_loss))

    # move tensors to CPU before converting to numpy
    with torch.no_grad():
        output_all = torch.cat(output_all, 0)
        label_all = torch.cat(label_all, 0)

        output_all = np.array(output_all.cpu())
        label_all = np.array(label_all.cpu())
        acc_score = metrics.accuracy_score(label_all, output_all)

    # print(metrics.classification_report(label_all, output_all))
    # print("accuracy:", acc_score)

    test_acc, test_epoch_loss = test(device1)
    print('train_acc:', acc, 'train_epoch_loss:', epoch_loss.item(), 'test_acc:', test_acc, 'test_epoch_loss:', test_epoch_loss)
    train_acc_l.append(acc)
    train_epoch_loss_l.append(epoch_loss.item())
    test_acc_l.append(test_acc)
    test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss
    
# train_one_epoch(device0,0)

3. Fine-tuning

import time

cls = fn_cls(device0)

from torch import optim
# cls=torch.load("./data/yxl_best.model",map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)
test(device1)
now_loss = 999
pre_epoch_loss = 9999
epoch = 0
# train until the test loss stops decreasing; the checkpoint is saved at the start of
# each iteration, so yxl_best.model holds the model before the last (non-improving) epoch
while now_loss < pre_epoch_loss:
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1
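
When the loop exits, ./data/yxl_best.model holds the checkpoint saved before the last (non-improving) epoch. A minimal sketch for reloading it and re-running evaluation (fn_cls must be defined in the current session; on newer PyTorch versions torch.load may additionally need weights_only=False for full-model checkpoints):

cls = torch.load("./data/yxl_best.model", map_location=device1)
test(device1)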

    

4. Prediction and batch prediction

def predict(device,s_l,cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
        )
        input_ids=text2id["input_ids"].to(device)
        mask=text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        output1=sigmoid(output)
        output2=output1.round()
        return output1,output2
from tqdm import tqdm
def run(device, s_l, cls, bs):
    # bs is the batch size
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        len_ = len(s_l)
        all_end_lgs = []
        all_end = []
        for start in tqdm(range(0, len_, bs)):
            li_i = s_l[start:min(start+bs, len_)]
            text2id = tokenizer(
                li_i, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
                )
            input_ids=text2id["input_ids"].to(device)
            mask=text2id["attention_mask"].to(device)
            output = cls(input_ids, attention_mask=mask)
            output1=sigmoid(output)
            output2=output1.round()
            all_end_lgs = all_end_lgs + output1.tolist()
            all_end = all_end + output2.tolist()
    return all_end,all_end_lgs
        
    

Prediction example:

s = ['好好好好好好好',
'坏坏坏坏坏坏坏坏',]
print(predict(device1,s,cls)[1])
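
Batch prediction with run works the same way; bs is the batch size (a usage sketch based on the function above):

preds, probs = run(device1, s, cls, bs=32)
print(preds)  # rounded 0/1 predictions
print(probs)  # sigmoid probabilities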