Fine-tuning BERT for binary text classification
The data used in this experiment can be downloaded here.
Full code: GitHub or Gitee
1. Preparation before training
Specify the GPUs for training and prediction
from torch.utils.data import DataLoader,TensorDataset
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import torch
device0 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for training
device1 = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")  # GPU for evaluation
Read and inspect the data
data = pd.read_table('./data/train.txt', header=None)  # columns: text \t label
data.columns = ['text', 'label']
text = [i for i in data['text']]
label = [i for i in data['label']]
# a column can be accessed as df.colname; value_counts() counts the occurrences of each label
df2 = data.label.value_counts()
print(df2)
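Before fixing a max_length for the tokenizer below, it can also help to look at the text length distribution; a quick sketch using only pandas:
# Character-length distribution of the texts (helps justify max_length=100 below)
print(data['text'].str.len().describe())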
Build the training data
class SentimentDataset(Dataset):
    def __init__(self, df):
        self.dataset = df
    def __len__(self):
        return len(self.dataset)
    def __getitem__(self, idx):
        text = self.dataset.loc[idx, "text"]
        label = self.dataset.loc[idx, "label"]
        input_ids = self.dataset.loc[idx, "input_ids"]
        attention_mask = self.dataset.loc[idx, "attention_mask"]
        sample = {"text": text, "label": label, "input_ids": input_ids, "attention_mask": attention_mask}
        # print(sample)
        return sample
print('text2token')
from transformers import AutoTokenizer, AutoModel
# added_token=['##char##']
# tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese",additional_special_tokens=added_token)
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
def text2token(text, tokenizer, max_length=100):
    text2id = tokenizer(
        text, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
    )
    input_ids = text2id["input_ids"].tolist()
    attention_mask = text2id["attention_mask"].tolist()
    return input_ids, attention_mask
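To see what text2token returns, a quick check on a single sentence (the output is padded/truncated to max_length):
ids, mask = text2token(['今天天气不错'], tokenizer, max_length=100)
print(len(ids[0]), len(mask[0]))  # 100 100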
input_ids,attention_mask=text2token(text,tokenizer,max_length=100)
data['input_ids']=input_ids
data['attention_mask']=attention_mask
train_data = data.sample(frac=0.8)
test_data=data[~data.index.isin(train_data.index)]
print(len(train_data),len(test_data))
train_data=train_data.reset_index(drop=True)
test_data=test_data.reset_index(drop=True)
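A quick sanity check that indexing works after the split (the input_ids and attention_mask columns were added above):
sample = SentimentDataset(train_data)[0]
print(sample['label'], len(sample['input_ids']))  # one label plus 100 token ids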
print('DataLoader')
# split the data into batches of size batch_size
batch_size=16
train_loader = DataLoader(
    SentimentDataset(train_data),
    batch_size=batch_size,
    shuffle=True,
    num_workers=0
)
test_loader = DataLoader(
    SentimentDataset(test_data),
    batch_size=batch_size,
    shuffle=False,
    num_workers=0
)
import pickle
with open('train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)
with open('test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)
If you saved the loaders earlier, you can load them back directly:
import pickle
with open("train_loader.pkl", 'rb') as f:
    train_loader = pickle.load(f)
with open("test_loader.pkl", 'rb') as f:
    test_loader = pickle.load(f)
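Note that because the Dataset stores input_ids as a plain Python list, the default collate function returns it as a list of 100 per-position tensors, each of length batch_size; this is why the training and test code below call torch.stack(...).t() to recover a batch_size * 100 tensor. A quick check:
batch = next(iter(train_loader))
print(len(batch['input_ids']))                    # 100 per-position tensors
print(torch.stack(batch['input_ids']).t().shape)  # torch.Size([16, 100]) for a full batch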
2. Model definition, training, and testing code
Define the model
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.nn.functional as F
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
class fn_cls(nn.Module):
    def __init__(self, device):
        super(fn_cls, self).__init__()
        self.model = AutoModel.from_pretrained("bert-base-chinese")
        self.model.resize_token_embeddings(len(tokenizer))  # needed if special tokens were added to the tokenizer
        self.model.to(device)
        # self.dropout = nn.Dropout(0.3)
        self.l1 = nn.Linear(768, 1)
    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        # outputs[0]: last hidden states, e.g. torch.Size([8, 100, 768])
        # outputs[1]: pooled [CLS] output, e.g. torch.Size([8, 768])
        # outputs[0][:, 0, :] would also give the [CLS] vectors, torch.Size([8, 768])
        x = outputs[1]
        # x = self.dropout(x)
        x = self.l1(x)
        return x
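A minimal shape check on CPU (a sketch; cls_check is just a throwaway instance, and it assumes the pretrained weights can be downloaded):
cls_check = fn_cls(torch.device('cpu'))
enc = tokenizer(['测试'], max_length=100, padding='max_length', truncation=True, return_tensors='pt')
print(cls_check(enc['input_ids'], attention_mask=enc['attention_mask']).shape)  # torch.Size([1, 1]): one raw logit per sample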
# cls = fn_cls(device0)
# from torch import optim
# optimizer = optim.Adam(cls.parameters(), lr=1e-4)
sigmoid = nn.Sigmoid()
criterion = nn.BCELoss()  # a weight tensor could be passed here to handle class imbalance
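As a side note, nn.BCEWithLogitsLoss folds the sigmoid into the loss and is more numerically stable; a sketch of the equivalent setup (not used in the code below):
# criterion = nn.BCEWithLogitsLoss()
# loss = criterion(raw_output, label)  # raw_output = cls(...) with no sigmoid applied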
Testing code
from sklearn import metrics
import numpy as np
from tqdm import tqdm
def test(device_test):
    cls.to(device_test)
    cls.eval()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(test_loader):
        with torch.no_grad():
            label = batch['label'].to(device_test).float().view(-1, 1)  # batch_size * 1
            label_all.append(label)
            input_ids = torch.stack(batch['input_ids']).t().to(device_test)  # batch_size * 100
            attention_mask = torch.stack(batch['attention_mask']).t().to(device_test)  # batch_size * 100
            # compute the output
            output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
            output = sigmoid(output)  # batch_size * 1
            total += len(output)
            # compute the loss
            loss = criterion(output, label)
            epoch_loss += loss.item()
            ave_loss = epoch_loss / total
            # round to 0/1
            output = output.round()
            output_all.append(output)
            # compute the accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrectly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(test_loader), 100. * batch_idx / len(test_loader),
                    correct, total, 100. * acc,
                    ave_loss
                ), end="\r")
    # final summary:
    print('correctly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc,
        ave_loss))
    # GPU tensors must be moved to CPU with Tensor.cpu() before converting to numpy
    output_all = torch.cat(output_all, 0)
    label_all = torch.cat(label_all, 0)
    output_all = np.array(output_all.cpu())
    label_all = np.array(label_all.cpu())
    acc_score = metrics.accuracy_score(label_all, output_all)
    print(metrics.classification_report(label_all, output_all))
    print("accuracy:", acc_score)
    return acc, epoch_loss
# test(device1)
Training code
train_acc_l=[]
train_epoch_loss_l=[]
test_acc_l=[]
test_epoch_loss_l=[]
def train_one_epoch(device_train, epoch_num):
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "start_______________")
    print("______________________________________________")
    print("______________________________________________")
    cls.to(device_train)
    cls.train()
    epoch_loss = 0
    total = 0
    correct = 0
    output_all = []
    label_all = []
    for batch_idx, batch in enumerate(train_loader):
        label = batch['label'].to(device_train).float().view(-1, 1)  # batch_size * 1
        input_ids = torch.stack(batch['input_ids']).t().to(device_train)  # batch_size * 100
        attention_mask = torch.stack(batch['attention_mask']).t().to(device_train)  # batch_size * 100
        # compute the output
        output = cls(input_ids, attention_mask=attention_mask)  # batch_size * 1
        output = sigmoid(output)  # batch_size * 1
        # compute the loss and update the weights
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        with torch.no_grad():
            # round to 0/1
            output = output.round()
            output_all.append(output)
            label_all.append(label)
            total += len(output)
            # accumulate the epoch loss
            epoch_loss += loss.item()
            ave_loss = epoch_loss / total
            # compute the accuracy
            add_correct = (output == label).sum().item()
            correct += add_correct
            acc = correct / total
            if batch_idx % 5 == 0:
                print('[{}/{} ({:.0f}%)]\tcorrectly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
                    batch_idx, len(train_loader), 100. * batch_idx / len(train_loader),
                    correct, total, 100. * acc,
                    ave_loss
                ), end="\r")
    # final summary:
    print('correctly classified: {}, total: {}, accuracy: {:.2f}%, ave_loss: {}'.format(
        correct, total, 100. * acc,
        ave_loss))
    # GPU tensors must be moved to CPU with Tensor.cpu() before converting to numpy
    with torch.no_grad():
        output_all = torch.cat(output_all, 0)
        label_all = torch.cat(label_all, 0)
        output_all = np.array(output_all.cpu())
        label_all = np.array(label_all.cpu())
        acc_score = metrics.accuracy_score(label_all, output_all)
        # print(metrics.classification_report(label_all, output_all))
        # print("accuracy:", acc_score)
        test_acc, test_epoch_loss = test(device1)
        print('train_acc:', acc, 'train_epoch_loss:', epoch_loss, 'test_acc:', test_acc, 'test_epoch_loss:', test_epoch_loss)
        train_acc_l.append(acc)
        train_epoch_loss_l.append(epoch_loss)
        test_acc_l.append(test_acc)
        test_epoch_loss_l.append(test_epoch_loss)
    print("______________________________________________")
    print("______________________________________________")
    print("_______________", epoch_num, "end_______________")
    print("______________________________________________")
    print("______________________________________________")
    return test_epoch_loss
# train_one_epoch(device0,0)
3. Fine-tuning
import time
cls = fn_cls(device0)
from torch import optim
# cls=torch.load("./data/yxl_best.model",map_location=device0)
optimizer = optim.Adam(cls.parameters(), lr=1e-4)
test(device1)
now_loss = 999
pre_epoch_loss = 9999
epoch = 0
# keep training while the test loss is still decreasing; save the best model each round
while now_loss < pre_epoch_loss:
    torch.save(cls, "./data/yxl_best.model")
    pre_epoch_loss = now_loss
    now_loss = train_one_epoch(device0, epoch)
    epoch += 1
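The lists collected during training (train_acc_l, train_epoch_loss_l, test_acc_l, test_epoch_loss_l) can be plotted to inspect convergence; a minimal sketch, assuming matplotlib is installed:
import matplotlib.pyplot as plt
plt.plot(train_epoch_loss_l, label='train loss')
plt.plot(test_epoch_loss_l, label='test loss')
plt.xlabel('epoch')
plt.ylabel('epoch loss')
plt.legend()
plt.show()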
4. Single and batch prediction
def predict(device, s_l, cls):
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        text2id = tokenizer(
            s_l, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
        )
        input_ids = text2id["input_ids"].to(device)
        mask = text2id["attention_mask"].to(device)
        output = cls(input_ids, attention_mask=mask)
        output1 = sigmoid(output)   # probabilities
        output2 = output1.round()   # 0/1 predictions
        return output1, output2
from tqdm import tqdm
def run(device, s_l, cls, bs):
    # bs is the batch size
    with torch.no_grad():
        cls.to(device)
        cls.eval()
        len_ = len(s_l)
        all_end_lgs = []
        all_end = []
        for start in tqdm(range(0, len_, bs)):
            li_i = s_l[start:min(start + bs, len_)]
            text2id = tokenizer(
                li_i, max_length=100, padding='max_length', truncation=True, return_tensors="pt"
            )
            input_ids = text2id["input_ids"].to(device)
            mask = text2id["attention_mask"].to(device)
            output = cls(input_ids, attention_mask=mask)
            output1 = sigmoid(output)   # probabilities
            output2 = output1.round()   # 0/1 predictions
            all_end_lgs = all_end_lgs + output1.tolist()
            all_end = all_end + output2.tolist()
        return all_end, all_end_lgs
Prediction example:
s = ['好好好好好好好',      # a clearly positive string ("good" repeated)
     '坏坏坏坏坏坏坏坏',]   # a clearly negative string ("bad" repeated)
print(predict(device1, s, cls)[1])
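Batch prediction over the same list with run() (bs=2 here is just an illustrative batch size):
labels, probs = run(device1, s, cls, bs=2)
print(labels)  # rounded 0/1 predictions
print(probs)   # sigmoid probabilities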