This is adapted from several tutorials and refactored into functions, so it should be a bit easier to follow.
Adjust the Dataset class to match your own dataset's format. Mine reads a csv file whose first column is text and second column is label; if you'd rather not change the code, convert your dataset to this layout instead (a sketch follows below).
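For reference, a minimal sketch of that layout; the texts and labels here are made-up examples, and train.csv is a hypothetical file name:

import pandas as pd

# Hypothetical two-class toy data; the column names must match the Dataset class below.
df = pd.DataFrame({
    'text': ['这家餐厅的菜很好吃', '服务态度太差了'],
    'label': [1, 0],
})
df.to_csv('train.csv', index=False)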
Download link for bert-base-chinese
Training code
import os

import pandas as pd
import torch
from sklearn.metrics import f1_score
from torch.optim import AdamW  # the AdamW in transformers is deprecated; use the torch.optim one
from tqdm import tqdm
from transformers import BertModel, BertTokenizer
class Dataset(torch.utils.data.Dataset):
    def __init__(self, path):
        self.dataset = pd.read_csv(path)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset.iloc[i]['text']
        label = self.dataset.iloc[i]['label']
        return text, label
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A single fully connected layer: 768 is BERT's hidden size, 2 is the number of classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # Use the frozen pretrained model purely as a feature extractor
        # ('pretrained' is the BERT model loaded in the main block below).
        with torch.no_grad():
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        # Feed the [CLS] vector from BERT's last hidden layer into the classifier head.
        out = self.fc(out.last_hidden_state[:, 0])
        # Normalize over the class dimension. Note that CrossEntropyLoss already applies
        # log-softmax internally, so feeding it probabilities (as this tutorial does)
        # still trains, just with dampened gradients.
        out = out.softmax(dim=1)
        return out
# Encode a batch of raw (text, label) pairs with the tokenizer.
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    # Encode
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=500,
                                       return_tensors='pt',
                                       return_length=True)
    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    return input_ids, attention_mask, token_type_ids, labels
def save_model(save_name):
    save_path = './bert_checkpoint'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    torch.save(model.state_dict(), os.path.join(save_path, save_name))
def test(data_test_path):
    correct = 0
    loss = 0
    f1 = 0
    total = 0
    loader_test = torch.utils.data.DataLoader(
        dataset=Dataset(data_test_path),
        batch_size=16,
        collate_fn=collate_fn,
        shuffle=True,
        drop_last=True
    )
    for index, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        with torch.no_grad():
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        loss += criterion(out, labels)
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)
        f1 += f1_score(labels.cpu().numpy(), out.cpu().numpy(), average='macro')
    accuracy = correct / total
    f1_av = f1 / (index + 1)
    loss_av = loss / (index + 1)  # average of per-batch mean losses
    print(f"correct: {correct}, total: {total}, test accuracy: {accuracy}, F1 score: {f1_av}, loss: {loss_av}")
    return accuracy
if __name__ == "__main__":
    data_train_path = ''  # path to the training set
    data_test_path = ''   # path to the validation set
    model_path = ''       # path to the pretrained model
    epoch = 10
    device = "cuda:0"  # change to match your GPU setup
    dataset = Dataset(data_train_path)
    # Every model ships with its own tokenizer.
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
    # Data loader
    loader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=16,
                                         collate_fn=collate_fn,
                                         shuffle=True,
                                         drop_last=True)
    # Load the pretrained model
    pretrained = BertModel.from_pretrained(model_path)
    pretrained.to(device)
    # Freeze the pretrained BERT's parameters, i.e. do not train them.
    for param in pretrained.parameters():
        param.requires_grad_(False)
    model = Model().to(device)
    # Training
    optimizer = AdamW(model.parameters(), lr=5e-4)  # AdamW optimizer
    criterion = torch.nn.CrossEntropyLoss()  # cross-entropy loss, used for classification
    model.train()
    best_accuracy = 0
    print("Evaluation before training:")
    accuracy = test(data_test_path)
    for now_epoch in range(epoch):
        for (input_ids, attention_mask, token_type_ids, labels) in tqdm(loader):
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # model prediction
            loss = criterion(out, labels)  # loss between prediction and ground-truth labels
            loss.backward()        # backpropagation
            optimizer.step()       # gradient descent step
            optimizer.zero_grad()  # reset gradients
        # Accuracy on the test set after each epoch
        accuracy = test(data_test_path)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            save_model(f'best_{now_epoch}.pt')
        save_model('last.pt')
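If you are unsure what collate_fn actually hands to the model, here is a quick standalone check of the tokenizer output; it is a sketch that assumes bert-base-chinese is available (swap in your local model_path), and the two sentences are made up:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # or your local model_path
batch = tokenizer.batch_encode_plus(['今天天气不错', '我不喜欢吃鸡蛋'],
                                    truncation=True,
                                    padding='max_length',
                                    max_length=16,
                                    return_tensors='pt')
print(batch['input_ids'].shape)    # torch.Size([2, 16])
print(batch['attention_mask'][0])  # 1 for real tokens, 0 for padding
print(batch['token_type_ids'][0])  # all 0 for single-sentence input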
Inference code
import torch
from transformers import BertModel, BertTokenizer
class Dataset(torch.utils.data.Dataset):
    # Wraps a single (text, label) pair so it can go through the DataLoader.
    def __init__(self, text, label):
        self.text = text
        self.label = label

    def __len__(self):
        return 1

    def __getitem__(self, i):
        return self.text, self.label
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # A single fully connected layer: 768 is BERT's hidden size, 2 is the number of classes.
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # Use the frozen pretrained model purely as a feature extractor
        # ('pretrained' is the BERT model loaded in the main block below).
        with torch.no_grad():
            out = pretrained(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )
        # Feed the [CLS] vector from BERT's last hidden layer into the classifier head.
        out = self.fc(out.last_hidden_state[:, 0])
        # Normalize over the class dimension.
        out = out.softmax(dim=1)
        return out
# Encode a batch of raw (text, label) pairs with the tokenizer.
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]
    # Encode
    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs=sents,
                                       truncation=True,
                                       padding='max_length',
                                       max_length=500,
                                       return_tensors='pt',
                                       return_length=True)
    # input_ids: the token ids after encoding
    # attention_mask: 0 at padded positions, 1 everywhere else
    input_ids = data['input_ids'].to(device)
    attention_mask = data['attention_mask'].to(device)
    token_type_ids = data['token_type_ids'].to(device)
    labels = torch.LongTensor(labels).to(device)
    return input_ids, attention_mask, token_type_ids, labels
def infer_single_data(text):
    # Use 0 as a dummy label: collate_fn calls torch.LongTensor on the labels,
    # which would fail on a string, and the label is never used for prediction.
    data = Dataset(text, 0)
    loader = torch.utils.data.DataLoader(data, batch_size=1, collate_fn=collate_fn)
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        with torch.no_grad():
            out = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        out = out.argmax(dim=1)
        predicted_label = out.item()
    return predicted_label
if __name__ == "__main__":
    model_path = ''  # path to the pretrained model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Every model ships with its own tokenizer.
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)
    # Load the pretrained model
    pretrained = BertModel.from_pretrained(model_path)
    pretrained.to(device)
    # Freeze the pretrained BERT's parameters, i.e. do not train them.
    for param in pretrained.parameters():
        param.requires_grad_(False)
    model = Model()
    # Load a checkpoint saved by the training script; adjust the file name to
    # whichever one you want (it saves best_<epoch>.pt and last.pt).
    model.load_state_dict(torch.load('bert_checkpoint/last.pt'))
    model.eval()
    model.to(device)
    test_text = "嗯,谢谢你介绍的做法很详细,但我不喜欢吃鸡蛋,有没有其他菜做法能介绍一下?"
    predicted_label = infer_single_data(test_text)
    if predicted_label == 0:
        print("label 0")  # category 1: replace with your own class name
    else:
        print("label 1")  # category 2: replace with your own class name