Approach:
1. Feature engineering: inspect the text length distribution and pad/truncate every text to a fixed length (pad short texts, truncate long ones). Map the class labels to integers.
2. Dataset design: each sample must return at least labels, input_ids, and attention_mask.
3. Model: load a pretrained BERT model and add a linear classification head.
4. Training setup: epochs, optimizer, loss, model, scheduler, train_dl, test_dl, device.
5. Adapt the training function: an existing training-function template (originally written for image classification; given in step 5 below) is placed in a utils package and modified to fit this model.
6. Inspect results: plot the train_acc, train_loss, test_acc, and test_loss curves.
Project directory layout:
utils is a custom helper package containing the training function start_train_and_test and the utility module utils. The main folder holds the config file, the dataset module, the model module, and the main script train.py.
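Based on that description and the imports used below, the layout is roughly:
project/
├── utils/
│   ├── utils.py                  # read_config(), exposes config
│   └── start_train_and_test.py   # training/evaluation loop
├── config.txt                    # hyperparameters as key=value lines
├── dataset.py                    # mydataset (Dataset subclass)
├── model.py                      # mymodel (BERT + linear head)
├── train.py                      # main script
└── train.csv                     # raw data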
Dataset preview: the task is multi-class text classification with 12 classes in total.
1. Feature engineering and data inspection
import torch
import warnings
import pandas as pd
from utils.utils import config
warnings.filterwarnings('ignore')
train_df = pd.read_csv('train.csv', sep='\t', header=None)
label_name = list(set(train_df.iloc[:, 1]))        # distinct label strings
label_pipeline = lambda x: label_name.index(x)     # string label -> integer id
train_df['labels'] = train_df[1].apply(label_pipeline)
print(train_df.head())
The rightmost column is the mapped label.
train_df.iloc[:, 1] holds all the original label strings; set() keeps only the distinct values, which are then turned into a list, and each label's position in that list becomes its integer id. Note that set iteration order is not guaranteed across runs, so sorting it (sorted(set(...))) makes the mapping reproducible.
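A minimal sketch of the mapping on made-up label strings ('sports', 'finance', 'tech' are hypothetical):
labels = ['sports', 'finance', 'sports', 'tech']
label_name = sorted(set(labels))                 # sorted for a reproducible mapping
label_pipeline = lambda x: label_name.index(x)
print(label_name)                                # ['finance', 'sports', 'tech']
print([label_pipeline(x) for x in labels])       # [1, 0, 1, 2]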
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
model_path = r'F:\stu-ch\python\PLMs\bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(model_path)
token_lens = []
for txt in train_df[0]:
    token_id = tokenizer(txt)['input_ids']
    token_lens.append(len(token_id))
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'   # works around a duplicate-OpenMP crash on Windows
plt.hist(token_lens, bins=100)
plt.show()
This feature-engineering step checks the token length distribution. The histogram is shown below:
Based on the distribution, truncating the texts to 40 tokens is enough, and 40 is not long anyway. The common default is 128: truncate each sentence at the 128th token, padding shorter ones and truncating longer ones.
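A quick check of what padding/truncation does (the sample sentence is arbitrary; max_length=40 matches the choice above):
enc = tokenizer('今天天气不错', padding='max_length', truncation=True, max_length=40)
print(len(enc['input_ids']))       # 40: a short text is padded with [PAD] up to max_length
print(len(enc['attention_mask']))  # 40: 1 for real tokens, 0 for padding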
2. Dataset design
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from utils.utils import config

tokenizer = AutoTokenizer.from_pretrained(config['model_path'])

class mydataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        text = self.data[0][idx]
        label = self.data['labels'][idx]
        encoding = tokenizer.encode_plus(
            text,
            padding='max_length',             # pad every sample to max_length
            truncation=True,                  # cut longer samples at max_length
            add_special_tokens=True,          # prepend [CLS], append [SEP]
            max_length=int(config['max_len']),
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'texts': text,
            'input_ids': encoding['input_ids'].flatten(),          # (max_len,)
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
Note the padding='max_length' and truncation=True arguments: without them the samples in a batch have different lengths (some short, some long) and batching fails with an error. The above is the Dataset class.
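A quick sanity check on one sample (assuming train_df from the feature-engineering step is in scope):
ds = mydataset(train_df)
sample = ds[0]
print(sample['input_ids'].shape)       # torch.Size([max_len])
print(sample['attention_mask'].shape)  # torch.Size([max_len])
print(sample['labels'])                # tensor(k) for some class id k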
For how tokenizer.encode_plus differs from the other tokenizer entry points, see this article:
tokenizer,tokenizer.tokenize,tokenizer.encode,tokenizer.encode_plus它们到底有什么不一样?_Howard_DL的博客-CSDN博客
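In short (a sketch; the sample sentence is arbitrary):
tokens = tokenizer.tokenize('今天天气不错')   # list of subword strings, no special tokens
ids = tokenizer.encode('今天天气不错')        # list of token ids, with [CLS]/[SEP] added
enc = tokenizer.encode_plus('今天天气不错')   # dict: input_ids, attention_mask, ...
print(tokens)
print(ids)
print(enc.keys())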
Below is the main script:
from dataset import *
full_dataset = mydataset(train_df)   # a new name avoids shadowing the mydataset class
from torch.utils.data import random_split
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size   # avoids a size mismatch when the length is not divisible by 5
train_data, test_data = random_split(full_dataset, [train_size, test_size])
from torch.utils.data import DataLoader
train_dl = DataLoader(train_data, batch_size=64, shuffle=True)   # shuffle the training set each epoch
test_dl = DataLoader(test_data, batch_size=64, shuffle=False)
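Inspecting one batch confirms the shapes the model will receive:
batch = next(iter(train_dl))
print(batch['input_ids'].shape)       # torch.Size([64, max_len])
print(batch['attention_mask'].shape)  # torch.Size([64, max_len])
print(batch['labels'].shape)          # torch.Size([64])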
3. Model: load the BERT model and add a linear classification head
import torch.nn as nn
from transformers import AutoModel
from utils.utils import config

class mymodel(nn.Module):
    def __init__(self):
        super(mymodel, self).__init__()
        self.bert = AutoModel.from_pretrained(config['model_path'])
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, int(config['n_classes']))  # 12 classes here
    def forward(self, input_ids, attention_mask):
        # return_dict=False makes BERT return a tuple; the second element is the pooled [CLS] output
        _, pooled_output = self.bert(input_ids=input_ids,
                                     attention_mask=attention_mask,
                                     return_dict=False)
        output = self.drop(pooled_output)
        return self.out(output)
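A forward-pass smoke test (reusing the batch inspected above; purely a sketch):
model = mymodel()
logits = model(batch['input_ids'], batch['attention_mask'])
print(logits.shape)   # torch.Size([64, n_classes]), i.e. (64, 12)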
4. Training setup: epochs, optimizer, loss, model, scheduler, train_dl, test_dl, device
from model import mymodel
import torch.optim as optim
import torch.nn as nn
import matplotlib.pyplot as plt
from transformers import get_linear_schedule_with_warmup
from utils.start_train_and_test import start_train_and_test

model = mymodel()
opt = optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dl) * int(config['epoch'])   # one scheduler step per batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
scheduler = get_linear_schedule_with_warmup(opt, num_warmup_steps=int(0.01 * total_steps),
                                            num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)
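To see what the schedule does before training, one can probe it with a throwaway optimizer and scheduler (a sketch; the real training state is untouched):
probe_opt = optim.AdamW(model.parameters(), lr=2e-5)
probe_sched = get_linear_schedule_with_warmup(probe_opt,
                                              num_warmup_steps=int(0.01 * total_steps),
                                              num_training_steps=total_steps)
lrs = []
for _ in range(total_steps):
    probe_opt.step()            # a scheduler step must follow an optimizer step
    probe_sched.step()
    lrs.append(probe_sched.get_last_lr()[0])
plt.plot(lrs)                   # linear ramp-up over the first 1% of steps, then linear decay to 0
plt.xlabel('step'); plt.ylabel('learning rate')
plt.show()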
5. Adapt the training function: an existing training-function template (mainly used for image classification) is reused. First the near-universal template, then the small changes needed for this model.
#%% Template function
import torch

def start_train_and_test(epochs, device, model, train_dl, test_dl, loss_fn, opt):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
    for epoch in range(epochs):
        model.train()
        epoch_train_acc, epoch_train_loss = train(train_dl, model, loss_fn, opt, device)
        model.eval()
        epoch_test_acc, epoch_test_loss = test(test_dl, model, loss_fn, device)
        train_acc.append(epoch_train_acc)
        train_loss.append(epoch_train_loss)
        test_acc.append(epoch_test_acc)
        test_loss.append(epoch_test_loss)
        template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}')
        print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss))
    print('Done')
    return {'train_acc': train_acc, 'train_loss': train_loss, 'test_acc': test_acc, 'test_loss': test_loss}
#%% train()
def train(dataloader, model, loss_fn, optimizer, device):
    size = len(dataloader.dataset)   # size of the training set (60000 images in the original image task)
    num_batches = len(dataloader)    # number of batches (1875 = 60000/32)
    train_loss, train_acc = 0, 0
    for X, y in dataloader:          # fetch images and labels
        X, y = X.to(device), y.to(device)
        # forward pass
        pred = model(X)              # network output
        loss = loss_fn(pred, y)      # loss between prediction and ground truth
        # backward pass
        optimizer.zero_grad()        # reset gradients
        loss.backward()              # backpropagate
        optimizer.step()             # update weights
        # accumulate acc and loss
        train_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
        train_loss += loss.item()
    train_acc /= size
    train_loss /= num_batches
    return train_acc, train_loss
#%% test()
def test(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)   # size of the test set (10000 images in the original image task)
    num_batches = len(dataloader)    # number of batches (313 = ceil(10000/32))
    test_loss, test_acc = 0, 0
    # no gradient tracking during evaluation: saves memory and compute
    with torch.no_grad():
        for imgs, target in dataloader:
            imgs, target = imgs.to(device), target.to(device)
            target_pred = model(imgs)
            loss = loss_fn(target_pred, target)
            test_loss += loss.item()
            test_acc += (target_pred.argmax(1) == target).type(torch.float).sum().item()
    test_acc /= size
    test_loss /= num_batches
    return test_acc, test_loss
The main changes are the batch unpacking inside train() and test() (each batch is now a dict rather than an (X, y) tuple), plus stepping the scheduler once per batch, since get_linear_schedule_with_warmup counts optimizer steps and total_steps was computed as len(train_dl) * epochs:
for _, data in enumerate(dataloader):   # each batch is a dict, not an (X, y) tuple
    X = data['input_ids']
    x2 = data['attention_mask']
    y = data['labels']
    X, x2, y = X.to(device), x2.to(device), y.to(device)
The final code is:
import torch

def start_train_and_test(epochs, device, model, train_dl, test_dl, loss_fn, opt, scheduler):
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
    for epoch in range(epochs):
        model.train()
        # the scheduler is passed into train() and stepped once per batch there
        epoch_train_acc, epoch_train_loss = train(train_dl, model, loss_fn, opt, device, scheduler)
        model.eval()
        epoch_test_acc, epoch_test_loss = test(test_dl, model, loss_fn, device)
        train_acc.append(epoch_train_acc)
        train_loss.append(epoch_train_loss)
        test_acc.append(epoch_test_acc)
        test_loss.append(epoch_test_loss)
        lr = opt.state_dict()['param_groups'][0]['lr']
        template = ('Epoch:{:2d}, Train_acc:{:.1f}%, Train_loss:{:.3f}, Test_acc:{:.1f}%, Test_loss:{:.3f}, lr:{:.2E}')
        print(template.format(epoch+1, epoch_train_acc*100, epoch_train_loss, epoch_test_acc*100, epoch_test_loss, lr))
    print('Done')
    return {'train_acc': train_acc, 'train_loss': train_loss, 'test_acc': test_acc, 'test_loss': test_loss}
#%%
def train(dataloader, model, loss_fn, optimizer, device, scheduler):
    size = len(dataloader.dataset)   # number of training samples
    num_batches = len(dataloader)    # number of batches
    model.to(device)
    train_loss, train_acc = 0, 0
    for _, data in enumerate(dataloader):   # each batch is a dict from mydataset
        X = data['input_ids']
        x2 = data['attention_mask']
        y = data['labels']
        X, x2, y = X.to(device), x2.to(device), y.to(device)
        # forward pass
        pred = model(X, x2)
        loss = loss_fn(pred, y)
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()   # the warmup schedule expects one step per batch
        # accumulate acc and loss
        train_acc += (pred.argmax(1) == y).type(torch.float).sum().item()
        train_loss += loss.item()
    train_acc /= size
    train_loss /= num_batches
    return train_acc, train_loss
#%%
def test(dataloader, model, loss_fn, device):
    size = len(dataloader.dataset)   # number of test samples
    num_batches = len(dataloader)
    test_loss, test_acc = 0, 0
    model.to(device)
    # no gradient tracking during evaluation
    with torch.no_grad():
        for _, data in enumerate(dataloader):
            X = data['input_ids']
            x2 = data['attention_mask']
            target = data['labels']
            X, x2, target = X.to(device), x2.to(device), target.to(device)
            target_pred = model(X, x2)
            loss = loss_fn(target_pred, target)
            test_loss += loss.item()
            test_acc += (target_pred.argmax(1) == target).type(torch.float).sum().item()
    test_acc /= size
    test_loss /= num_batches
    return test_acc, test_loss
In the main script, a single call runs everything:
history=start_train_and_test(int(config['epoch']),device,model,train_dl,test_dl,loss_fn,opt,scheduler)
config holds all hyperparameters. It is a plain config.txt file read by utils.utils.
Note: no spaces around the equals sign, or parsing fails; paths must not be quoted. model_path is the local path of the downloaded pretrained model, or simply bert-base-chinese. Below is the utils/utils.py file that reads config.txt:
def read_config():
    config = {}
    with open('config.txt', encoding='utf-8') as f:
        for line in f:
            if '=' not in line:
                continue
            key, value = line.strip().split('=', 1)   # split only on the first '='
            config[key] = value
    return config

config = read_config()
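For reference, a config.txt matching the keys used above could look like this (the epoch value is illustrative):
model_path=F:\stu-ch\python\PLMs\bert-base-chinese
max_len=40
n_classes=12
epoch=10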
6. Inspect results: plot the train_acc, train_loss, test_acc, test_loss curves.
# history is the dict returned by start_train_and_test; its values are already Python floats
train_acc = history['train_acc']
test_acc = history['test_acc']
plt.plot(train_acc, label='train accuracy')
plt.plot(test_acc, label='test accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0.7, 1])
plt.show()
# %%
print(train_acc)
print(test_acc)
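The loss curves come from the same history dict in the same way:
plt.plot(history['train_loss'], label='train loss')
plt.plot(history['test_loss'], label='test loss')
plt.title('Training history')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()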
The results are shown below.
Dataset:
Link: https://pan.baidu.com/s/1GXAXCtR0D_ibdyz0Y-botw
Extraction code: 1c6f
Deep learning every week, 365 days a year. Feel free to contact me if you'd like guided practice.