Implementing BERT Text Classification with the huggingface/transformers (PyTorch) Framework

Background

After implementing a BERT text-classification model with bert_keras, I found that training could not take advantage of the GPU, so I wanted to implement BERT text classification with the huggingface/transformers framework instead. However, because I was unfamiliar with its expected model input format and its API has no Chinese documentation, the implementation still took quite a bit of effort, so I am sharing ready-to-use code here. The huggingface/transformers source code is at https://github.com/huggingface/transformers.

Project Structure

The project structure is as follows:
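A layout consistent with the relative paths used in the code below (the names of the code directories are my assumption, inferred from those paths; only data/, preTrain_model/ and the file names appear in the original):

bert_nlpc/
├── data/
│   ├── train.csv
│   └── validation.csv
├── preTrain_model/
│   └── gongdan_step5000_ml128/
│       ├── vocab.txt
│       ├── bert_config.json
│       └── bert_model.bin        # converted from the TF checkpoint (see below)
└── src/                          # assumed name
    ├── trained_model/            # fine-tuned models are written here (see saveModel_path)
    └── bert/                     # assumed name; the training script below runs from here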

Installing Dependencies

The main caveat is on Windows, where PyTorch cannot be installed with a plain pip install; the installation commands are as follows:

# Windows:
pip install torch===1.6.0 torchvision===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
# Linux:
pip install torch torchvision
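After installation, it is worth confirming that PyTorch can actually see the GPU before training; a minimal check:

import torch
print(torch.__version__)          # e.g. 1.6.0
print(torch.cuda.is_available())  # should print True if the GPU can be used for training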

Data and Pretrained Model

Data

The data is stored in a CSV file (once you understand the data-processing code you can switch to another storage format). The format is as follows: the first column is the label and the second column is the text.
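A few illustrative rows (the labels and sentences are made-up examples that only show the format, not data from the original project):

0,这家餐厅的服务态度很好
1,物流太慢了,等了一个星期才到
2,手机屏幕显示效果不错

Note that load_data() below reads the file with encoding='gbk', so the CSV should be saved in GBK (or the encoding argument adjusted).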

Pretrained Model

I did not do the pretraining with the transformers framework, but loading the model in PyTorch requires the weights to be stored in .bin format, so the TensorFlow ckpt checkpoint has to be converted to .bin first. The commands are as follows:

cd bert_nlpc/preTrain_model  # change into the directory containing bert_model.ckpt
transformers-cli convert --model_type bert \
  --tf_checkpoint bert_model.ckpt \
  --config bert_config.json \
  --pytorch_dump_output pytorch_model.bin
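To sanity-check the conversion, you can try loading the converted weights the same way the training script below does (a minimal sketch; adjust the file names to wherever your converted files live):

from transformers import BertConfig, BertForSequenceClassification

config = BertConfig.from_json_file('bert_config.json')
model = BertForSequenceClassification.from_pretrained('pytorch_model.bin', config=config)
print(model.num_parameters())  # prints the parameter count if the weights loaded correctly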

Code

import random
import numpy as np
import torch
import csv
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer,BertConfig,BertForSequenceClassification,AdamW,AutoTokenizer,AutoModel
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

class BertModle:
    def __init__(self,train,validation,vocab_path,config_path,pretrain_Model_path,saveModel_path,learning_rate,n_class,epochs,batch_size,val_batch_size,max_len,gpu=True):
        self.n_class = n_class  # number of classes
        self.max_len = max_len  # maximum sentence length
        self.lr = learning_rate  # learning rate
        self.tokenizer = BertTokenizer.from_pretrained(vocab_path)  # load the tokenizer
        self.train = self.load_data(train)  # load the training set
        self.validation = self.load_data(validation)  # load the validation set
        self.epochs = epochs
        self.batch_size = batch_size  # batch size for training
        self.val_batch_size = val_batch_size  # batch size for validation
        self.saveModel_path = saveModel_path  # where to save the trained model
        self.gpu = gpu  # whether to use the GPU
        config = BertConfig.from_json_file(config_path)  # load the BERT model configuration
        config.num_labels = n_class  # set the number of output classes for the classifier
        self.model = BertForSequenceClassification.from_pretrained(pretrain_Model_path,config=config)  # load the BERT classification model
        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'

    def encode_fn(self,text_list):
        # Convert text_list into the input tensors the BERT model expects
        # e.g. text_list = ['我爱你','猫不是狗']
        tokenizer = self.tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt'  # return PyTorch tensors
        )
        input_ids = tokenizer['input_ids']
        token_type_ids = tokenizer['token_type_ids']
        attention_mask = tokenizer['attention_mask']
        return input_ids,token_type_ids,attention_mask

    def load_data(self,path):
        # Load training/validation data; only handles CSV files
        text_list = []
        labels = []
        for line in csv.reader(open(path,encoding='gbk')):
            label = int(line[0])  # change this index to wherever the label column is
            text = line[1]
            text_list.append(text)
            labels.append(label)
        input_ids,token_type_ids,attention_mask = self.encode_fn(text_list)
        labels = torch.tensor(labels)
        data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
        return data

    def load_data_predict(self,path):
        # Load a CSV file for prediction with the trained classifier
        text_list = []
        labels = []
        for line in csv.reader(open(path, encoding='gbk')):
            text = line[1]
            text_list.append(text)
            label = int(line[0])
            labels.append(label)
        return text_list,labels

    def flat_accuracy(self, preds, labels):
        """A function for calculating accuracy scores"""
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return accuracy_score(labels_flat, pred_flat)

    def train_model(self):
        # Train the classification model
        if self.gpu:
            self.model.cuda()
        optimizer = AdamW(self.model.parameters(), lr=self.lr)
        trainData = DataLoader(self.train, batch_size = self.batch_size, shuffle = True)  # wrap the dataset into shuffled batches
        valData = DataLoader(self.validation, batch_size = self.val_batch_size, shuffle = True)

        total_steps = len(trainData) * self.epochs
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        for epoch in range(self.epochs):
            self.model.train()
            total_loss, total_val_loss = 0, 0
            total_eval_accuracy = 0
            print('epoch:' , epoch , ', step_number:' , len(trainData))
            # training loop
            for step,batch in enumerate(trainData):
                self.model.zero_grad()

                # The model returns the loss and the raw logits for each class; applying
                # softmax to the logits gives the predicted class probabilities. Slicing the
                # output works whether the model returns a tuple (older transformers) or a
                # ModelOutput object (newer transformers).
                outputs = self.model(input_ids=batch[0].to(self.device),
                                     token_type_ids=batch[1].to(self.device),
                                     attention_mask=batch[2].to(self.device),
                                     labels=batch[3].to(self.device))
                loss, logits = outputs[:2]
                total_loss += loss.item()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                if step % 10 == 0 and step > 0:  # report training accuracy every 10 steps; flat_accuracy() just takes the argmax of the logits
                    # no need to switch to eval mode here: the logits are already computed, and
                    # calling eval() would leave dropout disabled for the rest of the epoch
                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].numpy()  # the labels in the batch are still on the CPU
                    avg_val_accuracy = self.flat_accuracy(logits, label_ids)
                    print('step:' , step)
                    print(f'Accuracy: {avg_val_accuracy:.4f}')
                    print('\n')
            # at the end of each epoch, evaluate the model on the validation set
            self.model.eval()
            print('testing ....')
            for i, batch in enumerate(valData):
                with torch.no_grad():
                    outputs = self.model(input_ids=batch[0].to(self.device),
                                         token_type_ids=batch[1].to(self.device),
                                         attention_mask=batch[2].to(self.device),
                                         labels=batch[3].to(self.device))
                    loss, logits = outputs[:2]
                    total_val_loss += loss.item()

                    logits = logits.detach().cpu().numpy()
                    label_ids = batch[3].numpy()
                    total_eval_accuracy += self.flat_accuracy(logits, label_ids)

            avg_train_loss = total_loss / len(trainData)
            avg_val_loss = total_val_loss / len(valData)
            avg_val_accuracy = total_eval_accuracy / len(valData)

            print(f'Train loss     : {avg_train_loss}')
            print(f'Validation loss: {avg_val_loss}')
            print(f'Accuracy: {avg_val_accuracy:.4f}')
            print('\n')
            self.save_model(self.saveModel_path + '-' + str(epoch))

    def save_model(self , path):
        # Save both the tokenizer and the classification model
        self.model.save_pretrained(path)
        self.tokenizer.save_pretrained(path)

    def load_model(self,path):
        # Load the tokenizer and the classification model
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = BertForSequenceClassification.from_pretrained(path)
        return tokenizer,model

    def eval_model(self,Tokenizer, model,text_list,y_true):
        # Print the model's recall, precision and f1-score
        preds = self.predict_batch(Tokenizer, model, text_list)
        print(classification_report(y_true,preds))

    def predict_batch(self, Tokenizer, model, text_list):
        tokenizer = Tokenizer(
            text_list,
            padding = True,
            truncation = True,
            max_length = self.max_len,
            return_tensors='pt'  # return PyTorch tensors
        )
        input_ids = tokenizer['input_ids']
        token_type_ids = tokenizer['token_type_ids']
        attention_mask = tokenizer['attention_mask']
        pred_data = TensorDataset(input_ids,token_type_ids,attention_mask)
        pred_dataloader = DataLoader(pred_data, batch_size=self.batch_size, shuffle=False)
        model = model.to(self.device)
        model.eval()
        preds = []
        for i, batch in enumerate(pred_dataloader):
            with torch.no_grad():
                outputs = model(input_ids=batch[0].to(self.device),
                                token_type_ids=batch[1].to(self.device),
                                attention_mask=batch[2].to(self.device)
                                )
                logits = outputs[0]
                logits = logits.detach().cpu().numpy()
                preds += list(np.argmax(logits, axis=1))
        return preds

if __name__ == '__main__':
    epoch = 3
    # the pretrained model is stored under ../../preTrain_model/gongdan_step5000_ml128/
    # the trained classifier and tokenizer are saved under ../trained_model/bert_model/gongdan_step5000_ml128/
    model_file = 'preTrain_model'
    trained_model_file = 'bert_model'
    model_name = 'gongdan_step5000_ml128'
    bert_model = BertModle(
        train = '../../data/train.csv',
        validation = '../../data/validation.csv',
        vocab_path = '../../'+ model_file +'/'+ model_name +'/vocab.txt',
        config_path = '../../' + model_file + '/'+ model_name +'/bert_config.json',
        pretrain_Model_path = '../../'+ model_file +'/'+ model_name +'/bert_model.bin',
        saveModel_path = '../trained_model/'+ trained_model_file+'/'+model_name,
        learning_rate = 2e-5,
        n_class = 8,
        epochs = epoch,
        batch_size = 4,
        val_batch_size = 4,
        max_len = 100 ,
        gpu = True
    )
    bert_model.train_model()
    Tokenizer,model = bert_model.load_model(bert_model.saveModel_path + '-'+str(epoch-1))
    text_list,y_true = bert_model.load_data_predict('../../data/validation.csv')
    bert_model.eval_model(Tokenizer, model,text_list,y_true)
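Once training has finished, the saved model can also be reused for ad-hoc prediction without retraining, by calling predict_batch directly. A minimal sketch (the sentences here are made-up placeholders; it assumes bert_model and epoch from the script above are still in scope):

Tokenizer, model = bert_model.load_model(bert_model.saveModel_path + '-' + str(epoch - 1))
new_texts = ['这个产品质量很好', '物流速度太慢了']  # hypothetical sentences to classify
preds = bert_model.predict_batch(Tokenizer, model, new_texts)
print(list(zip(new_texts, preds)))  # predicted class index for each sentence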