Fine-tuning BERT for Chinese Text Classification (PyTorch Implementation)

BERT (Bidirectional Encoder Representations from Transformers) is a pre-trained model proposed by Google AI in October 2018. Its architecture is a stack of Transformer encoder layers. After pre-training on large corpora, it can be transferred to downstream tasks, where fine-tuning the parameters alone yields a significant performance boost. This post focuses on the code for fine-tuning a Chinese BERT model on a news-title classification task; for more on how BERT works, see The Illustrated BERT (图解BERT).

Reading the Data

The dataset is sampled from the Tsinghua THUCNews news-title classification dataset. It has 10 classes with balanced samples per class: 180k titles for training, 20k for development, and 20k for testing.

import pandas as pd

# Local data paths
train_data_path = 'data/THUCNewsPart/train.txt'
dev_data_path = 'data/THUCNewsPart/dev.txt'
test_data_path = 'data/THUCNewsPart/test.txt'
label_path = 'data/THUCNewsPart/class.txt'

# Read the data
train_df = pd.read_csv(train_data_path, sep='\t', header=None)
dev_df = pd.read_csv(dev_data_path, sep='\t', header=None)
test_df = pd.read_csv(test_data_path, sep='\t', header=None)

# Rename the columns
new_columns = ['text', 'label']  
train_df = train_df.rename(columns=dict(zip(train_df.columns, new_columns)))
dev_df = dev_df.rename(columns=dict(zip(dev_df.columns, new_columns)))
test_df = test_df.rename(columns=dict(zip(test_df.columns, new_columns)))

# Read the label names
real_labels = []
with open(label_path, 'r', encoding='utf-8') as f:
    for row in f.readlines():
        real_labels.append(row.strip())
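
To back up the claim that the classes are balanced, a quick look at the label distribution is enough (a small check I added, using the DataFrames loaded above):

# Count samples per label in the training set; each of the 10 classes should appear about 18,000 times
print(train_df['label'].value_counts().sort_index())
# Split sizes; expected 180000 / 20000 / 20000 according to the description above
print(len(train_df), len(dev_df), len(test_df))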

Analyzing the Data

News titles range from roughly 5 to 35 characters in length.

import matplotlib.pyplot as plt
# Use SimHei globally, a font that supports Chinese, so the plot labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
# Prevent the minus sign '-' from rendering as a box in saved figures
plt.rcParams['axes.unicode_minus'] = False 

# Compute title lengths and count their distribution
length_counts = train_df['text'].apply(len).value_counts().sort_index()  

# Plot the histogram
plt.hist(length_counts.index, bins=len(length_counts), weights=length_counts.values)  
plt.xlabel('文本长度')  
plt.ylabel('频数')  
plt.title('字符串长度分布直方图')  
plt.show()

[Figure: histogram of news-title length distribution]
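
Beyond the histogram, a short numeric summary (my addition) confirms that 35 is a safe maximum sequence length for this corpus:

lengths = train_df['text'].apply(len)
print(lengths.min(), lengths.max())         # roughly 5 and 35, matching the analysis above
print(lengths.quantile([0.5, 0.95, 0.99]))  # median and tail percentiles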

Downloading the Pre-trained Model

Go to HuggingFace, download the pre-trained model files, and place them in a dedicated folder so they can be reused.

[Screenshot: bert-base-chinese files on HuggingFace]

# Clicking through to download each file is tedious, and huggingface.co is hard to reach from mainland China, so a more convenient download method is noted here.
# from huggingface_hub import snapshot_download
# snapshot_download(repo_id="bert-base-chinese", 
#                   ignore_patterns=["*.msgpack", "*.h5", "*.safetensors"], 
#                   local_dir="/home/yzw/plm", local_dir_use_symlinks=False)
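
If huggingface.co is not reachable, one workaround (an assumption on my part, not part of the original post) is to point huggingface_hub at a mirror through the HF_ENDPOINT environment variable before importing it; the mirror URL below is only an example:

import os
# Must be set before huggingface_hub is imported; hf-mirror.com is one community mirror
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

from huggingface_hub import snapshot_download
# Download into the folder that BERT_PATH points to later in this post
snapshot_download(repo_id="bert-base-chinese",
                  ignore_patterns=["*.msgpack", "*.h5", "*.safetensors"],
                  local_dir="/home/yzw/plm/bert-base-chinese")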

Building the Dataset

Call BERT's tokenizer to tokenize the text; it returns a dictionary containing input_ids, token_type_ids, and attention_mask.

# Path to the downloaded pre-trained model files
BERT_PATH = '/home/yzw/plm/bert-base-chinese'

from transformers import BertTokenizer
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

example_text = '我爱北京天安门。'
bert_input = tokenizer(example_text,padding='max_length', 
                       max_length = 10, 
                       truncation=True,
                       return_tensors="pt") # pt表示返回tensor
print(bert_input)
# {'input_ids': tensor([[ 101, 2769, 4263, 1266,  776, 1921, 2128, 7305,  511,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
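
Mapping the ids back to tokens (a quick check I added) makes the special [CLS] and [SEP] tokens visible:

print(tokenizer.convert_ids_to_tokens(bert_input['input_ids'][0].tolist()))
# Expected: ['[CLS]', '我', '爱', '北', '京', '天', '安', '门', '。', '[SEP]']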

Subclass Dataset and implement the __init__, __getitem__, and __len__ methods so the data can be iterated over during training.

from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, df):
        # The tokenizer output (a dict of tensors) can be collated into batches automatically
        self.texts = [tokenizer(text, 
                                padding='max_length',  # pad to max_length
                                max_length = 35, 	# from the data analysis above, the maximum length is 35
                                truncation=True,
                                return_tensors="pt") 
                      for text in df['text']]
        # The DataLoader's default collate_fn turns these integer labels into a tensor
        self.labels = [label for label in df['label']]

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)

# Tokenizing everything up front takes a while, roughly 40 s
train_dataset = MyDataset(train_df)
dev_dataset = MyDataset(dev_df)
test_dataset = MyDataset(test_df)
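
Because each tokenizer call above returns tensors of shape [1, 35], the default collate function stacks them into [batch, 1, 35]. A quick peek at one batch (my addition) shows the shapes and explains the squeeze(1) used in the training loop later:

sample_loader = DataLoader(train_dataset, batch_size=4)
batch_inputs, batch_labels = next(iter(sample_loader))
print(batch_inputs['input_ids'].shape)       # torch.Size([4, 1, 35])
print(batch_inputs['attention_mask'].shape)  # torch.Size([4, 1, 35])
print(batch_labels.shape)                    # torch.Size([4])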

Building the Model

The model structure is simple: take BERT's [CLS] output (pooled_output), pass it through a dropout layer that randomly drops some activations, feed it into a linear classification layer, and finally apply a ReLU activation to obtain the class scores. Whether this ReLU is actually useful is discussed later.

from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 10)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer
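
As a sanity check (not in the original post), a dummy forward pass with the bert_input from the tokenizer example should yield one row of 10 class scores:

import torch

model = BertClassifier()
with torch.no_grad():
    logits = model(bert_input['input_ids'], bert_input['attention_mask'])
print(logits.shape)  # torch.Size([1, 10]): one example, 10 classes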

Training the Model

import torch
from torch.optim import Adam
from tqdm import tqdm
import numpy as np
import random
import os

# Training hyperparameters
epoch = 5
batch_size = 64
lr = 1e-5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
random_seed = 1999
save_path = './bert_checkpoint'

def setup_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
setup_seed(random_seed)

def save_model(save_name):
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    torch.save(model.state_dict(), os.path.join(save_path, save_name))

    
# Instantiate the model
model = BertClassifier()
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)
model = model.to(device)
criterion = criterion.to(device)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)


# Training loop
best_dev_acc = 0
for epoch_num in range(epoch):
    total_acc_train = 0
    total_loss_train = 0
    for inputs, labels in tqdm(train_loader):
        input_ids = inputs['input_ids'].squeeze(1).to(device) # [batch_size, 35]
        masks = inputs['attention_mask'].to(device) # [batch_size, 1, 35]
        labels = labels.to(device)
        output = model(input_ids, masks)

        batch_loss = criterion(output, labels)
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        acc = (output.argmax(dim=1) == labels).sum().item()
        total_acc_train += acc
        total_loss_train += batch_loss.item()

    # ----------- Validation -----------
    model.eval()
    total_acc_val = 0
    total_loss_val = 0
    # No gradient computation needed
    with torch.no_grad():
        # Iterate over the dev set and evaluate the current model
        for inputs, labels in dev_loader:
            input_ids = inputs['input_ids'].squeeze(1).to(device) # [batch_size, 35]
            masks = inputs['attention_mask'].to(device) # [batch_size, 1, 35]
            labels = labels.to(device)
            output = model(input_ids, masks)

            batch_loss = criterion(output, labels)
            acc = (output.argmax(dim=1) == labels).sum().item()
            total_acc_val += acc
            total_loss_val += batch_loss.item()
        
        print(f'''Epochs: {epoch_num + 1} 
          | Train Loss: {total_loss_train / len(train_dataset): .3f} 
          | Train Accuracy: {total_acc_train / len(train_dataset): .3f} 
          | Val Loss: {total_loss_val / len(dev_dataset): .3f} 
          | Val Accuracy: {total_acc_val / len(dev_dataset): .3f}''')
        
        # Save the best model so far
        if total_acc_val / len(dev_dataset) > best_dev_acc:
            best_dev_acc = total_acc_val / len(dev_dataset)
            save_model('best.pt')
        
    model.train()

# Save the final model so training can be resumed
save_model('last.pt')
# TODO: also save the optimizer state
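
For the TODO above, a minimal sketch (my suggestion, with a hypothetical file name) that saves both the model and optimizer state so training can resume exactly where it stopped:

checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch_num,
}
torch.save(checkpoint, os.path.join(save_path, 'last_full.pt'))

# To resume: rebuild the model and optimizer as above, then
# ckpt = torch.load(os.path.join(save_path, 'last_full.pt'))
# model.load_state_dict(ckpt['model_state_dict'])
# optimizer.load_state_dict(ckpt['optimizer_state_dict'])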

With batch_size set to 64, training uses about 4 GB of GPU memory. Training runs for 5 epochs at roughly 4 minutes per epoch; dev-set accuracy starts to drop after the 4th epoch. The training log is shown below:

[Screenshot: training log over 5 epochs]

Evaluating the Model

# Load the best checkpoint
model = BertClassifier()
model.load_state_dict(torch.load(os.path.join(save_path, 'best.pt')))
model = model.to(device)
model.eval()

def evaluate(model, dataset):
    model.eval()
    test_loader = DataLoader(dataset, batch_size=128)
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_loader:
            input_id = test_input['input_ids'].squeeze(1).to(device)
            mask = test_input['attention_mask'].to(device)
            test_label = test_label.to(device)
            output = model(input_id, mask)
            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc   
    print(f'Test Accuracy: {total_acc_test / len(dataset): .3f}')
    
evaluate(model, test_dataset)

The model that performs best on the dev set reaches an accuracy of 0.943 on the test set.
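
Accuracy alone hides per-class behaviour. A sketch using scikit-learn's classification_report (my addition, assuming scikit-learn is available) prints precision, recall, and F1 for each of the 10 classes:

from sklearn.metrics import classification_report

all_preds, all_labels = [], []
loader = DataLoader(test_dataset, batch_size=128)
with torch.no_grad():
    for test_input, test_label in loader:
        input_id = test_input['input_ids'].squeeze(1).to(device)
        mask = test_input['attention_mask'].to(device)
        output = model(input_id, mask)
        all_preds.extend(output.argmax(dim=1).cpu().tolist())
        all_labels.extend(test_label.tolist())

print(classification_report(all_labels, all_preds, target_names=real_labels))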

Interactive Inference

while True:
    text = input('新闻标题:')
    bert_input = tokenizer(text, padding='max_length', 
                            max_length = 35, 
                            truncation=True,
                            return_tensors="pt")
    input_ids = bert_input['input_ids'].to(device)
    masks = bert_input['attention_mask'].unsqueeze(1).to(device)
    output = model(input_ids, masks)
    pred = output.argmax(dim=1)
    print(real_labels[pred])
# Sample run
新闻标题: 男子炫耀市中心养烈性犬?警方介入
society
新闻标题: 北京大学召开校运动会
education
新闻标题: 深度学习大会在北京召开
science
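
To also report how confident a prediction is, the logits can be passed through softmax; this snippet (my addition) could replace the argmax lines inside the loop above. Note that with the ReLU head the resulting probabilities are only indicative:

import torch.nn.functional as F

with torch.no_grad():
    output = model(input_ids, masks)
probs = F.softmax(output, dim=1)
conf, pred = probs.max(dim=1)
print(f'{real_labels[pred.item()]} (confidence {conf.item():.3f})')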

Discussion

  1. Is a ReLU activation needed after the classification linear layer?

    A ReLU gives the model non-linear capacity, but BERT already contains plenty of non-linear activations, so adding another one seems unnecessary. Moreover, the classification linear layer feeds straight into the loss computation; applying a ReLU there changes the output distribution by forcing negative logits to zero. Could that hurt the model's ability to learn?

    Experiments show that removing the ReLU leaves training speed essentially unchanged; the dev-set metrics start to drop after 3 epochs, and the final dev accuracy is 0.944, a small improvement. So there really is no need for a ReLU after the classification linear layer. A variant of the classifier without the ReLU is sketched below.
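
For reference, the no-ReLU variant mentioned above only changes the classifier head; a minimal sketch:

class BertClassifierNoReLU(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_PATH)
        self.dropout = nn.Dropout(0.5)
        self.linear = nn.Linear(768, 10)

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # Raw logits go straight into CrossEntropyLoss; no extra activation
        return self.linear(self.dropout(pooled_output))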
