BERT文本分类——基于simplifyweibo_4_moods微博数据集

MatpyMaster

已于 2023-11-10 17:00:47 修改

阅读量3k

点赞数 6

分类专栏： NLP自然语言处理文章标签： bert 人工智能深度学习自然语言处理分类

于 2023-11-10 16:51:46 首次发布

本文链接：https://blog.csdn.net/qq_41301570/article/details/134336747

版权

NLP自然语言处理专栏收录该内容

35 篇文章

订阅专栏

在上一篇中我们基于今日头条新闻数据集构建了分类模型，取得了较好的结果。今天我们基于simplifyweibo_4_moods数据集进行文本分类。篇幅有限，完整代码可在文末获取。

一.simplifyweibo_4_moods数据集

36 万多条，带情感标注新浪微博，包含 4 种情感，其中喜悦约 20 万条，愤怒、厌恶、低落各约 5 万条。

二.代码实现

1.下载预训练模型

bert-chinese：https://huggingface.co/bert-base-chinese

simplifyweibo_4_moods数据集：simplifyweibo_4_moods

2. 概述Tokenizer

先介绍下BERT Tokenizer中的max_length、padding和truncation参数的工作原理。

（1）padding

用于指定填充的方式，可选以下几种取值：

max_length：将序列填充到max_length参数指定的长度（未指定max_length时，填充到模型可接受的最大输入长度）。不足max_length长度的序列会被填充；超过max_length长度的序列是否截断，由truncation参数决定。

longest：填充后的序列长度将与当前批次中最长的序列长度一致。较短的序列会被填充到该长度，不会进行截断。

do_not_pad：不进行填充操作

（2）max_length

用于指定切分后的文本序列的最大长度。如果输入文本的长度超过了max_length，则会进行截断（truncation）以确保序列的长度不超过max_length。如果输入文本的长度不足max_length，将会进行填充（padding）以使序列长度一致。

（3）truncation参数

truncation参数用于指定是否进行截断，默认为False。当truncation参数设置为True时，如果输入文本长度超过了max_length，将会进行截断操作；当truncation参数设置为False时，不会进行截断，超过max_length的序列会保持原长度，若其长度超过模型允许的最大输入长度，送入模型时可能会出错。

3. 寻找最大长度

# simplifyweibo_4_moods: cumulative distribution of review lengths
import matplotlib.pyplot as plt
import pandas as pd

# Newer Matplotlib releases ship the seaborn look only under the
# 'seaborn-v0_8' name, so fall back to it when 'seaborn' is not found.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')

df = pd.read_csv('simplifyweibo_4_moods.csv')
text = df['review']
# Length here is the number of space-separated pieces in each review.
lengths = [len(line.split(' ')) for line in text]
# Keep at least the original 16 buckets, but grow the histogram when a review
# is longer so indexing can never run past the end of the list.
num_buckets = max(16, max(lengths, default=0) + 1)
x = range(num_buckets)
y = [0 for _ in range(num_buckets)]
for length in lengths:
    y[length] += 1
# Turn the per-length counts into a running (cumulative) total.
for i in range(1, num_buckets):
    y[i] += y[i - 1]
fig = plt.figure(figsize=(15, 9))
plt.bar(x, y, label='simplifyweibo_4_moods')
plt.legend(loc="upper left", fontsize=25)
plt.xlabel('Length', fontsize=25)
plt.show()

可以看到所有文本都在16以内，故最大长度我们设置为16即可，这样可以加快训练速度。

4. 训练代码

import torch
import numpy as np
from transformers import BertTokenizer
import pandas as pd
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm

df = pd.read_csv('simplifyweibo_4_moods.csv')
np.random.seed(112)
# Shuffle once with a fixed seed, then cut at 80% and 90% to get the
# train / validation / test sets (80:10:10).
# Positional slicing replaces np.split here because np.split on a DataFrame
# goes through DataFrame.swapaxes, which recent pandas releases deprecate.
shuffled = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8 * len(df)), int(.9 * len(df))
df_train = shuffled.iloc[:train_end]
df_val = shuffled.iloc[train_end:val_end]
df_test = shuffled.iloc[val_end:]

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

class Dataset(torch.utils.data.Dataset):
    """Dataset pairing each tokenized review with its label.

    Reads the ``label`` and ``review`` columns of *df* and tokenizes every
    review up front with the module-level ``tokenizer``.
    """

    def __init__(self, df):
        self.labels = np.array(df['label'])
        encoded = []
        for review in df['review']:
            # Pad or truncate every review to 16 tokens and ask for
            # PyTorch tensors back.
            encoded.append(
                tokenizer(
                    review,
                    padding='max_length',
                    max_length=16,
                    truncation=True,
                    return_tensors="pt",
                )
            )
        self.texts = encoded

    def classes(self):
        """Return the full array of labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) at *idx* wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at *idx*."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(tokenizer output, label array)`` for *idx*."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

# Build the model
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear layer and ReLU.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Number of output classes; defaults to the 4 moods of
            the simplifyweibo_4_moods dataset.
    """

    def __init__(self, dropout=0.5, num_classes=4):
        super().__init__()
        # Keep num_labels in step with the size of the linear head below.
        self.bert = BertModel.from_pretrained('bert-base-chinese', num_labels=num_classes)
        self.dropout = nn.Dropout(dropout)
        # Read the input width from the loaded config rather than hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one score per class for every sequence in the batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss is
        # computed; it is kept so this class matches the copy used by the
        # inference script. Consider dropping it in both places together.
        final_layer = self.relu(linear_output)
        return final_layer


# Train the model
def train(model, train_data, val_data, learning_rate, epochs, batch_size):
    """Fine-tune *model* and print train/validation metrics after every epoch.

    Args:
        model: The classifier to optimise; called as ``model(input_id, mask)``.
        train_data: DataFrame wrapped in ``Dataset`` for training.
        val_data: DataFrame wrapped in ``Dataset`` for validation.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of passes over the training set.
        batch_size: Batch size for both data loaders.
    """
    # Wrap both frames in Dataset; only the training loader is shuffled.
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size)
    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        # Enable dropout for the training pass.
        model.train()
        total_acc_train = 0
        total_loss_train = 0
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # The tokenizer output carries a leading dimension of size 1 per
            # sample; drop it from the mask as well as from the ids.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            batch_loss = criterion(output, train_label.long())
            # criterion returns the batch mean; weight it by the batch size so
            # dividing by the sample count below yields a per-sample mean.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()
            # Parameter update.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # ------ validation ------
        # Disable dropout so validation metrics are deterministic.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                batch_loss = criterion(output, val_label.long())
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()
        print(
            f'''Epochs: {epoch_num + 1} 
              | Train Loss: {total_loss_train / len(train_data): .3f} 
              | Train Accuracy: {total_acc_train / len(train_data): .3f} 
              | Val Loss: {total_loss_val / len(val_data): .3f} 
              | Val Accuracy: {total_acc_val / len(val_data): .3f}''')

# Hyperparameters for the fine-tuning run.
EPOCHS = 10  # number of training epochs
LR = 1e-6  # learning rate
Batch_Size = 16  # choose a value that suits your GPU
model = BertClassifier()  # the classifier defined above
train(model, df_train, df_val, learning_rate=LR, epochs=EPOCHS, batch_size=Batch_Size)
# Save the model's state dict to disk.
torch.save(model.state_dict(), 'BERT-weibo.pt')

# Evaluate the model
def evaluate(model, test_data, batch_size):
    """Print and return the accuracy of *model* on *test_data*.

    Args:
        model: The classifier to score; called as ``model(input_id, mask)``.
        test_data: DataFrame wrapped in ``Dataset`` for testing.
        batch_size: Batch size for the test data loader.

    Returns:
        The fraction of test samples whose arg-max prediction equals the label.
    """
    test_set = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Dropout must be off while scoring; remember the caller's mode so it can
    # be restored afterwards.
    was_training = model.training
    model.eval()
    total_acc_test = 0
    try:
        with torch.no_grad():
            for test_input, test_label in test_dataloader:
                test_label = test_label.to(device)
                # Drop the per-sample leading dimension of size 1 from both
                # the mask and the ids.
                mask = test_input['attention_mask'].squeeze(1).to(device)
                input_id = test_input['input_ids'].squeeze(1).to(device)
                output = model(input_id, mask)
                total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
    finally:
        model.train(was_training)
    accuracy = total_acc_test / len(test_data)
    print(f'Test Accuracy: {accuracy: .3f}')
    return accuracy
# Score the model on the held-out test split.
evaluate(model, df_test, batch_size=Batch_Size)

5. 测试代码

import torch
from transformers import BertTokenizer
from torch import nn
from transformers import BertModel

def get_label_string(label):
    """Translate a numeric class id into its mood name.

    Returns the name whose id equals *label*, or ``None`` when no id matches.
    """
    mood_ids = {'喜悦': 0,
                '愤怒': 1,
                '厌恶': 2,
                '低落': 3
                }
    # Lazily scan the mapping and stop at the first id equal to the label.
    matches = (name for name, mood_id in mood_ids.items() if mood_id == label)
    return next(matches, None)

# Build the model
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a 768-to-4 linear layer and ReLU.

    Args:
        dropout: Dropout probability applied before the linear layer.
    """

    def __init__(self, dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese', num_labels=4)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 4)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated 4-way scores for the given batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        bert_outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        # Only the second element of BERT's two-element tuple is used.
        _unused, pooled = bert_outputs
        scores = self.linear(self.dropout(pooled))
        return self.relu(scores)
model = BertClassifier()
# map_location='cpu' lets a checkpoint saved from a GPU run load on a machine
# without CUDA; the model itself stays on the CPU in this script.
model.load_state_dict(torch.load('BERT-weibo.pt', map_location='cpu'))
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
text = '元旦快乐！'
text_input = tokenizer(text, padding='max_length', max_length=16, truncation=True, return_tensors="pt")
mask = text_input['attention_mask']
input_id = text_input['input_ids']
# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(input_id, mask)
# Index of the highest score is the predicted class id.
output = output.argmax(dim=1)
output = output.item()
label_string = get_label_string(output)
print(label_string)