Fudan NLP Lab nlp-beginner Task 2: Text Classification Based on Deep Learning

Task 2: Text Classification Based on Deep Learning

Get familiar with PyTorch, reimplement Task 1 in PyTorch, and build CNN- and RNN-based text classifiers.

  1. References

    1. https://pytorch.org/
    2. Convolutional Neural Networks for Sentence Classification https://arxiv.org/abs/1408.5882
    3. https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
  2. Initialization via word embeddings

  3. Random embedding initialization

  4. Initialization with pretrained GloVe embeddings https://nlp.stanford.edu/projects/glove/ (a minimal sketch of both initialization options follows this list)

  5. Knowledge points:

    1. Feature extraction with CNN/RNN
    2. Word embeddings
    3. Dropout
  6. Time: two weeks
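
Below is a minimal sketch of the two initialization options above (random vs. pretrained GloVe). The GloVe file path, the cache directory, and the toy vocabulary are placeholders for illustration only; the actual script later builds the aligned matrix with TEXT.build_vocab(..., vectors=...) and reads it from TEXT.vocab.vectors.

import torch
import torch.nn as nn
from torchtext.vocab import Vectors

vocab_size, embed_size = 10000, 200        # illustrative sizes

# Option 1: random initialization. nn.Embedding draws its weights from N(0, 1) by default.
embed_random = nn.Embedding(vocab_size, embed_size)

# Option 2: GloVe initialization. Load the pretrained vectors, build a weight matrix
# aligned with the vocabulary, and wrap it with the from_pretrained classmethod.
# freeze=True keeps the vectors fixed; freeze=False fine-tunes them during training.
glove = Vectors('glove.6B.200d.txt', cache='embedding')        # local file, path is a placeholder
toy_vocab = ['<unk>', '<pad>', 'the', 'movie', 'was', 'great'] # stand-in for the real vocabulary
weights = torch.stack([glove[token] for token in toy_vocab])   # (6, 200); zeros for OOV tokens
embed_glove = nn.Embedding.from_pretrained(weights, freeze=False)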

main

Dataset loading and preprocessing are all handled in the main script below.
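
For reference, the raw files follow the PhraseId / SentenceId / Phrase / Sentiment TSV layout that the code below relies on (five sentiment labels, 0 to 4; test.tsv has no Sentiment column). A quick peek, assuming the files sit under data/ as in the script:

import pandas as pd

train_df = pd.read_csv('data/train.tsv', sep='\t')
test_df = pd.read_csv('data/test.tsv', sep='\t')

print(train_df.columns.tolist())             # ['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
print(train_df['Sentiment'].value_counts())  # five classes, labels 0-4
print(test_df.columns.tolist())              # ['PhraseId', 'SentenceId', 'Phrase']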

import torch
import torch.nn as nn
from tqdm import tqdm, trange  # tqdm displays progress bars for loops
from torch.optim import Adam
from tensorboardX import SummaryWriter
import pandas as pd
import os
from torchtext.legacy import data
from torchtext.legacy.data import Iterator, BucketIterator
from torchtext.vocab import Vectors
import matplotlib.pyplot as plt
import numpy as np

from Model import RNN, CNN, LSTM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_epochs = 5
batch_size = 512
learning_rate = 0.001
max_seq_length = 48
num_classes = 5
dropout_rate = 0.1
data_path = "data"
clip = 5

embed_size = 200
vectors = Vectors('glove.6B.200d.txt', 'C:/Users/Mechrevo/Desktop/AI/nlp-beginner/code-for-nlp-beginner-master/Task2-Text Classification (RNN&CNN)/embedding')
freeze = False

use_rnn = True
hidden_size = 256
num_layers = 1
bidirectional = True

use_lstm = False
num_filters = 200
kernel_sizes = [2, 3, 4]

def load_iters(batch_size=32, device="cpu", data_path='data', vectors=None):
    TEXT = data.Field(lower=True, batch_first=True, include_lengths=True)
    LABEL = data.LabelField(batch_first=True)
    train_fields = [(None, None), (None, None), ('text', TEXT), ('label', LABEL)]
    test_fields = [(None, None), (None, None), ('text', TEXT)]

    train_data = data.TabularDataset.splits(
        path=data_path,
        train='train.tsv',
        format='tsv',
        fields=train_fields,
        skip_header=True
    )[0]

    test_data = data.TabularDataset.splits(
        path=data_path,
        train='test.tsv',  # the splits() `train` slot is reused here to load the single test file
        format='tsv',
        fields=test_fields,
        skip_header=True
    )[0]
    TEXT.build_vocab(train_data.text, vectors=vectors)
    LABEL.build_vocab(train_data.label)
    train_data, dev_data = train_data.split([0.8, 0.2])

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True
    )

    test_iter = Iterator(
        test_data,
        batch_size=batch_size,
        device=device,
        sort=False,
        sort_within_batch=False,
        repeat=False,
        shuffle=False
    )
    return train_iter, dev_iter, test_iter, TEXT, LABEL

if __name__ == "__main__":
    train_iter, dev_iter, test_iter, TEXT, LABEL = load_iters(batch_size, device, data_path, vectors)
    vocab_size = len(TEXT.vocab.itos)
    # build model
    if use_lstm:
        model = LSTM(vocab_size, embed_size, hidden_size, num_layers, num_classes, bidirectional, dropout_rate)
    elif use_rnn:
        model = RNN(vocab_size, embed_size, hidden_size, num_layers, num_classes, bidirectional, dropout_rate)
    else:
        model = CNN(vocab_size, embed_size, num_classes, num_filters, kernel_sizes, dropout_rate)
    if vectors is not None:
        # from_pretrained is a classmethod that returns a new layer, so assign it back;
        # calling it on model.embed would silently leave the random weights in place.
        model.embed = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=freeze)
    model.to(device)

    optimizer = Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.CrossEntropyLoss()
    writer = SummaryWriter('logs', comment="rnn")
    loss_history = []
    for epoch in trange(train_epochs, desc="Epoch"):
        model.train()
        ep_loss = 0
        for step, batch in enumerate(tqdm(train_iter, desc="Iteration")):
            (inputs, lens), labels = batch.text, batch.label
            outputs = model(inputs, lens)
            loss = loss_func(outputs, labels)
            ep_loss += loss.item()

            model.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            writer.add_scalar('Train_Loss', loss.item(), epoch * len(train_iter) + step)
            if step % 10 == 0:
                loss_history.append(loss.item())
                tqdm.write('Epoch {}, Step {}, Loss {}'.format(epoch, step, loss.item()))

        # evaluating
        model.eval()
        with torch.no_grad():
            corr_num = 0
            err_num = 0
            for batch in dev_iter:
                (inputs, lens), labels = batch.text, batch.label
                outputs = model(inputs, lens)
                corr_num += (outputs.argmax(1) == labels).sum().item()
                err_num += (outputs.argmax(1) != labels).sum().item()
            tqdm.write('Epoch {}, Accuracy {}'.format(epoch, corr_num / (corr_num + err_num)))
    if use_lstm:
        plt.title('LSTM Model')
    elif use_rnn:
        plt.title('RNN Model')
    else:
        plt.title('CNN Model')
    plt.plot(np.arange(len(loss_history)), np.array(loss_history))
    plt.xlabel('Iterations')
    plt.ylabel('Training Loss')
    plt.show()
    # predicting
    model.eval()
    with torch.no_grad():
        predicts = []
        for batch in test_iter:
            inputs, lens = batch.text
            outputs = model(inputs, lens)
            predicts.extend(outputs.argmax(1).cpu().numpy())
        test_data = pd.read_csv(os.path.join(data_path, 'test.tsv'), sep='\t')
        test_data["Sentiment"] = predicts
        test_data[['PhraseId', 'Sentiment']].set_index('PhraseId').to_csv('result.csv')

model

Models: LSTM, RNN, CNN

import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes,
                 bidirectional=True, dropout_rate=0.3):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.bidirectional = bidirectional
        if not bidirectional:
            self.fc = nn.Linear(hidden_size, num_classes)
        else:
            self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)
        self.init()

    def init(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, lens):
        embeddings = self.embed(x)
        output, _ = self.rnn(embeddings)

        # take the output at each sequence's last real (non-padded) time step
        real_output = output[range(len(lens)), lens - 1]
        out = self.fc(self.dropout(real_output))
        return out
class LSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, num_classes,
                 bidirectional=True, dropout_rate=0.3):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
        self.bidirectional = bidirectional
        if not bidirectional:
            self.fc = nn.Linear(hidden_size, num_classes)
        else:
            self.fc = nn.Linear(hidden_size * 2, num_classes)
        self.dropout = nn.Dropout(dropout_rate)
        self.init()

    def init(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x, lens):
        embeddings = self.embed(x)
        output, _ = self.rnn(embeddings)

        real_output = output[range(len(lens)), lens - 1]
        out = self.fc(self.dropout(real_output))
        return out
class CNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes, num_filters=100, kernel_size=(2, 3, 4), dropout_rate=0.3):
        super(CNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (k, embed_size), padding=(k - 1, 0))
            for k in kernel_size
        ])
        self.fc = nn.Linear(len(kernel_size) * num_filters, num_classes)
        self.dropout = nn.Dropout(dropout_rate)

    def conv_and_pool(self, x, conv):
        x = F.relu(conv(x).squeeze(3))
        x_max = F.max_pool1d(x, x.size(2)).squeeze(2)
        return x_max

    def forward(self, x, lens):  # lens is unused; kept so CNN shares the RNN/LSTM call signature
        embed = self.embed(x).unsqueeze(1)  # (batch, 1, seq_len, embed_size) for Conv2d

        conv_results = [self.conv_and_pool(embed, conv) for conv in self.convs]

        out = torch.cat(conv_results, 1)
        return self.fc(self.dropout(out))
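
As a quick sanity check of the three models' input/output shapes, here is a small sketch; the random token indices and descending lengths imitate what the BucketIterator yields, and the hyperparameters mirror the ones set in main:

import torch
from Model import RNN, CNN, LSTM

vocab_size = 1000
x = torch.randint(0, vocab_size, (4, 12))   # a fake batch: 4 sequences of 12 token ids
lens = torch.tensor([12, 10, 7, 5])         # lengths sorted descending, as in the iterators

rnn = RNN(vocab_size, embed_size=200, hidden_size=256, num_layers=1,
          num_classes=5, bidirectional=True, dropout_rate=0.1)
lstm = LSTM(vocab_size, embed_size=200, hidden_size=256, num_layers=1,
            num_classes=5, bidirectional=True, dropout_rate=0.1)
cnn = CNN(vocab_size, 200, 5, num_filters=200, kernel_size=[2, 3, 4], dropout_rate=0.1)

for model in (rnn, lstm, cnn):
    print(type(model).__name__, model(x, lens).shape)   # each prints torch.Size([4, 5])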

I also rewrote the task myself with a Hugging Face BERT model:

import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, get_scheduler
from datasets import load_dataset, load_metric
from tqdm.auto import tqdm
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"  # work around duplicate OpenMP runtimes (common on Windows)
import matplotlib.pyplot as plt

def tokenize_function(example):
    return tokenizer(example["Phrase"], truncation=True)

def compute_metrics(eval_preds):
    metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

if __name__ == "__main__":
    data_files = {"train": "data/train.tsv",
                  "validation": "data/validation.tsv",
                  "test": "data/test.tsv"}
    data = load_dataset("csv", data_files=data_files, delimiter="\t")
    checkpoint = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)
    tokenized_datasets = data.map(tokenize_function, batched=True)

    tokenized_datasets = tokenized_datasets.remove_columns(["PhraseId", "SentenceId"])
    tokenized_datasets = tokenized_datasets.rename_column("Sentiment", "labels")
    tokenized_datasets = tokenized_datasets.remove_columns(["Phrase"])
    tokenized_datasets.set_format("torch")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    training_args = TrainingArguments("test-trainer")

    train_dataloader = DataLoader(
        tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
    )
    eval_dataloader = DataLoader(
        tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator
    )

    optimizer = AdamW(model.parameters(), lr=0.0001)

    num_epochs = 1
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    print(num_training_steps)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    progress_bar = tqdm(range(num_training_steps))

    model.train()  # from_pretrained returns the model in eval mode, so enable dropout explicitly for training
    loss_list = []
    for epoch in range(num_epochs):
        for idx, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            if idx % 100 == 0:
                loss_list.append(loss.item())
                tqdm.write('step: {}, loss: {}'.format(idx, loss.item()))
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    plt.plot(np.arange(len(loss_list)), np.array(loss_list))
    plt.xlabel('Iterations')
    plt.ylabel('Training Loss')
    plt.title('distilled-bert-uncased')
    plt.show()

    metric = load_metric("accuracy")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    print(metric.compute())  # report validation accuracy
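
Note that the Trainer constructed in the script is never actually used; training is done by the manual loop. If the Trainer API were used instead, a rough equivalent (reusing the objects already defined above, including the compute_metrics function) would be:

trainer = Trainer(
    model,
    training_args,                                  # TrainingArguments("test-trainer") from above
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,                # accuracy, as defined near the top of the script
)
trainer.train()                                     # handles batching, optimization and scheduling
print(trainer.evaluate())                           # reports eval_loss and eval_accuracy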

Training results

LSTM model training results:
(figure: LSTM training loss curve)

RNN model training results:
(figure: RNN training loss curve)
The BERT results are, as you can see, quite poor; the loss fails to converge.
(figure: DistilBERT training loss curve)

Summary

LSTM and RNN do not differ much on this dataset... but the RNN loss shows some abnormal spikes, probably because of dirty data (the dataset is very noisy and of poor quality).

The plan was then to try models that are more widely used today, such as Transformer/BERT, hoping for some improvement (BERT has since been tried with no gain; after looking at the data, the failure to converge seems to come from the excessive noise in the dataset).

The dataset is small, so this is just for fun.

Still, this was a good walkthrough of loading data with torchtext and building simple (trivial) neural networks.

PyTorch is the GOAT!

Hugging Face is the GOAT!
