Implementing TextCNN Classification in Code

This example shows how to build a text convolutional neural network (TextCNN) in Python with PyTorch: reading the data, building a character vocabulary, defining a Dataset class, and training and evaluating the model. The model applies several parallel convolution and max-pooling layers to extract features from the text for classification.

Imports:

import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'  # work around the duplicate-OpenMP-library error on macOS
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm  # progress bar

Reading the data:

def read_data(train_or_test, num=None):
    '''
    :param train_or_test: which split to load ("train" or "dev")
    :param num: number of samples to use (None means all)
    :return: texts and their corresponding labels
    '''
    with open(os.path.join("data", train_or_test + ".txt"), encoding="utf-8") as f:
        all_data = f.read().split("\n")  # one sample per line

    texts = []
    labels = []
    for data in all_data:
        if data:
            text, label = data.split("\t")
            texts.append(text)
            labels.append(label)
    if num is None:
        return texts, labels
    else:
        return texts[:num], labels[:num]
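
The parsing above implies one tab-separated text/label pair per line, with integer class ids as labels (TextDataset later casts them with int()). A quick check on hypothetical data:

# data/train.txt (hypothetical contents):
#   今天天气真好	0
#   股市大幅下跌	1
train_text, train_label = read_data("train", num=2)
print(train_text)   # ['今天天气真好', '股市大幅下跌']
print(train_label)  # ['0', '1']  -- labels stay strings at this point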

Building the character vocabulary and returning an embedding layer:

def built_curpus(train_texts, embedding_num):
    # character-level vocabulary; index 0 is reserved for <PAD>, index 1 for <UNK>
    word_2_index = {"<PAD>": 0, "<UNK>": 1}
    for text in train_texts:
        for word in text:  # iterates characters, so this is a character vocabulary
            word_2_index[word] = word_2_index.get(word, len(word_2_index))
    return word_2_index, nn.Embedding(len(word_2_index), embedding_num)
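
A quick sanity check on two hypothetical strings; each previously unseen character gets the next free index:

word_2_index, emb = built_curpus(["你好", "好吗"], embedding_num=50)
print(word_2_index)      # {'<PAD>': 0, '<UNK>': 1, '你': 2, '好': 3, '吗': 4}
print(emb.weight.shape)  # torch.Size([5, 50])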

Dataset:

class TextDataset(Dataset):
    def __init__(self, all_text, all_label, word_2_index, max_len):
        self.all_text = all_text
        self.all_label = all_label
        self.word_2_index = word_2_index
        self.max_len = max_len

    def __getitem__(self, index):
        text = self.all_text[index][:self.max_len]
        label = int(self.all_label[index])

        text_idx = [self.word_2_index.get(i, 1) for i in text]  # unseen characters map to <UNK> (index 1)
        text_idx = text_idx + [0] * (self.max_len - len(text_idx))  # right-pad with <PAD> (index 0)
        text_idx = torch.tensor(text_idx).unsqueeze(dim=0)  # list -> tensor of shape 1 * max_len; the leading 1 becomes Conv2d's input channel

        return text_idx, label
    
    def __len__(self):
        return len(self.all_text)
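
A minimal shape check, reusing the toy vocabulary from above (hypothetical data):

ds = TextDataset(["你好吗"], ["0"], word_2_index, max_len=5)
text_idx, label = ds[0]
print(text_idx)        # tensor([[2, 3, 4, 0, 0]])
print(text_idx.shape)  # torch.Size([1, 5]); the DataLoader stacks these into [batch_size, 1, max_len]
print(label)           # 0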

Building the CNN block:

class Block(nn.Module):
    def __init__(self, kernel_s, embeddin_num, max_len, hidden_num):
        '''
        :param kernel_s: convolution kernel height (the n-gram size)
        :param embeddin_num: embedding dimension (the kernel spans it fully)
        :param max_len: padded sequence length
        :param hidden_num: number of output channels
        '''
        super().__init__()
        self.cnn = nn.Conv2d(in_channels=1, out_channels=hidden_num, kernel_size=(kernel_s, embeddin_num))  # input e.g. 1 * 1 * 7 * 5: [batch_size, input_channels, seq_len, embeddin_num]
        self.act = nn.ReLU()
        self.mxp = nn.MaxPool1d(kernel_size=(max_len - kernel_s + 1))  # pool over the entire remaining length

    def forward(self, batch_emb):  # [batch_size, input_channels, seq_len, embeddin_num]
        c = self.cnn(batch_emb)  # convolution: -> [batch_size, output_channels, seq_len - kernel_s + 1, 1]
        a = self.act(c)          # activation, same shape
        a = a.squeeze(dim=-1)    # drop the width-1 dim: [batch_size, output_channels, seq_len - kernel_s + 1]
        m = self.mxp(a)          # max pooling over time: [batch_size, output_channels, 1]
        m = m.squeeze(dim=-1)    # drop the last dim: [batch_size, output_channels]
        return m
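
A shape walkthrough with a dummy batch (hypothetical sizes, matching the 1 * 1 * 7 * 5 comment above):

blk = Block(kernel_s=2, embeddin_num=5, max_len=7, hidden_num=2)
dummy = torch.randn(1, 1, 7, 5)  # [batch_size, 1, max_len, embeddin_num]
print(blk(dummy).shape)          # torch.Size([1, 2]) -- one pooled value per output channel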

Building the TextCNN:

class TextCNNModel(nn.Module):
    def __init__(self, emb_matrix, max_len, class_num, hidden_num):
        super().__init__()

        self.emb_matrix = emb_matrix
        self.embeddin_num = emb_matrix.weight.shape[1]

        self.block1 = Block(2, self.embeddin_num, max_len, hidden_num)
        self.block2 = Block(3, self.embeddin_num, max_len, hidden_num)
        self.block3 = Block(4, self.embeddin_num, max_len, hidden_num)
        self.block4 = Block(5, self.embeddin_num, max_len, hidden_num)

        self.classifier = nn.Linear(hidden_num * 4, class_num)  # 4 blocks (kernel sizes 2-5), hidden_num channels each
        self.loss_fun = nn.CrossEntropyLoss()

    def forward(self, batch_idx, batch_label=None):
        batch_emb = self.emb_matrix(batch_idx)  # [batch_size, 1, max_len] -> [batch_size, 1, max_len, embeddin_num]
        b1_result = self.block1(batch_emb)
        b2_result = self.block2(batch_emb)
        b3_result = self.block3(batch_emb)
        b4_result = self.block4(batch_emb)

        feature = torch.cat([b1_result, b2_result, b3_result, b4_result], dim=1)  # [batch_size, hidden_num * 4]
        pre = self.classifier(feature)

        if batch_label is not None:
            loss = self.loss_fun(pre, batch_label)
            return loss
        else:
            return torch.argmax(pre, dim=-1)
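
A forward-pass sketch with random indices (hypothetical sizes; note the extra channel dimension that TextDataset.__getitem__ adds):

word_count = 100  # hypothetical vocabulary size
model = TextCNNModel(nn.Embedding(word_count, 50), max_len=20, class_num=10, hidden_num=2)
fake_idx = torch.randint(0, word_count, (4, 1, 20))  # [batch_size, 1, max_len]
print(model(fake_idx).shape)                         # torch.Size([4]) -- predicted class ids
print(model(fake_idx, torch.tensor([0, 1, 2, 3])))   # scalar cross-entropy loss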

The main function: train the model and evaluate it:

if __name__ == "__main__":
    train_text, train_label = read_data("train")
    dev_text, dev_label = read_data("dev")

    embeddin_num = 50
    max_len = 20
    batch_size = 200
    epoch = 1000
    lr = 0.001
    hidden_num = 2
    class_num = len(set(train_label))
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    word_2_index, words_embedding = built_curpus(train_text, embeddin_num)  # build the vocabulary and the embedding layer

    train_dataset = TextDataset(train_text, train_label, word_2_index, max_len)
    train_loader = DataLoader(train_dataset, batch_size, shuffle=False)  # no shuffling here; shuffle=True is usually preferable for training

    dev_dataset = TextDataset(dev_text, dev_label, word_2_index, max_len)
    dev_loader = DataLoader(dev_dataset, batch_size, shuffle=False)

    model = TextCNNModel(words_embedding, max_len, class_num, hidden_num).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)

    for e in range(epoch):
        for batch_idx, batch_label in train_loader:
            batch_idx = batch_idx.to(device)
            batch_label = batch_label.to(device)
            loss = model(batch_idx, batch_label)
            loss.backward()
            opt.step()
            opt.zero_grad()

        print(f"loss:{loss:.3f}")

        right_num = 0
        with torch.no_grad():  # no gradients needed during evaluation
            for batch_idx, batch_label in dev_loader:
                batch_idx = batch_idx.to(device)
                batch_label = batch_label.to(device)
                pre = model(batch_idx)
                right_num += int(torch.sum(pre == batch_label))

        print(f"acc = {right_num/len(dev_text)*100:.2f}%")

Below is a more compact PyTorch TextCNN variant that performs the same convolve-pool-concatenate-classify steps, with the kernel sizes made configurable:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes, filter_sizes, num_filters):
        super(TextCNN, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(num_filters * len(filter_sizes), num_classes)

    def forward(self, x):
        x = self.embeddings(x)  # (batch_size, seq_len, embedding_dim)
        x = x.unsqueeze(1)      # (batch_size, 1, seq_len, embedding_dim)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]           # [(batch_size, num_filters, seq_len - filter_size + 1), ...]
        x = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in x]  # [(batch_size, num_filters), ...]
        x = torch.cat(x, 1)  # (batch_size, num_filters * len(filter_sizes))
        x = self.fc(x)       # (batch_size, num_classes)
        return x
```

Here `vocab_size` is the vocabulary size, `embedding_dim` the embedding dimension, `num_classes` the number of classes, `filter_sizes` the list of kernel heights, and `num_filters` the number of kernels per size.

In `forward`, the input text is first mapped through the embedding layer to a (batch_size, seq_len, embedding_dim) tensor. The convolution and pooling layers then extract features, yielding a (batch_size, num_filters * len(filter_sizes)) tensor that the fully connected layer maps to class scores.
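
A hypothetical instantiation of this variant (note that the input carries no extra channel dimension, since forward calls unsqueeze(1) itself):

net = TextCNN(vocab_size=100, embedding_dim=50, num_classes=10, filter_sizes=[2, 3, 4, 5], num_filters=2)
logits = net(torch.randint(0, 100, (4, 20)))  # [batch_size, seq_len] -> logits of shape [4, 10]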