AG_NEWS News Classification Task

About this news topic classification task: none of the code from the video lectures or online posts currently runs end to end, so this walkthrough starts from downloading the dataset and rewrites the whole pipeline.

1. Dataset Introduction

The AG_NEWS dataset contains 4 files; the important ones are listed below (an illustrative sample of the CSV layout follows the list):

classes.txt: the list of class names

test.csv: test data, 7,600 rows

train.csv: training data, 120,000 rows
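Each CSV row holds three comma-separated columns: the class index (1 to 4), the title, and the description, with fields wrapped in double quotes (see the dataset README quoted at the end of this post). As a rough, hypothetical illustration of what the files contain (not verbatim dataset content):

# classes.txt -- one class name per line, corresponding to labels 1..4
World
Sports
Business
Sci/Tech

# train.csv / test.csv -- an illustrative row
"3","Wall St. Bears Claw Back Into the Black","Short-sellers, Wall Street's dwindling band of ultra-cynics, are seeing green again."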

2. Dataset Preprocessing

Import packages

import torch
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences                           # in older Keras versions: from keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import time
from torch.utils.data.dataset import random_split              # utility for randomly splitting a dataset
import warnings
warnings.filterwarnings('ignore')

Read the CSV files

def load_data(csv_file):
    df = pd.read_csv(csv_file, header=None)     # pandas treats the first row as a header by default; header=None keeps it as data
    data_temp = []

    # Iterate row by row: _ is the row index, row is the row content
    for _, row in df.iterrows():
        label = row[0]
        context = row[1] + " " + row[2]         # concatenate title and description (with a space so the two fields don't merge into one word)
        data_temp.append((label, context))
    return data_temp


cutlen = 64
train_dataset = load_data("./data/ag_news_csv/train.csv")
test_dataset = load_data("./data/ag_news_csv/test.csv")
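A quick sanity check (my addition, assuming the paths above are correct) that the row counts match the file sizes listed in section 1:

print(len(train_dataset), len(test_dataset))    # expected: 120000 7600
print(train_dataset[0])                         # (label, "title description") tuple for the first row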

Package the loaded data into datasets the model can consume, build the vocabulary, and inspect the result

def process_datasets_by_Tokenizer(train_datasets, test_datasets, cutlen=cutlen):
    tokenizer = Tokenizer()

    train_datasets_texts = []
    train_datasets_labels = []
    test_datasets_texts = []
    test_datasets_labels = []

    for index in range(len(train_datasets)):
        train_datasets_labels.append(train_datasets[index][0] - 1)
        train_datasets_texts.append(train_datasets[index][1])

    for index in range(len(test_datasets)):
        test_datasets_labels.append(test_datasets[index][0] - 1)
        test_datasets_texts.append(test_datasets[index][1])

    all_datasets_texts = train_datasets_texts + test_datasets_texts
    all_datasets_labels = train_datasets_labels + test_datasets_labels

    tokenizer.fit_on_texts(all_datasets_texts)

    train_datasets_seqs = tokenizer.texts_to_sequences(train_datasets_texts)
    test_datasets_seqs = tokenizer.texts_to_sequences(test_datasets_texts)

    train_datasets_seqs = pad_sequences(train_datasets_seqs, cutlen)
    test_datasets_seqs = pad_sequences(test_datasets_seqs, cutlen)

    train_datasets = list(zip(train_datasets_seqs, train_datasets_labels))
    test_datasets = list(zip(test_datasets_seqs, test_datasets_labels))

    vocab_size = len(tokenizer.index_word.keys())
    num_class = len(set(all_datasets_labels))
    return train_datasets, test_datasets, vocab_size, num_class, tokenizer


train_datasets, test_datasets, vocab_size, num_class, tokenizer = process_datasets_by_Tokenizer(train_dataset, test_dataset, cutlen=cutlen)

print("查看处理之后的数据: ")
print("train:\n", train_datasets[:2])
print("test:\n", test_datasets[:2])
print("vocab_size = {}, num_class = {}".format(vocab_size, num_class))
print()
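To see what the Tokenizer and pad_sequences actually produced, here is a small sketch (my addition, not part of the original pipeline) that maps one padded sequence back to words through tokenizer.index_word; index 0 is the padding value and has no word entry:

seq, label = train_datasets[0]
words = [tokenizer.index_word[int(i)] for i in seq if i != 0]   # skip the zero padding added by pad_sequences
print(label, words[:10])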

3. Building a Text Classification Model with an Embedding Layer

BATCH_SIZE = 16
VOCAB_SIZE = vocab_size                 # total number of distinct words in the corpus
NUM_CLASS = num_class                   # total number of classes
EMBED_DIM = 128

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TextSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        """
        Initialization.
        :param vocab_size: total number of distinct words in the corpus
        :param embed_dim: dimensionality of the word embeddings
        :param num_class: number of text classes
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)     # initialize embedding weights uniformly in [-0.5, 0.5]
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()                                      # zero the bias of the linear layer

    def forward(self, text):
        """
        Forward pass.
        :param text: the numericalized text, flattened to a 1-D tensor by generate_batch
        :return: a tensor with one score per class, used to decide the text's category
        """
        embedded = self.embedding(text)                     # (num_tokens, embed_dim)
        c = embedded.size(0) // BATCH_SIZE                  # number of tokens per sample in this batch
        embedded = embedded[: BATCH_SIZE * c]               # drop the tail so the length divides evenly
        embedded = embedded.transpose(1, 0).unsqueeze(0)    # (1, embed_dim, num_tokens)
        embedded = F.avg_pool1d(embedded, kernel_size=c)    # average each sample's c token embeddings -> (1, embed_dim, BATCH_SIZE)
        return self.fc(embedded[0].transpose(1, 0))         # (BATCH_SIZE, num_class)


# Instantiate the model; +1 because the Keras Tokenizer indexes words from 1 and 0 is reserved for padding
model = TextSentiment(VOCAB_SIZE + 1, EMBED_DIM, NUM_CLASS).to(device)

print("Inspect the model:")
print(model)
print()
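To make the pooling trick in forward() concrete, here is a minimal shape walkthrough (my own sketch, not from the original post), assuming BATCH_SIZE = 16, cutlen = 64 and EMBED_DIM = 128 as above. Because generate_batch (defined in section 4) concatenates every sample into one 1-D tensor, avg_pool1d with kernel_size = c averages each sample's 64 token embeddings into a single vector:

# Fake flattened batch: BATCH_SIZE * cutlen token ids, shaped the way generate_batch would produce them
dummy = torch.randint(0, VOCAB_SIZE, (BATCH_SIZE * cutlen,)).to(device)
emb = model.embedding(dummy)                       # (1024, 128) = (BATCH_SIZE * cutlen, EMBED_DIM)
c = emb.size(0) // BATCH_SIZE                      # 64, i.e. cutlen
emb = emb.transpose(1, 0).unsqueeze(0)             # (1, 128, 1024)
pooled = F.avg_pool1d(emb, kernel_size=c)          # (1, 128, 16): one averaged vector per sample
logits = model.fc(pooled[0].transpose(1, 0))       # (16, 4) = (BATCH_SIZE, NUM_CLASS)
print(logits.shape)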

4. Batching the Data

def generate_batch(batch):
    """
    Collate a batch.
    :param batch: a list of (sample, label) tuples of length batch_size, e.g. [(sample1, label1), (sample2, label2), ...]
    :return: the samples concatenated into one tensor and the labels as a tensor,
             e.g. text = tensor([sample1, sample2, ...]), label = tensor([label1, label2, ...])
    """
    text = []
    label = []
    for item in batch:
        text.extend(item[0])        # flatten every sample's token ids into one list
        label.append(item[1])
    return torch.tensor(text), torch.tensor(label)


# Construct a toy input
print("Test merging one batch into tensors:")
batch = [(torch.tensor([3, 23, 2, 8]), 1), (torch.tensor([3, 45, 21, 6]), 0)]
res = generate_batch(batch)
print(res)
print()

5. Building the Training and Validation Functions

Build the loss function, optimizer, and learning-rate scheduler

criterion = torch.nn.CrossEntropyLoss().to(device)                             # use the predefined cross-entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)                        # stochastic gradient descent optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)           # StepLR scheduler to decay the learning rate after every epoch
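As a side illustration (my addition; the real scheduler.step() call happens once per epoch inside train()), StepLR with step_size = 1 and gamma = 0.9 shrinks the learning rate by 10% after every epoch:

# Throwaway optimizer/scheduler, used only to show the decay; they do not touch the model
_opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=4.0)
_sched = torch.optim.lr_scheduler.StepLR(_opt, 1, gamma=0.9)
for _ in range(3):
    print(_opt.param_groups[0]['lr'])   # roughly 4.0, 3.6, 3.24
    _sched.step()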

Define the training function

def train(train_data):
    train_loss = 0
    train_acc = 0

    # Use a DataLoader to feed the data in batches of BATCH_SIZE
    data = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)

    for i, (text, cls) in enumerate(data):
        optimizer.zero_grad()                                   # clear gradients from the previous step
        text = text.to(device)
        cls = cls.to(device)
        output = model(text)                                    # forward pass
        loss = criterion(output, cls)                           # compute the batch loss
        train_loss += loss.item()                               # add the batch loss to the running total
        loss.backward()                                         # backpropagate
        optimizer.step()                                        # update the parameters
        train_acc += (output.argmax(1) == cls).sum().item()     # add the number of correct predictions in this batch

    scheduler.step()

    # Return the average loss and accuracy for this epoch
    return train_loss / len(train_data), train_acc / len(train_data)

Define the validation function

def valid(test_data):
    valid_loss = 0
    valid_acc = 0

    # As in training, use a DataLoader to batch the data
    data = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)

    for text, cls in data:
        with torch.no_grad():
            text = text.to(device)
            cls = cls.to(device)
            output = model(text)
            loss = criterion(output, cls)
            valid_loss += loss.item()                               # accumulate the batch loss (use a separate accumulator, not the name `loss` itself)
            valid_acc += (output.argmax(1) == cls).sum().item()     # accumulate the number of correct predictions

    # Return the average loss and accuracy for this round of validation
    return valid_loss / len(test_data), valid_acc / len(test_data)

6. Training and Validating the Model (using the components defined above)

Define the training configuration

N_EPOCHS = 20                                     # number of training epochs

train_len = int(len(train_datasets) * 0.95)       # use 95% of train_datasets for training; compute its length first

# Then use random_split to shuffle and split into training and validation subsets
sub_train_, sub_valid_ = random_split(train_datasets, [train_len, len(train_datasets) - train_len])
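A quick check of the split sizes (my addition): with 120,000 training rows, the 95/5 split should give 114,000 training and 6,000 validation samples.

print(len(sub_train_), len(sub_valid_))           # expected: 114000 6000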

Train for multiple epochs and print the loss and accuracy on the training and validation sets

# Train one epoch at a time
for epoch in range(N_EPOCHS):
    start_time = time.time()                                # record when this epoch starts

    # Call train and valid to get the average loss and accuracy for this epoch
    train_loss, train_acc = train(sub_train_)
    valid_loss, valid_acc = valid(sub_valid_)

    # Total time spent on training and validation
    secs = int(time.time() - start_time)

    # Express it as minutes and seconds
    mins = secs // 60
    secs = secs % 60

    # Print the elapsed time, average loss and average accuracy
    print('Epoch: %d' % (epoch + 1), " | time in %d minutes, %d seconds" % (mins, secs))
    print(f'\t Loss: {train_loss: .4f}(train) \t | \t Acc: {train_acc * 100: .1f} % (train)')
    print(f'\t Loss: {valid_loss: .4f}(valid) \t | \t Acc: {valid_acc * 100: .1f} % (valid)')


7. Evaluation on the Test Set

valid_loss, valid_acc = valid(test_datasets)
print("测试集上测试: ")
print(f'\t Loss: {valid_loss: .4f}(valid) \t | \t Acc: {valid_acc * 100: .1f} % (valid)')
print()


About the dataset: 496,835 news articles from the 4 largest categories of the AG news corpus, gathered from more than 2,000 news sources; the classification dataset uses only the title and description fields. Each category has 30,000 training samples and 1,900 test samples. The dataset README is reproduced below.

AG's News Topic Classification Dataset, Version 3, Updated 09/09/2015

ORIGIN

AG is a collection of more than 1 million news articles. News articles have been gathered from more than 2000 news sources by ComeToMyHead in more than 1 year of activity. ComeToMyHead is an academic news search engine which has been running since July, 2004. The dataset is provided by the academic community for research purposes in data mining (clustering, classification, etc), information retrieval (ranking, search, etc), xml, data compression, data streaming, and any other non-commercial activity. For more information, please refer to http://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html .

The AG's news topic classification dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu) from the dataset above. It is used as a text classification benchmark in the following paper: Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).

DESCRIPTION

The AG's news topic classification dataset is constructed by choosing the 4 largest classes from the original corpus. Each class contains 30,000 training samples and 1,900 testing samples. The total number of training samples is 120,000 and testing 7,600. The file classes.txt contains a list of classes corresponding to each label. The files train.csv and test.csv contain all the training samples as comma-separated values. There are 3 columns in them, corresponding to class index (1 to 4), title and description. The title and description are escaped using double quotes ("), and any internal double quote is escaped by 2 double quotes (""). New lines are escaped by a backslash followed with an "n" character, that is "\n".