Introduction to AG_NEWS
Brief description: AG_NEWS was constructed by selecting the 4 largest categories from the original news corpus. Each category contains 30,000 training samples and 1,900 test samples, for a total of 120,000 training samples and 7,600 test samples. https://github.com/mhjabreel/CharCNN/tree/master/data
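Before any preprocessing, it helps to see what a raw sample looks like. A minimal sketch (assuming torchtext's built-in AG_NEWS loader used throughout this post, which yields (label, text) tuples with integer labels 1-4 for World, Sports, Business and Sci/Tech):
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')   # an iterable of (label, text) tuples
label, text = next(iter(train_iter))  # peek at the first sample
print(label)  # an integer in 1..4
print(text)   # the raw news text as a single string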
A quick introduction to text processing
from torchtext.vocab import build_vocab_from_iterator
# Suppose this is a list of tokens produced by tokenizing some text
word_list = ["apple", "banana", "orange", "pear", "grape", "kiwi", "strawberry", "<unk>", "watermelon"]
# Build the vocabulary object
vocab = build_vocab_from_iterator(iter([word_list]))
# Set the default index to the index of "<unk>"; here that index happens to be 0, because "<unk>" sorts before the lowercase tokens
vocab.set_default_index(vocab["<unk>"])
print(vocab(['apple', 'banana']))
# Convert the full token list (including "<unk>") to an index list; with every token at frequency 1,
# the vocabulary is ordered alphabetically, and "<unk>" gets index 0 because '<' precedes letters in ASCII
index_list = [vocab[token] for token in word_list]
# Print the resulting index list
print(index_list)
vocab_size = len(vocab)  # size of the vocabulary
print(vocab_size)
Output:
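Expected output, assuming torchtext breaks frequency ties alphabetically (all tokens here appear exactly once, and "<unk>" sorts first since '<' precedes lowercase letters in ASCII):
[1, 2]
[1, 2, 5, 6, 3, 4, 7, 0, 8]
9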
Getting started with AG_NEWS text classification
1. Imports and text preprocessing
from torchtext.datasets import AG_NEWS
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_iter = AG_NEWS(split='train')
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('basic_english')  # returns a tokenizer function; see the note on get_tokenizer below
# Iterate over data_iter, whose elements are (label, text) tuples. Only the text is needed
# here, so the label is discarded into _; each text is run through the tokenizer, and
# yield turns this function into a generator of token lists.
def yield_tokens(data_iter):
    for _, text in data_iter:  # unpack the label and text of each sample
        yield tokenizer(text)  # tokenize the text and yield the token list
# next() on a freshly created generator always returns the first sample, so reuse
# one generator to preview the first three tokenized samples
token_gen = yield_tokens(train_iter)
for i in range(3):
    print(next(token_gen))
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # set the default index, returned for any out-of-vocabulary token
print(vocab(['here', 'is', 'an', 'example']))  # prints the index of each token
text_pipeline = lambda x: vocab(tokenizer(x))  # maps a raw string to a list of token indices
label_pipeline = lambda x: int(x) - 1  # maps the labels 1..4 to 0..3
print(text_pipeline('here is the an example'))
PS: get_tokenizer() is a function provided by an NLP library for obtaining an appropriate tokenizer. Tokenization is a key step in natural language processing: it splits a piece of text into meaningful units called "tokens", which form the basis of downstream NLP tasks.
Different NLP libraries provide different tokenizers, and get_tokenizer() is the factory for obtaining one. Given its argument (typically a tokenizer name, optionally together with a language), it returns the corresponding tokenizer function.
For example, with PyTorch's torchtext library, an English tokenizer can be obtained as follows:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
tokens = tokenizer("Hello world, I'm an AI assistant.")
print(tokens)
The output is:
['hello', 'world', ',', 'i', "'", 'm', 'an', 'ai', 'assistant', '.']
2. Define the data loader
from torch.utils.data import DataLoader
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
        # collect the label of each sample
        label_list.append(label_pipeline(_label))
        # convert each text into a tensor of token indices
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        # record each sequence's length; the cumulative sums below become the offsets
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)  # concatenate all sequences into one flat tensor
    # cumulative sum along dim 0: the start position of each sequence in the flat tensor
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    return label_list.to(device), text_list.to(device), offsets.to(device)
# Data loader (a first demonstration; the loaders actually used for training are built in step 4)
dataloader = DataLoader(train_iter,
                        batch_size=8,
                        shuffle=False,
                        collate_fn=collate_batch)
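To make the offsets concrete, here is a small illustration with two made-up samples (the texts are hypothetical; what matters is that EmbeddingBag later uses offsets to find where each sequence starts inside the flattened text tensor):
batch = [(3, 'wall st . bears claw back'), (4, 'carlyle looks toward commercial')]
labels, texts, offsets = collate_batch(batch)
print(labels)   # tensor([2, 3]): label_pipeline subtracted 1 from each label
print(texts)    # a single flat 1-D tensor holding both token sequences back to back
print(offsets)  # tensor([0, n]), where n is the token count of the first text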
3. Define the network model (a simple fully connected classifier)
from torch import nn
# The model takes the vocabulary size, the embedding dimension, and the number of classes.
# Its output is a tensor of size num_class holding a score for each class.
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size,  # size of the vocabulary
                                         embed_dim,   # embedding dimension
                                         sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        # EmbeddingBag looks up all token embeddings and reduces each sequence
        # (delimited by offsets) to a single vector, by mean by default
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)
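A quick sanity check of the shapes with toy numbers (vocabulary size 10, embedding dimension 4 and 3 classes are arbitrary choices for illustration):
toy_model = TextClassificationModel(vocab_size=10, embed_dim=4, num_class=3)
toy_text = torch.tensor([1, 2, 3, 4, 5, 6], dtype=torch.int64)  # two sequences flattened together
toy_offsets = torch.tensor([0, 4], dtype=torch.int64)           # sequence starts: positions 0 and 4
print(toy_model(toy_text, toy_offsets).shape)  # torch.Size([2, 3]): one row of class scores per sequence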
4. Set the hyperparameters and train
num_class = len(set([label for (label, text) in train_iter]))  # deduplicate the labels with a set to get the number of classes
print(num_class)
vocab_size = len(vocab)  # size of the vocabulary
em_size = 64  # embedding dimension
model = TextClassificationModel(vocab_size, em_size, num_class).to(device)
import time
import torch.optim as optim
def train(dataloader):
    model.train()  # switch to training mode
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 100
    start_time = time.time()
    for idx, (label, text, offsets) in enumerate(dataloader):
        predicted_label = model(text, offsets)
        optimizer.zero_grad()                     # reset the gradients
        loss = criterion(predicted_label, label)  # loss between predictions and true labels
        loss.backward()                           # backpropagation
        optimizer.step()                          # update the parameters
        # accumulate accuracy and loss statistics
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:1d} | {:4d}/{:4d} batches '
                  '| train_acc {:4.3f} train_loss {:4.5f}'.format(epoch, idx, len(dataloader),
                                                                  total_acc / total_count,
                                                                  train_loss / total_count))
            total_acc, train_loss, total_count = 0, 0, 0
            start_time = time.time()
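One optional refinement borrowed from the official torchtext tutorial (not part of the code above, so treat it as a suggestion): clipping the gradient norm between loss.backward() and optimizer.step() helps keep training stable with a learning rate as large as the one used below.
torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # cap the global gradient norm at 0.1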
def evaluate(dataloader):
    model.eval()  # switch to evaluation mode
    total_acc, val_loss, total_count = 0, 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)  # compute the loss
            # accumulate evaluation statistics
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            val_loss += loss.item()
            total_count += label.size(0)
    return total_acc / total_count, val_loss / total_count
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 100      # number of training epochs
LR = 3            # learning rate
BATCH_SIZE = 512  # batch size for training
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)  # each scheduler.step() multiplies the LR by 0.1
total_accu = None
train_iter, test_iter = AG_NEWS()  # load the train and test splits
# convert the iterable datasets into map-style datasets so they support len() and random_split
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# hold out 5% of the training set for validation
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(train_dataset,
                                          [num_train, len(train_dataset) - num_train])
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)
    # if validation accuracy stopped improving, decay the learning rate
    if total_accu is not None and total_accu > val_acc:
        scheduler.step()
    else:
        total_accu = val_acc
    print('-' * 69)
    print('| epoch {:1d} | time: {:4.2f}s | '
          'valid_acc {:4.3f} valid_loss {:4.3f}'.format(epoch,
                                                        time.time() - epoch_start_time,
                                                        val_acc, val_loss))
    print('-' * 69)
Output:
| epoch 1 | 100/ 223 batches | train_acc 0.449 train_loss 0.00251
| epoch 1 | 200/ 223 batches | train_acc 0.646 train_loss 0.00195
---------------------------------------------------------------------
| epoch 1 | time: 9.22s | valid_acc 0.727 valid_loss 0.002
---------------------------------------------------------------------
| epoch 2 | 100/ 223 batches | train_acc 0.767 train_loss 0.00136
| epoch 2 | 200/ 223 batches | train_acc 0.815 train_loss 0.00111
---------------------------------------------------------------------
| epoch 2 | time: 9.06s | valid_acc 0.821 valid_loss 0.001
---------------------------------------------------------------------
| epoch 3 | 100/ 223 batches | train_acc 0.842 train_loss 0.00094
| epoch 3 | 200/ 223 batches | train_acc 0.855 train_loss 0.00086
---------------------------------------------------------------------
| epoch 3 | time: 9.06s | valid_acc 0.854 valid_loss 0.001
---------------------------------------------------------------------
| epoch 4 | 100/ 223 batches | train_acc 0.867 train_loss 0.00080
| epoch 4 | 200/ 223 batches | train_acc 0.876 train_loss 0.00075
---------------------------------------------------------------------
| epoch 4 | time: 9.29s | valid_acc 0.868 valid_loss 0.001
......
| epoch 98 | 100/ 223 batches | train_acc 0.913 train_loss 0.00052
| epoch 98 | 200/ 223 batches | train_acc 0.912 train_loss 0.00052
---------------------------------------------------------------------
| epoch 98 | time: 9.41s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------
| epoch 99 | 100/ 223 batches | train_acc 0.914 train_loss 0.00052
| epoch 99 | 200/ 223 batches | train_acc 0.913 train_loss 0.00053
---------------------------------------------------------------------
| epoch 99 | time: 9.74s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------
| epoch 100 | 100/ 223 batches | train_acc 0.913 train_loss 0.00052
| epoch 100 | 200/ 223 batches | train_acc 0.912 train_loss 0.00053
---------------------------------------------------------------------
| epoch 100 | time: 9.76s | valid_acc 0.896 valid_loss 0.001
---------------------------------------------------------------------
Process finished with exit code 0
As the log shows, even this simple setup reaches roughly 90% validation accuracy; further gains could come from hyperparameter tuning or a stronger network architecture.
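Note that test_dataloader is built above but never used. A minimal sketch of the missing final step, reusing the evaluate function on the held-out test split (the exact numbers will depend on the run):
test_acc, test_loss = evaluate(test_dataloader)
print('test_acc {:4.3f} test_loss {:4.3f}'.format(test_acc, test_loss))
For single-sentence inference, a small helper can map the model output back to a class name (ag_news_label and predict are names introduced here for illustration; AG_NEWS's label order 1-4 is World, Sports, Business, Sci/Tech):
ag_news_label = {1: 'World', 2: 'Sports', 3: 'Business', 4: 'Sci/Tech'}
def predict(text):
    with torch.no_grad():
        tokens = torch.tensor(text_pipeline(text), dtype=torch.int64).to(device)
        offsets = torch.tensor([0]).to(device)
        output = model(tokens, offsets)
        return ag_news_label[output.argmax(1).item() + 1]
print(predict('Tottenham beat Arsenal 2-0 in the north London derby'))  # likely 'Sports' for a well-trained model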