# 1. 数据集 (Dataset)
import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
import numpy as np
import torch
from torch import nn, optim
from torchtext import data, datasets
# Load IMDB through torchtext; per the original note, the dataset does not
# have to be downloaded by hand.
#
# Two Field objects declare how each column of a sample is processed:
# TEXT handles the review text, LABEL handles the sentiment label.
# tokenize='spacy' selects the spaCy tokenizer (presumably requires spaCy and
# an English model to be installed -- confirm for your torchtext version).
TEXT = data.Field(tokenize='spacy')
# Labels are stored with a float dtype.
LABEL = data.LabelField(dtype=torch.float)

# Build the splits. IMDB holds 50000 movie reviews in two classes (positive
# and negative); each sample is run through the (TEXT, LABEL) fields above
# and the result is returned as a training set and a test set.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Report the size of each split (25000 examples apiece).
for _caption, _split in (
    ('len of train data:', train_data),
    ('len of test data:', test_data),
):
    print(_caption, len(_split))

# A torchtext.data.Example represents one sample (data + label); show the
# tokenized text of one training example as a sanity check.
_sample_example = train_data.examples[15]
print(_sample_example.text)