import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
# Tokenize by splitting on whitespace
def get_tokenized_imdb(data):
    '''
    :param data: list of [string, label]
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    # Read only the review text from each entry (ignoring the label) and tokenize it
    return [tokenizer(review) for review, _ in data]
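As a quick sanity check, the tokenizer can be run on a tiny made-up dataset (the two reviews below are invented for illustration):

sample_data = [('This movie is great', 1), ('Boring and far too long', 0)]
print(get_tokenized_imdb(sample_data))
# [['this', 'movie', 'is', 'great'], ['boring', 'and', 'far', 'too', 'long']]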
Create the vocabulary
# Build the vocabulary
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)  # tokenize with the whitespace tokenizer above
    # Flatten every token of every review into one list;
    # collections.Counter() then counts how often each distinct token occurs
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)  # drop tokens that occur fewer than 5 times
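Assuming `train_data` holds the IMDB training pairs loaded earlier in the tutorial (not shown in this excerpt), the resulting vocabulary can be inspected like this; the exact size depends on the corpus:

vocab = get_vocab_imdb(train_data)
print('# words in vocab:', len(vocab))  # vocabulary size after the min_freq=5 cut
print(vocab.stoi['movie'])              # word -> index; vocab.itos maps index -> word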
Process each entry in the data list: convert words to their indices and pad every review to the same length
def process_imdb(data, vocab):
    max_len = 500  # truncate or pad each review so that its length becomes 500

    def pad(x):
        # x[:max_len] keeps only the first max_len tokens;
        # x + [0] * (max_len - len(x)) pads shorter reviews with 0 up to max_len
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))

    tokenized_data = get_tokenized_imdb(data)  # tokenize the data
    # Map every word to its index in the vocabulary
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels
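With preprocessing in place, the feature and label tensors can be wrapped into DataLoaders for mini-batch training. This is a sketch that reuses the `vocab` built above and again assumes `train_data`/`test_data` are the IMDB splits:

batch_size = 64
train_set = Data.TensorDataset(*process_imdb(train_data, vocab))
test_set = Data.TensorDataset(*process_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)  # X: (batch_size, 500), y: (batch_size,)
    break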
training on cuda
epoch 1, loss 0.4792, train acc 0.763, test acc 0.844, time 7.9 sec
epoch 2, loss 0.1616, train acc 0.862, test acc 0.872, time 7.9 sec
epoch 3, loss 0.0682, train acc 0.918, test acc 0.881, time 7.9 sec
epoch 4, loss 0.0289, train acc 0.
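For context, the log above follows the format printed by a training loop of roughly this shape. This is only a sketch, not this tutorial's actual code: it assumes a model `net`, the `train_iter`/`test_iter` iterators from above, and an `evaluate_accuracy(test_iter, net)` helper defined elsewhere:

def train(net, train_iter, test_iter, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print('training on', device)
    for epoch in range(num_epochs):
        l_sum, acc_sum, n, batch_count, start = 0.0, 0.0, 0, 0, time.time()
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)  # e.g. nn.CrossEntropyLoss()
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.item()
            acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)  # assumed helper, not shown here
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, l_sum / batch_count, acc_sum / n,
                 test_acc, time.time() - start))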