import collections
import os
import time
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import torch.nn.functional as F
import random
from tqdm import tqdm
# Tokenize on whitespace
def get_tokenized_imdb(data):
    '''
    :param data: list of [string, label]
    '''
    def tokenizer(text):
        return [tok.lower() for tok in text.split(' ')]
    # Read only the review text (not the label) from each pair
    # and split it into lowercase tokens with tokenizer
    return [tokenizer(review) for review, _ in data]
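As a quick sanity check, the tokenizer can be run on a couple of hand-written samples (the reviews and labels below are made up for illustration):

data = [['I love this movie', 1], ['This movie is terrible', 0]]
print(get_tokenized_imdb(data))
# [['i', 'love', 'this', 'movie'], ['this', 'movie', 'is', 'terrible']]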
Create the vocabulary
# Create the vocabulary
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)  # tokenize the data on whitespace
    # Flatten every sentence into one token list; collections.Counter()
    # counts how many times each distinct word occurs
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)  # drop words that occur fewer than 5 times
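Assuming the IMDB training set has already been loaded elsewhere as a list of [review_string, label] pairs (called train_data here for illustration), the vocabulary can be built and inspected like so; stoi and itos are attributes of the legacy torchtext Vocab used above:

vocab = get_vocab_imdb(train_data)
print(len(vocab))           # words with frequency >= 5, plus special tokens
print(vocab.stoi['movie'])  # index assigned to a frequent word
print(vocab.itos[:5])       # first few entries of the index-to-word list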
Process each row of the data list: convert words to indices and make every row the same length
def process_imdb(data, vocab):
    max_len = 500  # truncate or zero-pad every review so its length becomes 500

    def pad(x):
        # x[:max_len] keeps only the first max_len tokens;
        # x + [0]*(max_len - len(x)) pads shorter reviews with 0 up to max_len
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x))

    tokenized_data = get_tokenized_imdb(data)  # tokenize the data
    # Map each word to its index in the vocabulary
    features = torch.tensor([pad([vocab.stoi[word] for word in words])
                             for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels
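Putting the pieces together (still assuming the hypothetical train_data above; the batch size of 64 is an arbitrary choice), the returned tensors can be wrapped into a DataLoader via the Data alias imported at the top:

features, labels = process_imdb(train_data, get_vocab_imdb(train_data))
print(features.shape)  # torch.Size([num_reviews, 500])
train_set = Data.TensorDataset(features, labels)
train_iter = Data.DataLoader(train_set, batch_size=64, shuffle=True)
for X, y in train_iter:
    print(X.shape, y.shape)  # e.g. torch.Size([64, 500]) torch.Size([64])
    break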