# Dataset: a novel titled "重生后四个哥哥都团宠我.txt". The code is adapted from someone else's work, purely to record the learning process.
import torchtext
from torchtext import data
from torchtext.vocab import Vectors
import torch.nn as nn
import torch
from sklearn.utils import shuffle
from torchtext_practise.pra3class import RNNModel
import jieba
# Path to the raw-text training corpus (a Chinese web novel).
corpus_path=r"D:\torchtext_practise\重生后四个哥哥都团宠我.txt"
# Number of sequences per training batch.
BATCH_SIZE = 64
# Dimensionality of the word-embedding vectors (consumed by the RNN model).
EMBEDDING_SIZE = 650
# Upper bound on vocabulary size when building the vocab.
MAX_VOCAB_SIZE = 50000
def chinese_tokenizer(text):
    """Segment Chinese text into a list of word tokens using jieba.

    Args:
        text: raw string to tokenize.

    Returns:
        list[str]: jieba word-segmented tokens, in order.
    """
    # Original body was unindented (IndentationError) and wrapped lcut()
    # in a redundant list comprehension; jieba.lcut already returns a list.
    return jieba.lcut(text)
# Field describing how raw text is processed: a sequential token stream,
# segmented with the jieba-based tokenizer defined above.
TEXT = data.Field(sequential=True, tokenize=chinese_tokenizer)

# Build the language-modeling dataset (one long token stream) from the corpus.
train = torchtext.datasets.LanguageModelingDataset(corpus_path, text_field=TEXT)

# Build the vocabulary; cap it at MAX_VOCAB_SIZE (the constant was defined
# above but previously never used).
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

# BPTT iterator: yields (text, target) batches where target is text shifted
# by one token. Use the BATCH_SIZE constant instead of a hard-coded 64.
train_iter = data.BPTTIterator(dataset=train, batch_size=BATCH_SIZE, bptt_len=32)

# Peek at the first batch: decode column 0 of the input and its target.
it = iter(train_iter)
batch = next(it)
print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:, 0].data]))
# The original last line was missing its two closing parentheses (SyntaxError).
print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:, 0].data]))