import torch
import os
class Dictionary(object):
    """Bidirectional mapping between words and integer ids.

    Ids are assigned in first-seen order starting at 0.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = {}  # id -> word
        self.idx = 0        # next id to assign

    def add_word(self, word):
        """Register *word*, assigning it the next free id if unseen."""
        # `in` tests membership among the dict's keys, so a word already
        # registered is skipped and keeps its original id.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            # The id counter advances only when a new word is added.
            self.idx += 1

    def __len__(self):
        """Return the vocabulary size."""
        return len(self.word2idx)


class Corpus(object):
    """Turns a text file into a batched tensor of word ids."""

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20):
        """Read the file at *path*, build the vocabulary, and return a
        LongTensor of word ids shaped ``(batch_size, -1)``.

        Each line gets an ``'<eos>'`` token appended. Trailing tokens that
        do not fill a whole batch column are dropped.
        """
        # First pass: add every word to the dictionary and count tokens.
        with open(path, 'r') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

        # Second pass: encode the file content as a flat id tensor.
        ids = torch.LongTensor(tokens)
        token = 0  # index of the current word in the flat tensor
        with open(path, 'r') as f:
            for line in f:
                words = line.split() + ['<eos>']
                for word in words:
                    # Replace each word by its precomputed id.
                    ids[token] = self.dictionary.word2idx[word]
                    token += 1

        # How many full batches fit; truncate the remainder so the
        # reshape below is exact.
        num_batches = ids.size(0) // batch_size
        ids = ids[:num_batches*batch_size]
        return ids.view(batch_size, -1)