# Pre-compiled once at import time instead of re-joining/re-compiling the
# pattern on every call.  A character class replaces the old "|".join()
# alternation, which accidentally contained an unescaped "^" (a zero-width
# start-of-string anchor, not a literal caret match).
_TAG_RE = re.compile(r"<.*?>", re.S)
_PUNCT_RE = re.compile(r'[!"#$%&()*+,./:;<=>?@\[\\\]^_`{|}~\t\n\x96\x97“”-]')


def tokenize(text):
    """Split *text* into word tokens.

    HTML tags (e.g. ``<br />``) are removed first, then punctuation and
    control characters are replaced with spaces, and the result is split
    on whitespace.

    Args:
        text: raw review text (a str).

    Returns:
        list[str]: the whitespace-delimited tokens; empty list for empty input.
    """
    text = _TAG_RE.sub(" ", text)
    text = _PUNCT_RE.sub(" ", text)
    # str.split() with no argument already discards surrounding whitespace,
    # so no per-token strip() is needed.
    return text.split()
# Step 2: prepare the dataset
# 2. Prepare the dataset.
class ImdbDataset(Dataset):
    """IMDB review dataset yielding ``(label, tokens)`` pairs.

    ``label`` is the star rating parsed from the file name minus one
    (file names look like ``<id>_<rating>.txt``, rating 1-10, so labels
    span 0-9); ``tokens`` is the tokenized review text.
    """

    def __init__(self, mode):
        # Initialise inherited attributes via the parent class.
        super(ImdbDataset, self).__init__()
        # Pick the train/ or test/ split; each one contains neg/ and pos/
        # sub-directories.  os.path.join builds the full path component by
        # component (the original embedded "/" inside a single component).
        split = "train" if mode == "train" else "test"
        text_path = [os.path.join(data_base_path, split, sub) for sub in ("neg", "pos")]
        self.total_file_path_list = []
        for folder in text_path:
            self.total_file_path_list.extend(
                os.path.join(folder, name) for name in os.listdir(folder)
            )

    def __getitem__(self, idx):
        cur_path = self.total_file_path_list[idx]
        cur_filename = os.path.basename(cur_path)
        # Parse the rating out of "<id>_<rating>.txt" and shift to [0-9].
        label = int(cur_filename.split("_")[-1].split(".")[0]) - 1
        # Context manager closes the handle promptly (the original leaked
        # one open file per item); the IMDB corpus is UTF-8 encoded, so the
        # encoding is pinned instead of relying on the platform default.
        with open(cur_path, encoding="utf-8") as f:
            text = tokenize(f.read().strip())  # whitespace-based tokenization
        return label, text

    def __len__(self):
        return len(self.total_file_path_list)