import torch
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
import re
import os
import pickle
import torch.nn as nn
import time
# NOTE(review): the two lines below were console output (a TqdmWarning from
# tqdm/auto.py: "IProgress not found. Please update jupyter and ipywidgets",
# see https://ipywidgets.readthedocs.io/en/stable/user_install.html) pasted
# into the source by a notebook export. Kept here as a comment so the file
# remains valid Python; update jupyter/ipywidgets to silence the warning.
"""数据集搭建"""deftokenize(content):
content = re.sub("<.*?>"," ", content)
filters =[':','\.','\t','\n','\x93','\x97','\x96','#','$','%','&']
content = re.sub("|".join(filters)," ", content)
tokens =[i.strip().lower()for i in content.split()]return tokens
class word_to_sequence():
    """Text serializer: builds a vocabulary and maps sentences to index lists."""

    def __init__(self):
        # Index 0 = unknown word ("UNK"), index 1 = padding token ("PAD").
        self.dict = {"UNK": 0, "PAD": 1}
        self.count = {}  # word -> raw frequency over all fitted sentences

    def fit(self, sentence):
        """Accumulate word frequencies from one tokenized sentence."""
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self):
        """Build the word->index dict, keeping words seen 6..199 times.

        Very rare and very common words are dropped before indexing.
        """
        self.count = {word: value for word, value in self.count.items()
                      if 5 < value < 200}
        for word in self.count:
            self.dict[word] = len(self.dict)
        # Reverse mapping for decoding index sequences back to words.
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=100):
        """Encode a tokenized sentence as a list of exactly ``max_len`` indices.

        Longer sentences are truncated, shorter ones padded with "PAD".
        Unknown words map to the "UNK" index (0). The caller's list is not
        mutated (the original ``sentence += [...]`` modified it in place).

        :param sentence: list of token strings.
        :param max_len: fixed output length (default 100, as before).
        :return: list of ``max_len`` ints.
        """
        if len(sentence) > max_len:
            sentence = sentence[0:max_len]
        if len(sentence) < max_len:
            sentence = sentence + ["PAD"] * (max_len - len(sentence))
        return [self.dict.get(word, 0) for word in sentence]

    def inverse_transform(self, indices):
        """Decode a list of indices back into words (None for unknown indices)."""
        return [self.inverse_dict.get(index) for index in indices]
# --- Build the vocabulary from the training reviews and save it. ---
ws = word_to_sequence()
train_files = ([("DataSet/aclImdb/train/neg/" + name)
                for name in os.listdir("DataSet/aclImdb/train/neg")]
               + [("DataSet/aclImdb/train/pos/" + name)
                  for name in os.listdir("DataSet/aclImdb/train/pos")])
for file_path in tqdm(train_files):
    # `with` closes each review file promptly instead of leaking the handle.
    with open(file_path, errors='ignore') as f:
        ws.fit(tokenize(f.read().strip()))
ws.build_vocab()
print(len(ws.dict))
# Make sure the output directory exists before pickling the vocabulary.
os.makedirs("runs/model", exist_ok=True)
with open("runs/model/ws.pkl", 'wb') as f:
    pickle.dump(ws, f)
class my_dataset(Dataset):
    """IMDB review dataset yielding (label, encoded_text) pairs.

    Labels come from the review filename, which looks like
    "<id>_<rating>.txt" (rating 1..10 -> label 0..9); the text is encoded
    with the module-level ``ws`` vocabulary into a LongTensor of indices.
    """

    def __init__(self, train=True):
        """Collect review file paths for the train or test split.

        :param train: True for the training split, False for the test split.
        """
        super().__init__()
        split = "train" if train else "test"
        base = "DataSet/aclImdb/" + split
        self.total_file_path_list = (
            [(base + "/neg/" + name) for name in os.listdir(base + "/neg")]
            + [(base + "/pos/" + name) for name in os.listdir(base + "/pos")])

    def __getitem__(self, index):
        cut_path = self.total_file_path_list[index]
        cut_filename = os.path.basename(cut_path)
        # `with` closes the review file instead of leaking the handle.
        with open(cut_path, errors='ignore') as f:
            tokens = tokenize(f.read().strip())
        text = torch.LongTensor(ws.transform(tokens))
        # "<id>_<rating>.txt" -> rating in 1..10 -> zero-based label.
        label = int(cut_filename.split("_")[-1].split(".")[0]) - 1
        return label, text

    def __len__(self):
        return len(self.total_file_path_list)
# DataLoaders over the two IMDB splits. num_workers=0 keeps loading in the
# main process; pin_memory=True speeds up host-to-GPU transfers.
test_data = DataLoader(
    dataset=my_dataset(train=False),
    batch_size=1000,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)
train_data = DataLoader(
    dataset=my_dataset(train=True),
    batch_size=128,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)