from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init
import tqdm
from sklearn.utils import shuffle
def tokenize(x):
    """Split a text on runs of whitespace and return the list of tokens."""
    return x.split()


# A named function is used instead of a lambda bound to a name (PEP 8), so the
# tokenizer has a real name in tracebacks.
# fix_length=200 fixes the length of every text: longer texts are truncated and
# shorter ones are padded.
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=200)
# Labels are not sequences and are not mapped through a vocabulary
# (use_vocab=False) -- presumably they are already numeric; confirm against the corpus.
LABEL = data.Field(sequential=False, use_vocab=False)
# Location of the corpus text file (raw string keeps the Windows backslashes literal).
corpus_path = r"D:\torchtext-practise\1.txt"
# # Load the corpus
# with open(corpus_path, "r", encoding="utf-8") as f:
# # Load the whole dataset into memory
# lines = [line for line in tqdm.tqdm(f, desc="Loading Dataset")]  # use eval(line) when the text is in a structured format
# # Ctrl+Alt+I to indent
# # Shuffle the order
# train_data = shuffle(lines)
# # Get the size of the data (number of lines)
# corpus_lines = len(lines)
def get_text_and_label(line):
line=line.split(" ",1)
# 获取文本和标记
text = line[0]
# print(text)
label
torchtext构建数据集,txt文件
最新推荐文章于 2024-05-03 20:11:46 发布