import torchtext
from torchtext.vocab import Vectors
import torch
from torch import nn
import numpy as np
import random
import jieba
# Fix all RNG seeds for reproducibility across random, NumPy, and PyTorch.
SEED = 53113
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Select GPU when available; seed its RNG too so CUDA ops are reproducible.
use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.cuda.manual_seed(SEED)
device = torch.device('cuda' if use_cuda else 'cpu')
# 1. 读入原始文档和停用词txt文件
#    (Read the raw document and the stop-word txt files.)
# Load the raw novel text, one stripped line per element.
# Use an explicit UTF-8 encoding (the file is Chinese text) so reading does
# not depend on the platform default; the `with` block closes the file, so
# no manual close() is needed.
with open('./mdzs.txt', encoding='utf-8') as f:
    text = [line.strip() for line in f]

# Load the stop-word list, one stripped word per line.
with open('./stop_words.txt', encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]

# Corpus-specific noise tokens not covered by the generic stop-word file.
stop_word = [' ', 'PS', '1V1', 'HE', '┃', 'O', '∩', '☆']
stop_words.extend(stop_word)
# Notebook output preview (kept as comments so the file stays valid Python):
# >>> text[:10]
# ['',
#  '《魔道祖师[重生]》作者:墨香铜臭',
#  '',
#  '文案:',
#  '前世的魏无羡万人唾骂,声名狼藉。',
#  '被护持一生的师弟带人端了老巢,',
#  '纵横一世,死无全尸。',
#  '',
#  '曾掀起腥风血雨的一代魔道祖师,重生成了一个……',
#  '脑残。']
# >>> stop_words[:10]
# ['', '为止', '纵然', 'all', '例如', '[④e]', 'when', '亦', '来讲', '谁料']
# 2. 分词处理 (Tokenization.)
# Tokenize every line with jieba and drop stop words.
# Membership tests run once per token over a ~thousand-entry stop list, so
# build a set first: O(1) lookup instead of an O(n) list scan per token.
_stop_set = set(stop_words)
text_token = [word
              for sentence in text
              for word in jieba.lcut(sentence)
              if word not in _stop_set]

# Space-joined corpus string, written out for torchtext to consume.
a = ' '.join(text_token)
with open('cql.txt', 'w', encoding='utf-8') as f:
    f.write(a)
# 3. 建立字典和迭代器 (Build the vocabulary and iterators.)
# Field with default settings (whitespace tokenization, adds <unk>/<pad>).
# NOTE(review): this is the legacy torchtext API (pre-0.9);
# torchtext.data.Field was removed in later releases — confirm the pinned
# torchtext version before upgrading.
field = torchtext.data.Field()
# splits() returns a tuple even when only a train file is given; take [0].
train = torchtext.datasets.LanguageModelingDataset.splits(path='./',train="cql.txt",text_field=field)[0]
field.build_vocab(train, max_size