import os
import jieba
import random
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# Parameter settings
word_size = 64     # word-vector dimensionality
window = 5         # window size
nb_negative = 25   # number of randomly drawn negative samples
min_count = 10     # words occurring fewer than min_count times are dropped; such low-frequency words act like noise
file_num = 10000   # only file_num files are used for training
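# Illustrative sketch (hypothetical names, not part of the training script):
# skip-gram with negative sampling treats the words within `window` positions
# of a centre word as positives and pairs them with `nb_negative` randomly
# drawn words as negatives. Real word2vec draws negatives from the unigram
# distribution raised to the 0.75 power and excludes the true context words;
# the uniform random.choices below is a simplification for demonstration.
demo_tokens = ["deep", "learning", "for", "text", "classification"]
demo_vocab = demo_tokens + ["other", "words"]
for pos, centre in enumerate(demo_tokens):
    context = demo_tokens[max(0, pos - window):pos] + demo_tokens[pos + 1:pos + 1 + window]
    negatives = random.choices(demo_vocab, k=nb_negative)  # sampled with replacement
    print(centre, context, negatives)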

# Data preprocessing
def get_all_apths(dirname):
    paths = []  # store all the txt file paths in this list
    for maindir, subdir, file_name_list in os.walk(dirname):
        for filename in file_name_list:
            apath = os.path.join(maindir, filename)  # join into a complete path
            paths.append(apath)
    return paths
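# Usage sketch -- "./corpus_dir" is a hypothetical location, not taken from
# this script; substitute the directory that holds the training .txt files.
# os.walk quietly yields nothing if the path does not exist.
demo_paths = get_all_apths("./corpus_dir")
print("found", len(demo_paths), "files")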

def get_corpus(file_path):
    words = []
    corpus = []
    i = 0
    for file in file_path:
        if ".txt" in file:
            i += 1
            try:
                with open(file, encoding="utf-8") as fr:
                    for line in fr:
                        tokens = jieba.lcut(line)  # tokenise the line with jieba
                        words += tokens            # flat list of all tokens (for frequency counting)
                        corpus.append(tokens)      # one token list per line (the training corpus)
            except Exception as e:
                print(e)
            if i