# 1. Training the word2vec model
import ast
import os

import jieba
import jieba.analyse
import numpy as np
from gensim.models.word2vec import Word2Vec
# Build a character-level corpus from the LCQMC sentence pairs plus an extra
# free-text file, then train a skip-gram word2vec model over single characters.
dirname = './data/LCQMC'
sentence = []  # every raw sentence collected for training
words = []     # tokenized (character-level) sentences fed to Word2Vec

# Each LCQMC line is a dict literal with 'sentence1'/'sentence2' keys.
for filename in os.listdir(dirname):
    with open(os.path.join(dirname, filename), 'r', encoding='utf-8') as lcqmc:
        for line in lcqmc:
            # ast.literal_eval safely parses the dict literal; unlike eval()
            # it cannot execute arbitrary code embedded in the data file.
            record = ast.literal_eval(line)
            sentence.append(record['sentence1'])
            sentence.append(record['sentence2'])

# Extra corpus: one sentence per line.
with open('./data/data_text', 'r', encoding='utf-8') as f:
    for line in f:
        # strip the trailing newline so '\n' is not trained as a token
        sentence.append(line.strip())

print("data_text size:", len(sentence))  # sentence holds the whole corpus

# Character-level tokenization: each sentence becomes a list of characters.
# (The old code joined characters with spaces into one string, which made
# gensim also train a vector for the artificial ' ' token.)
for string in sentence:
    words.append(list(string))

# sg=1 -> skip-gram; size=128 -> dimensionality of each character vector.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; this file uses
# the pre-4.0 API (see the `model[...]` lookup below) — confirm the installed
# gensim version before upgrading.
model = Word2Vec(words, size=128, window=4, min_count=1, sg=1, workers=2)
model.save('./data/word2vecModel')  # persist the trained model under ./data
# 2. Loading and using the trained word2vec model
# Reload the persisted word2vec model for vector lookups below.
# NOTE(review): this path differs from the save path above
# ('./data/word2vecModel') — presumably run from one directory up; verify.
model = Word2Vec.load('./gensim_word2vec/data/word2vecModel')
def wordToVector(words, max_len=30, dim=128):
    """Convert tokenized sentences into fixed-length sequences of vectors.

    Each sentence is truncated/padded to ``max_len`` tokens. Positions inside
    the sentence are looked up in the global word2vec ``model``; padding
    positions receive ``dim``-dimensional zero vectors (``dim`` must match the
    trained vector size, 128 for the model above).

    :param words: iterable of token sequences (e.g. lists of characters)
    :param max_len: fixed number of vectors emitted per sentence (default 30)
    :param dim: dimensionality of the zero padding vectors (default 128)
    :return: list of lists of numpy arrays, shape [len(words)][max_len][dim]
    """
    result = []
    for senarr in words:
        temp = []
        for i in range(max_len):
            if i < len(senarr):
                # gensim < 4.0 lookup API; raises KeyError for an
                # out-of-vocabulary token (min_count=1 above makes every
                # training character known).
                word_vec = model[senarr[i]]
            else:
                word_vec = [0] * dim  # zero padding past the sentence end
            temp.append(np.asarray(word_vec))
        result.append(temp)
    return result