# Read the word-segmented reviews: each review becomes a list of tokens,
# and all of the per-review lists are collected in list_all.
from gensim.models import Word2Vec
import math
# Load the stop-word list, one entry per line of the file.  Entries keep
# their trailing newline, so consumers must strip() them before use.
stopwords = None  # bound up front so the name exists even if loading fails
with open('../data/stopwords.data', encoding='utf-8') as fr:
    stopwords = list(fr)
# Build the training corpus: one token list per review, collected in list_all.
#
# Each line of good_seg.txt holds one review whose tokens are separated by
# '/'.  Stop words are dropped by comparing whole tokens instead of doing a
# substring replace on the raw line, so a stop word can no longer eat part
# of a longer token that merely contains it.

# Normalise the stop words once.  '' is excluded because blank entries carry
# no meaning, and '/' because it is the token delimiter, not a token.
stop_set = {w.strip() for w in stopwords} - {'', '/'}

list_all = []
list_temp = []
with open('../data/good_seg.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        # Surrounding whitespace (including the trailing newline) is trimmed
        # from every token; empty tokens and stop words are discarded.
        tokens = (token.strip() for token in line.split('/'))
        list_temp = [t for t in tokens if t and t not in stop_set]
        list_all.append(list_temp)
print(list_all)
# Train a Word2Vec model on the tokenised corpus.
model = Word2Vec(list_all, window=5, min_count=1)

# Look up the 5 words most similar to "电影".  The query goes through
# model.wv because calling most_similar() on the model itself is deprecated
# and no longer available in gensim 4.
y2 = model.wv.most_similar("电影", topn=5)
for word, similarity in y2:
    print(word, similarity)
# Print every element of X, one per line.
# NOTE(review): `X` is not defined anywhere in the visible part of this
# script, so this loop raises NameError unless `X` is bound elsewhere
# (e.g. in an earlier interactive session) — TODO confirm what it should be.
for a in X:
    print(a)