使用K-means做词聚类需要用到word2vec做词向量化预处理。
# @Author : LinYimeng
代码传送门:
# -*- coding: utf-8 -*-
# @Author : LinYimeng
import multiprocessing
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import word2vec,Word2Vec
from gensim.models import KeyedVectors
# import logging
import os
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train a word2vec model on the corpus file 'one.txt'.
# LineSentence streams the file one line at a time; each line is treated as a
# sentence whose tokens are already separated by whitespace.
sentences = word2vec.LineSentence('one.txt')
# sg=0 selects the CBOW training algorithm; min_count=1 keeps every token.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- switch the keyword if the installed gensim is 4.x.
model = Word2Vec(
    sentences,
    size=256,
    min_count=1,
    window=5,
    sg=0,
    workers=multiprocessing.cpu_count(),
)

# Saving and loading the model.
model.save("w2v_model1.bin")
#model.wv.save_word2vec_format('w2v_model1.txt',binary = False)
# Rebind `model` to the loaded object; the original call discarded the result,
# so the saved file was never actually exercised.
model = Word2Vec.load("w2v_model1.bin")

# Find the words most similar to a given word.
# Query through `model.wv` (the KeyedVectors); calling similar_by_word on the
# model object itself is deprecated.
for neighbor in model.wv.similar_by_word('广告', topn=10):
    # Each item is a (word, similarity) tuple.
    print(neighbor)
#计算两个词的相似度:
p