使用 gensim 训练中文词向量,并计算词语之间的相似度。
输入:语料库(txt 文件,代码中读取的是 output.txt;需预先分词,每行一句,词语之间以空格分隔)。
输出:词语之间的余弦相似度。
实现代码:
# -*- coding: utf-8 -*-
import logging
from gensim import models
from gensim.models import word2vec
def main():
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.LineSentence("output.txt")
model = word2vec.Word2Vec(sentences, size=250)
# 保存模型,供以后使用
model.save("word2vec.model")
# 模型读取
# model = word2vec.Word2Vec.load("your_model_name")
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
model = models.Word2Vec.load('word2vec.model')
print("提供 3 种测试模式\n")
print("输入一个词,则去寻找前一百个该词的相似词&