# 引用块内容 (quote-block artifact from the source article; commented out so the file parses)
#-*- coding:utf-8 -*-
import sys
from gensim.models import word2vec
import gensim
import codecs
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from nltk.tokenize import WordPunctTokenizer
import chardet
reload(sys)
sys.setdefaultencoding('utf-8')
# Test basic gensim word2vec functionality.
def testGensimFeatures():
print 'gensim'
# s1=model.most_similar(positive=["woman", "king"], negative=["man"], topn=1)
# print s1
#model.save(r'D:\PythonFiles\gensimData\test\text88.model')
#model = gensim.models.Word2Vec.load(r'D:\PythonFiles\gensimData\test\text8.model')
#model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\PythonFiles\gensimData\test\GoogleNews-vectors-negative300.bin',binary=True)
#model.wv.save_word2vec_format(r'D:\PythonFiles\gensimData\test\text88.model.bin', binary=True)
#model.wv.save_word2vec_format(r'D:\PythonFiles\gensimData\test\wiki.enae.model.bin', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\PythonFiles\gensimData\test\text8.model.bin', binary=True)
#print model.most_similar(['girl', 'father'], ['boy'], topn=3)
# print '以上比较出类似的数据'
# print '======================='
#print '以上比较两个单词的相似度是多少 %f' % model.similarity(["boy","woman"], ["girl",'father'])
model = word2vec.load_word2vec_format(u'/data1/yuhai/sameQuestion/GoogleNews-vectors-negative300.bin', binary=True)
#model = word2vec.load_word2vec_format(u'/data1/yuhai/sameQuestion/GoogleNews-vectors-negative300.bin', binary=True)
w1="boy"
w2="girl"
print model.similarity(w1, w2)
w3=['boy','father','dog']
w4=['girl','mother','you']
print model.n_similarity(w3,w4)
print '结束'
# if __name__ == '__main__':
# testGensimFeatures()
###1.1 Storing and loading models
您可以使用标准的gensim方法存储/加载模型:
model.save('/tmp/mymodel')
new_model = gensim.models.Word2Vec.load('/tmp/mymodel')
它内部使用pickle(python的pickle模块实现了基本的数据序列和反序列化。通过pickle模块的序列化操作我们能够将程序中运行的对象信息保存到文件中去,永久存储;通过pickle模块的反序列化操作,我们能够从文件中创建上一次程序保存的对象。),可选地将模型的内部大型NumPy矩阵从磁盘文件直接转换为虚拟内存,用于进程间内存共享。
此外,您可以加载由原始C工具创建的模型,使用其文本和二进制格式:
model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
#using gzipped/bz2 input works too, no need to unzip:
model = Word2Vec.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)