# 引用块内容 (quote-block artifact from the source article; commented out so the file parses)
#-*- coding:utf-8 -*-
import sys
from gensim.models import word2vec
import gensim
import codecs
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from nltk.tokenize import WordPunctTokenizer
import chardet
reload(sys)
sys.setdefaultencoding('utf-8')
# Test basic gensim word2vec functionality.
def testGensimFeatures():
print 'gensim'
# s1=model.most_similar(positive=["woman", "king"], negative=["man"], topn=1)
# print s1
#model.save(r'D:\PythonFiles\gensimData\test\text88.model')
#model = gensim.models.Word2Vec.load(r'D:\PythonFiles\gensimData\test\text8.model')
#model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\PythonFiles\gensimData\test\GoogleNews-vectors-negative300.bin',binary=True)
#model.wv.save_word2vec_format(r'D:\PythonFiles\gensimData\test\text88.model.bin', binary=True)
#model.wv.save_word2vec_format(r'D:\PythonFiles\gensimData\test\wiki.enae.model.bin', binary=True)
#model = gensim.models.KeyedVectors.load_word2vec_format(r'D:\PythonFiles\gensimData\test\text8.model.bin', binary=True)
#print model.most_similar(['girl', 'father'], ['boy'], topn=3)
# print '以上比较出类似的数据'
# print '======================='
#print '以上比较两个单词的相似度是多少 %f' % model.similarity(["boy","woman"], ["girl",'father'])
model = word2vec.load_word2vec_format(u'/data1/yuhai/sameQuestion/GoogleNews-vectors-negative300.bin', binary=True)
#model = word2vec.load_word2vec_format(u'/data1/yuhai/sameQuestion/GoogleNews-vectors-negative300.bin', binary=True)
w1="boy"
w2="girl"
print model.similarity(w1, w2)
w3=['boy','father','dog']
w4=['girl','mother','you']
print model.n_similarity(w3,w4)
print '结束'
# if __name__ == '__main__':
# testGensimFeatures()
###1.1 Storing and loading models
您可以使用标准的gensim方法存储/加载模型:
model.save('/tmp/mymodel')
new_model = gensim.models.Word2Vec.load('/tmp/mymodel')
它内部使用pickle(python的pickle模块实现了基本的数据序列和反序列化。通过pickle模块的序列化操作我们能够将程序中运行的对象信息保存到文件中去,永久存储;通过pickle模块的反序列化操作,我们能够从文件中创建上一次程序保存的对象。),可选地将模型的内部大型NumPy矩阵从磁盘文件直接转换为虚拟内存,用于进程间内存共享。
此外,您可以加载由原始C工具创建的模型,使用其文本和二进制格式:
model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
#using gzipped/bz2 input works too, no need to unzip:
model = Word2Vec.load_word2vec_format('/tmp/vectors.bin.gz', binary=True)