先上代码,解释在后:
代码
注意:本代码适用于 gensim 4.0.0 及以上版本,低于此版本的代码请参考:https://stackoverflow.com/questions/45981305/convert-python-dictionary-to-word2vec-object
import numpy as np
from gensim import utils
from numpy import float32 as REAL
import gensim # '4.0.1'
def my_save_word2vec_format(file_name, word2veckeyedVector, binary=True):
    """Write the vectors of a keyed-vectors object in word2vec format.

    The output starts with a header line ``"<count> <vector_size>\\n"`` and is
    followed by one record per key, in ``index_to_key`` order (no sorting is
    done here).

    Args:
        file_name: Destination path; opened in ``'wb'`` mode via ``utils.open``.
        word2veckeyedVector: Object exposing ``index_to_key``, ``vector_size``
            and ``get_vector(key)``.
        binary: If true (default), each record is the UTF-8 key, one space,
            then the raw bytes of the vector cast to float32. Otherwise each
            record is a text line: the key followed by the space-separated
            vector components.
    """
    keys = word2veckeyedVector.index_to_key
    total_vec = len(keys)  # total number of words
    vector_size = word2veckeyedVector.vector_size
    with utils.open(file_name, 'wb') as fout:
        print(total_vec, vector_size)
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        for key in keys:
            value = word2veckeyedVector.get_vector(key)
            if binary:
                value = value.astype(REAL)
                fout.write(utils.to_utf8(key) + b" " + value.tobytes())
            else:
                # str() rather than repr(): on NumPy >= 2 the repr of a scalar
                # is e.g. "np.float32(0.1)", which is not a plain number and
                # would corrupt the text format. str() yields "0.1" on both
                # NumPy 1.x and 2.x.
                components = ' '.join(str(val) for val in value)
                fout.write(utils.to_utf8("%s %s\n" % (key, components)))
if __name__ == '__main__':
    # Step 1: build a dict mapping each word to its vector. This stands in for
    # a pre-trained model: key = word, value = word vector.
    d = {
        'a': np.random.randn(300),
        'b': np.random.randn(300),
    }

    # Step 2: create an empty Word2VecKeyedVectors and load the dict into it.
    # vector_size must equal the length of the vectors above.
    m = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=300)
    m.add_vectors(list(d), list(d.values()))

    # Step 3: export the keyed vectors to a standard word2vec-format file.
    my_save_word2vec_format('train.bin', m)

    # ------------------------------------------------------------------
    # Done. Everything below only checks that the file can be read back.
    # ------------------------------------------------------------------

    # Loading option 1: through the keyed-vectors class.
    m2 = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format(
        'train.bin', binary=True
    )
    print(m2.vectors == m.vectors)

    # Loading option 2: through the ``wv`` attribute of a Word2Vec instance.
    # NOTE(review): load_word2vec_format is also callable on the class (see
    # option 1), so this presumably returns keyed vectors rather than a
    # trainable Word2Vec model - confirm against the gensim docs.
    from gensim.models import Word2Vec
    word2vec_model = Word2Vec().wv.load_word2vec_format('train.bin', binary=True)
解释
大致的思路:
- dict 文件目前并不能直接导入到模型中,现有的流程比较繁琐,需要依次经过以下转换:
  python 字典 -> Word2VecKeyedVectors -> Word2Vec
Word2VecKeyedVectors 与 Word2Vec 的区别是:
- Word2VecKeyedVectors 不能继续增量训练,但是执行效率会超级高;
- Word2Vec 常用 Word2Vec.wv.xxxxx 这种函数,可以接着增量训练,但是查询词向量等效率要低一些。