word2vec notes
Skip-gram and CBOW fundamentals:
https://www.cnblogs.com/pinard/p/7243513.html
https://zhuanlan.zhihu.com/p/35074402
https://blog.csdn.net/liuyuemaicha/article/details/52611219 (includes formula derivations)
Typical pairings of training method and similarity measure:
- CBOW + cosine similarity: captures substitutability (words that can replace each other)
- skip-gram + mutual information: captures co-occurrence (words that tend to appear together)
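A minimal cosine query in gensim (the model path and query words are placeholders, not from the original notes):
import numpy as np
from gensim.models import Word2Vec

model = Word2Vec.load('word2vec_wx')  # placeholder model path
print(model.wv.most_similar('apple', topn=10))  # built-in cosine ranking over the vocabulary

# the same cosine computed by hand for one pair
v1, v2 = model.wv['apple'], model.wv['banana']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))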
Hierarchical softmax, negative sampling, and plain softmax:
these three have different loss functions and are mutually exclusive training schemes.
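For reference, with center word c, output word o, input vector $v_c$, and output vectors $u_w$ (standard forms, consistent with the derivations linked above):
- plain softmax: $L = -u_o^\top v_c + \log\sum_{w\in V}\exp(u_w^\top v_c)$
- hierarchical softmax, summed over the Huffman path to o with code bits $d_j\in\{0,1\}$, inner-node vectors $\theta_j$, and $t_j=\theta_j^\top v_c$: $L = \sum_j [\log(1+e^{-t_j}) + d_j t_j]$ (exactly the logaddexp(0, -dot) + code*dot term in the code below)
- negative sampling with K noise words $w_k$: $L = -\log\sigma(u_o^\top v_c) - \sum_{k=1}^{K}\log\sigma(-u_{w_k}^\top v_c)$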
- Hierarchical softmax: https://www.cnblogs.com/pinard/p/7243513.html
When training with gensim, set:
# hierarchical softmax: hs=1, negative=0 (gensim 3.x API; 4.x renames size to vector_size)
import multiprocessing
from gensim.models.word2vec import Word2Vec, LineSentence

wikimodel = Word2Vec(LineSentence(outfile_name), size=vec_size, sg=1, hs=1, negative=0, window=5, min_count=5,
                     workers=multiprocessing.cpu_count(), compute_loss=True)
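Since compute_loss=True is set, the running training loss can be read back after training:
print(wikimodel.get_latest_training_loss())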
- Negative sampling: https://www.cnblogs.com/pinard/p/7249903.html
When training with gensim, set (negative defaults to 5, so negative sampling is already on; just leave hs=0):
# negative sampling
wikimodel = Word2Vec(LineSentence(outfile_name), size=vec_size, sg=1, window=5, min_count=5,
                     workers=multiprocessing.cpu_count(), compute_loss=True)
- Some notes
Hierarchical softmax code:
https://github.com/BUAAQingYuan/fasttext/blob/master/huffman_tree.py
Transition-probability computation:
https://spaces.ac.cn/archives/4368
Hierarchical softmax, method 1:
import numpy as np
import gensim
from collections import Counter

model = gensim.models.word2vec.Word2Vec.load('word2vec_wx')

def predict_proba(oword, iword):
    # log p(oword | iword) under hierarchical softmax: walk oword's Huffman
    # path; model.syn1 holds the inner-node vectors (requires hs=1)
    iword_vec = model.wv[iword]
    oword = model.wv.vocab[oword]
    oword_l = model.syn1[oword.point].T
    dot = np.dot(iword_vec, oword_l)
    # sum of log sigma(+/- dot) along the path: logaddexp(0, -dot) + code*dot
    lprob = -sum(np.logaddexp(0, -dot) + oword.code * dot)
    return lprob

def relative_words(word):
    # rank every vocabulary word by log p(w | word) - log count(w);
    # the log-count term penalizes frequent words
    r = {i: predict_proba(i, word) - np.log(j.count) for i, j in model.wv.vocab.items()}
    return Counter(r).most_common()
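Usage sketch (the model must have been trained with hs=1, otherwise model.syn1 is absent; the query word is a placeholder):
for w, score in relative_words('apple')[:10]:
    print(w, score)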
Hierarchical softmax, method 2 (gensim's score_sg_pair):
from copy import deepcopy
from numpy import dot, logaddexp

def score_sg_pair(model, word, word2):
    """Score the trained Skip-gram model on a pair of words.

    Parameters
    ----------
    model : :class:`~gensim.models.word2vec.Word2Vec`
        The trained model.
    word : :class:`~gensim.models.keyedvectors.Vocab`
        Vocabulary representation of the first word.
    word2 : :class:`~gensim.models.keyedvectors.Vocab`
        Vocabulary representation of the second word.

    Returns
    -------
    float
        Logarithm of the sum of exponentiations of input words.

    """
    l1 = model.wv.syn0[word2.index]
    l2a = deepcopy(model.syn1[word.point])  # 2d matrix, codelen x layer1_size
    sgn = (-1.0) ** word.code  # ch function: code 0 -> +1, 1 -> -1
    lprob = -logaddexp(0, -sgn * dot(l1, l2a.T))
    return sum(lprob)
from collections import Counter

def relative_words(word):
    in_vocab = model.wv.vocab[word]
    # score_sg_pair expects Vocab objects, not raw strings
    r = {w: score_sg_pair(model, v, in_vocab) - np.log(v.count)
         for w, v in model.wv.vocab.items()}
    return Counter(r).most_common()
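Note that methods 1 and 2 compute the same score: sgn = (-1)**code maps code 0 -> +1 and 1 -> -1, so -logaddexp(0, -sgn*dot) equals -(logaddexp(0, -dot) + code*dot), the exact term summed in method 1.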
CBOW + negative sampling, method 3 (gensim's Word2Vec.predict_output_word, source quoted for reference):
def predict_output_word(self, context_words_list, topn=10):
    """Get the probability distribution of the center word given context words.

    Parameters
    ----------
    context_words_list : list of str
        List of context words.
    topn : int, optional
        Return `topn` words and their probabilities.

    Returns
    -------
    list of (str, float)
        `topn` length list of tuples of (word, probability).

    """
    if not self.negative:
        raise RuntimeError(
            "We have currently only implemented predict_output_word for the negative sampling scheme, "
            "so you need to have run word2vec with negative > 0 for this to work."
        )

    if not hasattr(self.wv, 'vectors') or not hasattr(self.trainables, 'syn1neg'):
        raise RuntimeError("Parameters required for predicting the output words not found.")

    word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab]
    if not word_vocabs:
        warnings.warn("All the input context words are out-of-vocabulary for the current model.")
        return None

    word2_indices = [word.index for word in word_vocabs]

    l1 = np_sum(self.wv.vectors[word2_indices], axis=0)
    if word2_indices and self.cbow_mean:
        l1 /= len(word2_indices)

    # propagate hidden -> output and take softmax to get probabilities
    prob_values = exp(dot(l1, self.trainables.syn1neg.T))
    prob_values /= sum(prob_values)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    # returning the most probable output words with their probabilities
    return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices]
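Ranking the predict_output_word candidates with the same log-frequency penalty as above: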
def relative_words(word_list):
    candidate_list = w2vModel.predict_output_word(word_list, topn=50)
    candidate_words = {w for w, prob in candidate_list}
    # frequency penalty, applied only to words on the candidate list
    r = {word: -np.log(vocab_class.count)
         for word, vocab_class in w2vModel.wv.vocab.items() if word in candidate_words}
    for w, prob in candidate_list:
        r[w] = prob + r.get(w, 0)
    return Counter(r).most_common()
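Usage sketch (w2vModel must have been trained with negative > 0; the context words are placeholders):
print(w2vModel.predict_output_word(['north', 'wind'], topn=10))
print(relative_words(['north', 'wind'])[:10])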