#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Shiva Manne <manneshiva@gmail.com>
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module implements the word2vec family of algorithms, using highly optimized C routines,
data streaming and Pythonic interfaces.
The word2vec algorithms include skip-gram and CBOW models, using either
hierarchical softmax or negative sampling: `Tomas Mikolov et al: Efficient Estimation of Word Representations
in Vector Space <https://arxiv.org/pdf/1301.3781.pdf>`_, `Tomas Mikolov et al: Distributed Representations of Words
and Phrases and their Compositionality <https://arxiv.org/abs/1310.4546>`_.
Other embeddings
================
There are more ways to train word vectors in Gensim than just Word2Vec.
See also :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and
wrappers for :class:`~gensim.models.wrappers.varembed.VarEmbed` and :class:`~gensim.models.wrappers.wordrank.Wordrank`.
The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/
and extended with additional functionality and optimizations over the years.
For a tutorial on Gensim word2vec, with an interactive web app trained on GoogleNews,
visit https://rare-technologies.com/word2vec-tutorial/.
**Make sure you have a C compiler before installing Gensim, to use the optimized word2vec routines**
(70x speedup compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/).
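If you're unsure whether the compiled extension was picked up, you can check `FAST_VERSION`, which is -1 when
the slower pure-Python fallback defined in this module is in use:
.. sourcecode:: pycon
>>> from gensim.models.word2vec import FAST_VERSION
>>> optimized = FAST_VERSION > -1  # True when the optimized C routines are available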
Usage examples
==============
Initialize a model with e.g.:
.. sourcecode:: pycon
>>> from gensim.test.utils import common_texts, get_tmpfile
>>> from gensim.models import Word2Vec
>>>
>>> path = get_tmpfile("word2vec.model")
>>>
>>> model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
>>> model.save("word2vec.model")
The training is streamed, meaning `sentences` can be a generator, reading input data
from disk on-the-fly, without loading the entire corpus into RAM.
It also means you can continue training the model later:
.. sourcecode:: pycon
>>> model = Word2Vec.load("word2vec.model")
>>> model.train([["hello", "world"]], total_examples=1, epochs=1)
(0, 2)
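Because the input is consumed as a stream, you can also read sentences straight from disk instead of keeping
them in a list; a minimal sketch using :class:`~gensim.models.word2vec.LineSentence` (here `corpus.txt` is a
hypothetical file with one whitespace-separated sentence per line):
.. sourcecode:: pycon
>>> from gensim.models.word2vec import LineSentence
>>>
>>> streamed_model = Word2Vec(LineSentence('corpus.txt'), size=100, window=5, min_count=1, workers=4)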
The trained word vectors are stored in a :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `model.wv`:
.. sourcecode:: pycon
>>> vector = model.wv['computer'] # numpy vector of a word
The reason for separating the trained vectors into `KeyedVectors` is that if you don't
need the full model state any more (don't need to continue training), the state can be discarded,
resulting in a much smaller and faster object that can be mmapped for lightning
fast loading and sharing the vectors in RAM between processes:
.. sourcecode:: pycon
>>> from gensim.models import KeyedVectors
>>>
>>> path = get_tmpfile("wordvectors.kv")
>>>
>>> model.wv.save(path)
>>> wv = KeyedVectors.load(path, mmap='r')
>>> vector = wv['computer'] # numpy vector of a word
Gensim can also load word vectors in the "word2vec C format", as a
:class:`~gensim.models.keyedvectors.KeyedVectors` instance:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format
>>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format
It is impossible to continue training the vectors loaded from the C format because the hidden weights,
vocabulary frequencies and the binary tree are missing. To continue training, you'll need the
full :class:`~gensim.models.word2vec.Word2Vec` object state, as stored by :meth:`~gensim.models.word2vec.Word2Vec.save`,
not just the :class:`~gensim.models.keyedvectors.KeyedVectors`.
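With the full model you can keep training later, even on sentences containing new words, by first growing the
vocabulary; a sketch (`more_sentences` is a hypothetical extra corpus):
.. sourcecode:: pycon
>>> model = Word2Vec.load("word2vec.model")
>>> more_sentences = [["machine", "learning", "with", "word", "embeddings"]]
>>> model.build_vocab(more_sentences, update=True)
>>> training_stats = model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)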
You can perform various NLP word tasks with a trained model. Some of them
are already built in; you can see them in :mod:`gensim.models.keyedvectors`.
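For example, to query the nearest neighbours of a word or the similarity between two words (a small sketch;
the actual neighbours depend on the training corpus):
.. sourcecode:: pycon
>>> similar_words = model.wv.most_similar('computer', topn=5)  # list of (word, cosine similarity) pairs
>>> similarity = model.wv.similarity('computer', 'human')  # cosine similarity between two words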
If you're finished training a model (i.e. no more updates, only querying),
you can switch to the :class:`~gensim.models.keyedvectors.KeyedVectors` instance:
.. sourcecode:: pycon
>>> word_vectors = model.wv
>>> del model
to trim unneeded model state; this uses much less RAM and allows fast loading and memory sharing via mmap.
Note that there is a :mod:`gensim.models.phrases` module which lets you automatically
detect phrases longer than one word. Using phrases, you can learn a word2vec model
where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:
.. sourcecode:: pycon
>>> from gensim.test.utils import common_texts
>>> from gensim.models import Phrases
>>>
>>> bigram_transformer = Phrases(common_texts)
>>> model = Word2Vec(bigram_transformer[common_texts], min_count=1)
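If you don't need to update the phrase statistics any further, the detector can optionally be frozen into a
lighter-weight :class:`~gensim.models.phrases.Phraser` before transforming the corpus (a sketch using the same
toy corpus):
.. sourcecode:: pycon
>>> from gensim.models.phrases import Phraser
>>>
>>> bigram_phraser = Phraser(bigram_transformer)  # smaller and faster, but cannot learn new phrases
>>> model = Word2Vec(bigram_phraser[common_texts], min_count=1)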
"""
from __future__ import division # py3 "true division"
import logging
import sys
import os
import heapq
from timeit import default_timer
from copy import deepcopy
from collections import defaultdict
import threading
import itertools
import warnings
from gensim.utils import keep_vocab_item, call_on_class_only
from gensim.models.keyedvectors import Vocab, Word2VecKeyedVectors
from gensim.models.base_any2vec import BaseWordEmbeddingsModel
try:
from queue import Queue, Empty
except ImportError:
from Queue import Queue, Empty
from numpy import exp, dot, zeros, random, dtype, float32 as REAL,\
uint32, seterr, array, uint8, vstack, fromstring, sqrt,\
empty, sum as np_sum, ones, logaddexp, log, outer
from scipy.special import expit
from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.utils import deprecated
from six import iteritems, itervalues, string_types
from six.moves import range
logger = logging.getLogger(__name__)
try:
from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
except ImportError:
# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000
def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
"""Update skip-gram model by training on a sequence of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
Warnings
--------
This is the non-optimized, pure Python version. If you have a C compiler, Gensim
will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.
Parameters
----------
model : :class:`~gensim.models.word2vec.Word2Vec`
The Word2Vec model instance to train.
sentences : iterable of list of str
The corpus used to train the model.
alpha : float
The learning rate.
work : object, optional
Unused.
compute_loss : bool, optional
Whether or not the training loss should be computed in this batch.
Returns
-------
int
Number of words in the vocabulary actually used for training (i.e. words that already existed in the vocabulary
and were not discarded by the downsampling of frequent words).
"""
result = 0
for sentence in sentences:
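# frequent-word downsampling: each vocab entry carries a precomputed `sample_int` threshold,
# so a word is kept with probability sample_int / 2**32 (see the `sample` model parameter)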
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
start = max(0, pos - model.window + reduced_window)
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
# don't train on the `word` itself
if pos2 != pos:
train_sg_pair(
model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss
)
result += len(word_vocabs)
return result
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False):
"""Update CBOW model by training on a sequence of sentences.
Called internally from :meth:`~gensim.models.word2vec.Word2Vec.train`.
Warnings
--------
This is the non-optimized, pure Python version. If you have a C compiler, Gensim
will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.
Parameters
----------
model : :class:`~gensim.models.word2vec.Word2Vec`
The Word2Vec model instance to train.
sentences : iterable of list of str
The corpus used to train the model.
alpha : float
The learning rate.
work : object, optional
Unused.
neu1 : object, optional
Unused.
compute_loss : bool, optional
Whether or not the training loss should be computed in this batch.
Returns
-------
int
Number of words in the vocabulary actually used for training (i.e. words that already existed in the vocabulary
and were not discarded by the downsampling of frequent words).
"""
result = 0
for sentence in sentences:
word_vocabs = [
model.wv.vocab[w] for w in sentence if w in model.wv.vocab
and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32
]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
start = max(0, pos - model.window + reduced_window)
window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x vector_size
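# with `cbow_mean` set, the summed context is averaged, i.e. the "mean" flavour of CBOW
# from the original word2vec; otherwise the plain sum of context vectors is used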
if word2_indices and model.cbow_mean:
l1 /= len(word2_indices)
train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
result += len(word_vocabs)
return result
def score_sentence_sg(model, sentence, work=None):
"""Obtain likelihood score for a single sentence in a fitted skip-gram representation.
Notes
-----
This is the non-optimized, pure Python version. If you have a C compiler, Gensim
will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.
Parameters
----------
model : :class:`~gensim.models.word2vec.Word2Vec`
The trained model. It **MUST** have been trained using hierarchical softmax and the skip-gram algorithm.
sentence : list of str
The words comprising the sentence to be scored.
work : object, optional
Unused. For interface compatibility only.
Returns
-------
float
The probability assigned to this sentence by the Skip-Gram model.
"""
log_prob_sentence = 0.0
if model.negative:
raise RuntimeError("scoring is only available for HS=True")
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab]
for pos, word in enumerate(word_vocabs):
if word is None:
continue # OOV word in the input sentence => skip
# now go over all words from the window, predicting each one in turn
start = max(0, pos - model.window)
for pos2, word2 in enumerate(word_vocabs[start: pos + model.window + 1], start):
# don't train on OOV words and on the `word` itself
if word2 is not None and pos2 != pos:
log_prob_sentence += score_sg_pair(model, word, word2)
return log_prob_sentence
def score_sentence_cbow(model, sentence, work=None, neu1=None):
"""Obtain likelihood score for a single sentence in a fitted CBOW representation.
Notes
-----
This is the non-optimized, pure Python version. If you have a C compiler, Gensim
will use an optimized code path from :mod:`gensim.models.word2vec_inner` instead.
Parameters
----------
model : :class:`~gensim.models.word2vec.Word2Vec`
The trained model. It **MUST** have been trained using hierarchical softmax and the CBOW algorithm.
sentence : list of str
The words comprising the sentence to be scored.
work : object, optional
Unused. For interface compatibility only.
neu1 : object, optional
Unused. For interface compatibility only.
Returns
-------
float
The probability assigned to this sentence by the CBOW model.
"""
log_prob_sentence = 0.0
if model.negative:
raise RuntimeError("scoring is only available for HS=True")
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab]
for pos, word in enumerate(word_vocabs):
if word is None:
continue # OOV word in the input sentence => skip
start = max(0, pos - model.window)
window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start)
word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x layer1_size
if word2_indices and model.cbow_mean:
l1 /= len(word2_indices)
log_prob_sentence += score_cbow_pair(model, word, l1)
return log_prob_sentence
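# Note: these scoring fallbacks are invoked via Word2Vec.score(), which is only meaningful for models
# trained with hierarchical softmax (hs=1, negative=0), as enforced by the checks above.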
try:
from gensim.models.word2vec_corpusfile import train_epoch_sg, train_epoch_cbow, CORPUSFILE_VERSION
except ImportError:
# file-based word2vec is not supported
CORPUSFILE_VERSION = -1
def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss):
raise RuntimeError("Training with corpus_file argument is not supported")
def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
_work, _neu1, compute_loss):
raise RuntimeError("Training with corpus_file argument is not supported")
def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True,
context_vectors=None, context_locks=None, compute_loss=False, is_ft=False):
"""Train the passed model instance on a word and its context, using the Skip-gram algorithm.
Parameters
----------
model : :class:`~gensim.models.word2vec.Word2Vec`
The model to be trained.
word : str
The label (predicted) word.
context_index : int or list of int
The vocabulary index of the context (input) word or, when `is_ft` is True, the index of the context word
followed by the indices of its ngrams.
alpha : float
Learning rate.
learn_vectors : bool, optional
Whether the vectors should be updated.
learn_hidden : bool, optional
Whether the weights of the hidden layer should be updated.
context_vectors : list of list of float, optional
Vector representations of the words in the context. If None, these will be retrieved from the model.
context_locks : list of float, optional
The lock factors for each word in the context.
compute_loss : bool, optional
Whether or not the training loss should be computed.
is_ft : bool, optional
If True, weights will be computed using `model.wv.syn0_vocab` and `model.wv.syn0_ngrams`
instead of `model.wv.syn0`.
Returns
-------
numpy.ndarray
Error vector to be back-propagated.
"""
if context_vectors is None:
if is_ft:
context_vectors_vocab = model.wv.syn0_vocab
context_vectors_ngrams = model.wv.syn0_ngrams
else:
context_vectors = model.wv.syn0
if context_locks is None:
if is_ft:
context_locks_vocab = model.syn0_vocab_lockf
context_locks_ngrams = model.syn0_ngrams_lockf
else:
context_locks = model.syn0_lockf
if word not in model.wv.vocab:
return
predict_word = model.wv.vocab[word] # target word (NN output)
if is_ft:
l1_vocab = context_vectors_vocab[context_index[0]]
l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0)
if context_index:
l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index)
else:
l1 = context_vectors[context_index] # input word (NN input/projection layer)
lock_factor = context_locks[context_index]
neu1e = zeros(l1.shape)
if model.hs:
# work on the entire tree at once, to push as much work into numpy's C routines as possible (performance)
l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size
prod_term = dot(l1, l2a.T)
fa = expit(prod_term) # propagate hidden -> output
ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate
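# (1 - code) is the target label for each inner node's sigmoid unit, so (1 - code - fa)
# is the per-node prediction error (label minus predicted probability)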
if learn_hidden:
model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output
neu1e += dot(ga, l2a) # save error
# loss component corresponding to hierarchical softmax
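# each element of prod_term corresponds to one inner node on the word's Huffman path;
# sgn selects the sign according to that node's code bit, and the per-node negative
# log-sigmoid terms are summed into the model's running loss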
if compute_loss:
sgn = (-1.0) ** predict_word.code # `ch` function, 0 -> 1, 1 -> -1
lprob = -log(expit(-sgn * prod_term))
model.running_training_loss += sum(lprob)