def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
"""
Parameters
----------
corpus: iterable of list of (int, float)
A list of documents in the BoW format.
num_features : int, optional
Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
num_terms : int, optional
Alias for `num_features`, you can use either.
num_docs : int, optional
Number of documents in `corpus`. Will be calculated if not provided.
num_nnz : int, optional
Number of non-zero elements in `corpus`. Will be calculated if not provided.
num_best : int, optional
If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
Otherwise, return a full vector with one float for every document in the index.
chunksize : int, optional
Size of query chunks. Used internally when the query is an entire corpus.
dtype : numpy.dtype, optional
Data type of the internal matrix.
maintain_sparsity : bool, optional
Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?
"""
self.num_best = num_best
self.normalize = True
self.chunksize = chunksize
self.maintain_sparsity = maintain_sparsity
if corpus is not None:
logger.info("creating sparse index")
# iterate over input corpus, populating the sparse index matrix
try:
# use the more efficient corpus generation version, if the input
# `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
logger.debug("using efficient sparse index creation")
except AttributeError:
# no MmCorpus, use the slower version (or maybe user supplied the
# num_* params in constructor)
pass
if num_features is not None:
# num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
num_terms = num_features
if num_terms is None:
raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
(matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
matutils.unitvec(v)) for v in corpus)
self.index = matutils.corpus2csc(
corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
dtype=dtype, printprogress=10000
).T
# convert to Compressed Sparse Row for efficient row slicing and multiplications
self.index = self.index.tocsr() # currently no-op, CSC.T is already CSR
logger.info("created %r", self.index)
corpus一般是计算得来的[[(索引1, float), (索引2, float),...],[(索引0, float), (索引2, float),...]...]
num_features一般是一共有多少词。
其他参数都能计算出来。
再看一下matutils.corpus2csc。
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
"""Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`,
with documents as columns.
Notes
-----
If the number of terms, documents and non-zero elements is known, you can pass
them here as parameters and a (much) more memory efficient code path will be taken.
Parameters
----------
corpus : iterable of iterable of (int, number)
Input corpus in BoW format
num_terms : int, optional
Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
dtype : data-type, optional
Data type of output CSC matrix.
num_docs : int, optional
Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored.
num_nnz : int, optional
Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored.
printprogress : int, optional
Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging.
Returns
-------
scipy.sparse.csc_matrix
`corpus` converted into a sparse CSC matrix.
See Also
--------
:class:`~gensim.matutils.Sparse2Corpus`
Convert sparse format to Gensim corpus format.
"""
try:
# if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
# (as is the case with MmCorpus for example), we can use a more efficient code path
if num_terms is None:
num_terms = corpus.num_terms
if num_docs is None:
num_docs = corpus.num_docs
if num_nnz is None:
num_nnz = corpus.num_nnz
except AttributeError:
pass # not a MmCorpus...
if printprogress:
logger.info("creating sparse matrix from corpus")
if num_terms is not None and num_docs is not None and num_nnz is not None:
# faster and much more memory-friendly version of creating the sparse csc
posnow, indptr = 0, [0]
indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer
data = np.empty((num_nnz,), dtype=dtype)
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
logger.info("PROGRESS: at document #%i/%i", docno, num_docs)
posnext = posnow + len(doc)
indices[posnow: posnext] = [feature_id for feature_id, _ in doc]
data[posnow: posnext] = [feature_weight for _, feature_weight in doc]
indptr.append(posnext)
posnow = posnext
assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros"
result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
else:
# slower version; determine the sparse matrix parameters during iteration
num_nnz, data, indices, indptr = 0, [], [], [0]
for docno, doc in enumerate(corpus):
if printprogress and docno % printprogress == 0:
logger.info("PROGRESS: at document #%i", docno)
indices.extend(feature_id for feature_id, _ in doc)
data.extend(feature_weight for _, feature_weight in doc)
num_nnz += len(doc)
indptr.append(num_nnz)
if num_terms is None:
num_terms = max(indices) + 1 if indices else 0
num_docs = len(indptr) - 1
# now num_docs, num_terms and num_nnz contain the correct values
data = np.asarray(data, dtype=dtype)
indices = np.asarray(indices)
result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
return result
这里主要是把稀疏矩阵压缩成密集化的,一开始的corpus会转化为sen1_len + sen2_len + ....的一维数组,然后根据稀疏转密集的csc_matrix配置参数,就可以转化成feature_len*sentence_count的矩阵了。
参见https://blog.csdn.net/dongfangxiaozi_/article/details/71319971
>>> indptr = np.array([0, 2, 3, 6])
>>> indices = np.array([0, 2, 2, 0, 1, 2])
>>> data = np.array([1, 2, 3, 4, 5, 6])
>>> csc_matrix((data, indices, indptr), shape=(3, 3)).toarray()
array([[1, 0, 4],
[0, 0, 5],
[2, 3, 6]])
indptr表示的是到某段为止是第几条数据,比如indices[3:6]存储的是第三列的0,1,2行,数据是4,5,6。
有了这个矩阵,每次查询的时候,通过先计算query的bow列表,然后进行余弦相似度计算。
def get_similarities(self, query):
"""Get similarity between `query` and this index.
Warnings
--------
Do not use this function directly; use the `self[query]` syntax instead.
Parameters
----------
query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
Document or collection of documents.
Return
------
:class:`numpy.ndarray`
Similarity matrix (if maintain_sparsity=False) **OR**
:class:`scipy.sparse.csc`
otherwise
"""
is_corpus, query = utils.is_corpus(query)
if is_corpus:
query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
else:
if scipy.sparse.issparse(query):
query = query.T # convert documents=rows to documents=columns
elif isinstance(query, numpy.ndarray):
if query.ndim == 1:
query.shape = (1, len(query))
query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
else:
# default case: query is a single vector, in sparse gensim format
query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)
# compute cosine similarity against every other document in the collection
result = self.index * query.tocsc() # N x T * T x C = N x C
if result.shape[1] == 1 and not is_corpus:
# for queries of one document, return a 1d array
result = result.toarray().flatten()
elif self.maintain_sparsity:
# avoid converting to dense array if maintaining sparsity
result = result.T
else:
# otherwise, return a 2d matrix (#queries x #index)
result = result.toarray().T
return result
核心就是这一句:
result = self.index * query.tocsc() # N x T * T x C = N x C
index是我们刚刚建立的索引矩阵,N代表语料中有多少个句子,T代表语料中的单词数量,C的代表查询的句子条数。
计算结果:总句子数*查询句子数。矩阵点乘正好是余弦相似度。到此,最开始的sim就是相似度矩阵咯。