使用gensim的LsiModel对tfidf向量进行lsi化,代码是:
corpus_lsi = model_lsi[corpus_tfidf]
当连续两次调用的时候,出现了断言报错(AssertionError: decomposition not initialized yet),报错位置在lsimodel的 __getitem__ 方法。
于是查看gensim的lsimodel.py源文件:
def __getitem__(self, bow, scaled=False, chunksize=512):
    """Get the latent representation for `bow`.

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW representation.
    scaled : bool, optional
        If True - topics will be scaled by the inverse of singular values
        (`self.projection.s`). Only applied on the non-chunked path: the value
        is not forwarded to `self._apply`.
    chunksize : int, optional
        Number of documents to be used in each applying chunk. A falsy value
        disables chunking, so a corpus is transformed in one multiplication.

    Returns
    -------
    list of (int, float)
        Latent representation of topics in BoW format for document **OR**
    :class:`gensim.matutils.Dense2Corpus`
        Latent representation of corpus in BoW format if `bow` is corpus and
        `chunksize` is falsy **OR**
    object
        Whatever `self._apply(bow, chunksize=chunksize)` returns, if `bow` is
        corpus and `chunksize` is truthy.

    Raises
    ------
    AssertionError
        If `self.projection.u` is None, i.e. there is no decomposition to
        project onto yet.

    """
    # Fail early with a readable message; without this check the `.dtype`
    # access below would fail on None with a less helpful error.
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        # NOTE(review): `scaled` is not passed on here, so it has no effect on
        # this path unless `_apply` handles it some other way -- confirm.
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        # treat a single document as a one-document corpus
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
    # Alternatives that were tried (dense * dense via `corpus2dense` + `np.dot`, or
    # per-document `u.take(...)` indexing) ran at about the same speed, and the dense
    # variant consumes more memory.
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    # (x^T * u)^T = u^T * x; one column per input document
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T

    if scaled:
        # s^-1 * u^T * x. Scale while `topic_dist` is still 2-D and use a column
        # vector, so each topic row is divided by its own singular value for any
        # number of documents. A 1-D factor would broadcast along the document axis.
        inv_s = 1.0 / self.projection.s[:self.num_topics]
        topic_dist = inv_s.reshape(-1, 1) * topic_dist

    # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]: convert back from matrix into a 1d vec first
        topic_dist = topic_dist.reshape(-1)
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
发现有这行代码:
assert self.projection.u is not None, "decomposition not initialized yet"
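先mark一下,有空再来想怎么解决哈哈。这行代码看起来是可以删掉的,不过要注意:它检查的是 self.projection.u 是否已经初始化,如果 u 是 None,删掉断言之后,后面的 self.projection.u.dtype 一样会报错,所以真正要解决的是模型的分解为什么没有初始化。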