# Similarity query pipeline: BoW -> TF-IDF -> LSI -> cosine-similarity index.
# `texts` is expected to be a list of tokenised documents defined earlier.

# Build the vocabulary and convert every document to (token id, count) pairs.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Fit TF-IDF on the BoW corpus. Indexing the model with a corpus goes through
# TfidfModel.__getitem__ and returns a transformed corpus wrapper.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Train a 2-topic LSI model on the TF-IDF vectors.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
print(corpus_tfidf)
corpus_lsi = lsi[corpus_tfidf]

# Index the same TF-IDF -> LSI vectors the model was trained on. The original
# indexed lsi[corpus] (raw counts), a different input space from training.
index = similarities.MatrixSimilarity(corpus_lsi)
lsi.print_topics(2)

# Read the query document; the `with` block closes the file on exit, so no
# explicit close() is needed (the original `t1.close` never called it anyway).
with open('../papers/2.txt') as t1:
    content1 = t1.read()
seg_list_after1 = content1.split(" ")

# The query must go through the same chain as the indexed documents:
# BoW -> TF-IDF -> LSI.
bow_1 = dictionary.doc2bow(seg_list_after1)
lsi_1 = lsi[tfidf[bow_1]]
sims = index[lsi_1]

# Rank indexed documents by descending similarity: list of (doc number, score).
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
gensim/corpora/dictionary.py
class Dictionary:
属性:token2id :字典(str, int) id2token: 字典(int,str) 互逆
dfs : 字典(int,int) 词编号 -> 文档频率(包含该词的文档数)
num_docs 已处理的文档数量 num_pos 语料中词的总数(含重复) num_nnz BOW矩阵中非零项的总数
方法:doc2bow参数document allow_update return_missing
返回 list of (int, int):文档的BOW模型 引用:
# NOTE(review): excerpt quoted from the body of doc2bow; the loop body below lost its indentation when pasted.
# Construct (word, frequency) mapping.
counter = defaultdict(int)  # unseen words start at a count of 0
for w in document:
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1  # count each word; non-unicode input is decoded as UTF-8 first
token2id = self.token2id
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}  # re-key counts by token id; words absent from token2id are dropped
# return tokenids, in ascending id order
result = sorted(iteritems(result))  # list of (token id, count) pairs sorted by token id
例子:
>>> from gensim.corpora import Dictionary
>>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
>>> dct.token2id
{'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
>>> dct.doc2bow(['maso','ema'])
[(0, 1), (3, 1)]
gensim/models/tfidfmodel.py
class TfidfModel:
参数:corpus迭代器 id2word dictionary wlocal(局部加权函数) wglobal(全局加权函数) normalize(bool 或自定义函数) smartirs
init: smartirs决定加权函数
def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
             wglobal=df2idf, normalize=True, smartirs=None):
    r"""Compute tf-idf by multiplying a local component (term frequency) with a global component
    (inverse document frequency), and normalizing the resulting documents to unit length.

    Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

    .. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}

    or, more generally

    .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

    so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

    Parameters
    ----------
    corpus : iterable of list of (int, number), optional
        Bag-of-words documents; scanned once to collect document frequencies
        when no `dictionary` is given.
    id2word : optional
        Id-to-word mapping, stored on the model. Falls back to `dictionary`
        when that is supplied and `id2word` is None.
    dictionary : optional
        Object exposing `num_docs`, `num_nnz` and `dfs`. When given, its
        statistics are used directly and `corpus` is ignored.
    wlocal : callable, optional
        Local (term frequency) weighting function.
    wglobal : callable, optional
        Global weighting function, called as ``wglobal(doc_freq, total_docs)``.
    normalize : bool or callable, optional
        Normalization setting, stored on the model as given.
    smartirs : optional
        Stored on the model as given; this constructor does not interpret it.
    """
    self.id2word = id2word
    self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
    self.num_docs, self.num_nnz, self.idfs = None, None, None
    self.smartirs = smartirs

    if dictionary is not None:
        # user supplied a Dictionary object, which already contains all the
        # statistics we need to construct the IDF mapping. we can skip the
        # step that goes through the corpus (= an optimization).
        if corpus is not None:
            logger.warning(
                "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
            )
        # reuse the dictionary's document count and non-zero count
        self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
        # copy the (term id -> document frequency) table so the dictionary is not shared
        self.dfs = dictionary.dfs.copy()
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        if id2word is None:
            self.id2word = dictionary
    elif corpus is not None:
        self.initialize(corpus)
    else:
        # NOTE: everything is left uninitialized; presumably the model will
        # be initialized in some other way
        pass
def initialize(self, corpus):
    """Scan `corpus` once to gather document frequencies, then derive the IDF weights.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Documents in bag-of-words form; only the term ids and the number of
        entries per document are used here.

    Sets `num_docs`, `num_nnz`, `dfs` and `idfs` on the model.
    """
    logger.info("collecting document frequencies")
    doc_freqs = {}
    nonzeros = 0
    last_docno = -1  # stays -1 for an empty corpus, so num_docs becomes 0

    for last_docno, doc in enumerate(corpus):
        if not last_docno % 10000:
            logger.info("PROGRESS: processing document #%i", last_docno)
        # every (term id, value) entry is one non-zero cell of the matrix
        nonzeros += len(doc)
        # each entry of a document bumps that term's frequency by one
        for term_id, _value in doc:
            doc_freqs[term_id] = doc_freqs.get(term_id, 0) + 1

    # remember the training-corpus statistics on the model
    self.num_docs = last_docno + 1
    self.num_nnz = nonzeros
    self.dfs = doc_freqs

    # largest term id seen (0 for an empty corpus); used for the log line only
    n_features = max(doc_freqs) if doc_freqs else 0
    logger.info(
        "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
        self.num_docs, n_features, self.num_nnz
    )
    # one global weight per term id, computed with the configured wglobal
    self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
def precompute_idfs(wglobal, dfs, total_docs):
    """Pre-compute the global weight of every term.

    Not strictly necessary (the weights could be computed on the fly in
    `TfidfModel.__getitem__`); this just speeds things up a little.

    Parameters
    ----------
    wglobal : callable
        Global weighting function, called as ``wglobal(doc_freq, total_docs)``.
    dfs : dict
        Mapping of term id to document frequency.
    total_docs : int
        Value passed as the second argument of every `wglobal` call.

    Returns
    -------
    dict
        Mapping of term id to ``wglobal(doc_freq, total_docs)``.
    """
    # dict.items() works on both Python 2 and 3, so no compatibility helper is needed.
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}
def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Compute default inverse-document-frequency for a term with document frequency:
    :math:`idf = add + log_{log\_base} \frac{totaldocs}{doc\_freq}`

    Parameters
    ----------
    docfreq : number
        Document frequency of the term (the divisor; must be non-zero).
    totaldocs : number
        Total number of documents.
    log_base : float, optional
        Base of the logarithm.
    add : float, optional
        Constant added to the logarithm.

    Returns
    -------
    float
        ``add + log(totaldocs / docfreq)`` taken in base `log_base`.
    """
    # change of base: log_b(x) = ln(x) / ln(b)
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)
def __getitem__(self, bow, eps=1e-12):
    """Get tf-idf representation of the input vector and/or corpus.

    Parameters
    ----------
    bow : {list of (int, int), iterable of iterable of (int, int)}
        Input document or corpus in BoW format.
    eps : float
        Threshold value, will remove all position that have tfidf-value less than `eps`.

    Returns
    -------
    vector : list of (int, float)
        TfIdf vector, if `bow` is document **OR**
    :class:`~gensim.interfaces.TransformedCorpus`
        TfIdf corpus, if `bow` is corpus.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # split the document into parallel lists of term ids and term frequencies
    termid_array, tf_array = [], []
    for termid, tf in bow:
        termid_array.append(termid)
        tf_array.append(tf)

    # apply the local weighting to all term frequencies at once
    tf_array = self.wlocal(np.array(tf_array))

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    idfs = self.idfs
    vector = []
    for termid, tf in zip(termid_array, tf_array):
        idf = idfs.get(termid, 0.0)  # single lookup; 0.0 for terms not seen in training
        if abs(idf) > eps:
            vector.append((termid, tf * idf))

    # Resolve the boolean setting to a function locally, so that a read-only
    # lookup does not overwrite the model's `normalize` attribute.
    normalize = self.normalize
    if normalize is True:
        normalize = matutils.unitvec
    elif normalize is False:
        normalize = utils.identity

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    vector = normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
    return vector
gensim/models/lsimodel.py
class LsiModel:
def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
    """Construct an `LsiModel` object.

    Either `corpus` or `id2word` must be supplied in order to train the model.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
        Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
    num_topics : int, optional
        Number of requested factors (latent dimensions)
    id2word : dict of {int: str}, optional
        ID to word mapping, optional.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    decay : float, optional
        Weight of existing observations relatively to new ones.
    distributed : bool, optional
        If True - distributed mode (parallel execution on several machines) will be used.
    onepass : bool, optional
        Whether the one-pass algorithm should be used for training.
        Pass `False` to force a multi-pass stochastic algorithm.
        Ignored (forced to True, with a warning) when `distributed` is set.
    power_iters: int, optional
        Number of power iteration steps to be used.
        Increasing the number of power iterations improves accuracy, but lowers performance
    extra_samples : int, optional
        Extra samples to be used besides the rank `k`. Can improve accuracy.
    dtype : type, optional
        Enforces a type for elements of the decomposed matrix.

    Raises
    ------
    ValueError
        If both `corpus` and `id2word` are None.
    RuntimeError
        If `distributed` is set and the dispatcher cannot be initialized.
    """
    self.id2word = id2word
    self.num_topics = int(num_topics)
    self.chunksize = int(chunksize)
    self.decay = float(decay)
    # Distributed training always runs one-pass. Because `onepass` is forced
    # here, no later "distributed and not onepass" check can ever trigger.
    if distributed and not onepass:
        logger.warning("forcing the one-pass algorithm for distributed LSA")
        onepass = True
    self.onepass = onepass
    self.extra_samples, self.power_iters = extra_samples, power_iters
    self.dtype = dtype

    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # number of terms = largest id + 1 (0 for an empty mapping)
        self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)

    self.docs_processed = 0
    self.projection = Projection(
        self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
    )

    self.numworkers = 1
    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        try:
            # imported lazily so Pyro4 is only required in distributed mode
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
            logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
            dispatcher.initialize(
                id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
                power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
            )
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)", err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

    # train immediately when a corpus was supplied
    if corpus is not None:
        self.add_documents(corpus)