代码阅读记录

最新推荐文章于 2023-02-02 09:50:50 发布

skicth

最新推荐文章于 2023-02-02 09:50:50 发布

阅读量585

点赞数

本文链接：https://blog.csdn.net/skicth/article/details/80496064

版权

	# Map every distinct token in the tokenised documents to an integer id.
	dictionary = corpora.Dictionary(texts)
	# One bag-of-words vector, i.e. a list of (token_id, count) pairs, per document.
	corpus = [dictionary.doc2bow(text) for text in texts]
	# Fit the TF-IDF model on the bag-of-words corpus.
	tfidf = models.TfidfModel(corpus)
	# Indexing the model with [] goes through TfidfModel.__getitem__.
	corpus_tfidf = tfidf[corpus]
	lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
	print (corpus_tfidf)
	corpus_lsi = lsi[corpus_tfidf]
	# The LSI model was trained on TF-IDF vectors, so the index is built from
	# the TF-IDF based projection rather than from raw counts.
	index = similarities.MatrixSimilarity(corpus_lsi)
	lsi.print_topics(2)

	# The with-statement closes the file on exit; no explicit close is needed.
	with open('../papers/2.txt') as t1:
		content1 = t1.read()
	seg_list_after1 = content1.split(" ")

	bow_1 = dictionary.doc2bow(seg_list_after1)
	# Send the query through the same TF-IDF -> LSI pipeline as the indexed documents.
	lsi_1 = lsi[tfidf[bow_1]]

	sims = index[lsi_1]
	# (document position, similarity) pairs, most similar first.
	sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

gensim/corpora/dictionary.py

class Dictionary:

属性：token2id：字典 (str → int)；id2token：字典 (int → str)。二者互为逆映射。

dfs：字典 (int → int)，词的编号 → 文档频率（包含该词的文档数）

num_docs：已处理的文档数量；num_pos：语料中的总词数（含重复）；num_nnz：BOW 矩阵中非零元素的总数

方法：doc2bow，参数：document、allow_update、return_missing

返回 list of (int, int)：文档的 BOW 表示。源码引用如下：

# Tally how often each token occurs in the document.
counter = defaultdict(int)  # tokens not seen before start from a count of 0
for w in document:
    # Byte strings are decoded as UTF-8 so that every key is a unicode string.
    token = w if isinstance(w, unicode) else unicode(w, 'utf-8')
    counter[token] += 1

token2id = self.token2id
# Swap each known token for its integer id; tokens absent from token2id are dropped.
result = {}
for word, freq in iteritems(counter):
    if word in token2id:
        result[token2id[word]] = freq
# Final result: a list of (token_id, count) pairs in ascending id order.
result = sorted(iteritems(result))

例子:

>>> from gensim.corpora import Dictionary

>>> dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])

>>> dct.token2id
{'maso': 0, 'mele': 1, 'máma': 2, 'ema': 3, 'má': 4}
>>> dct.doc2bow(['maso','ema'])
[(0, 1), (3, 1)]

gensim/models/tfidfmodel.py

class TfidfModel:

参数：corpus（迭代器）、id2word、dictionary、wlocal（局部加权函数）、wglobal（全局加权函数）、normalize（bool）、smartirs

init: smartirs决定加权函数

 def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None):
        r"""Set up a tf-idf model from a `dictionary`, from a `corpus`, or from neither.

        Tf-idf multiplies a local component (term frequency) with a global component
        (inverse document frequency) and normalizes the resulting documents.
        The non-normalized weight of term :math:`i` in document :math:`j` is

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        where :math:`D` is the number of documents in the corpus. Custom `wlocal` and
        `wglobal` functions can be plugged in.

        Parameters
        ----------
        corpus : iterable, optional
            Bag-of-words corpus; passed to :meth:`initialize` when no `dictionary` is given.
        id2word : optional
            Stored as ``self.id2word``; falls back to `dictionary` when omitted.
        dictionary : optional
            Object providing ``num_docs``, ``num_nnz`` and ``dfs``. When given, its
            statistics are used directly and `corpus` is ignored.
        wlocal : callable, optional
            Local (term frequency) weighting function.
        wglobal : callable, optional
            Global weighting function, called as ``wglobal(doc_freq, num_docs)``.
        normalize : optional
            Stored as ``self.normalize`` for use when documents are transformed.
        smartirs : optional
            Stored as ``self.smartirs``; not interpreted in this method.
        """
        self.smartirs = smartirs
        self.id2word = id2word
        self.wlocal = wlocal
        self.wglobal = wglobal
        self.normalize = normalize
        self.num_docs = self.num_nnz = self.idfs = None

        if dictionary is None:
            # No ready-made statistics. Collect them from the corpus when one is
            # given; otherwise everything is left uninitialized, presumably to be
            # initialized in some other way.
            if corpus is not None:
                self.initialize(corpus)
            return

        # A dictionary already holds all the statistics needed to build the IDF
        # mapping, so the pass over the corpus is skipped (an optimization).
        if corpus is not None:
            logger.warning(
                "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
            )
        # Take over the dictionary's document count and non-zero count.
        self.num_docs = dictionary.num_docs
        self.num_nnz = dictionary.num_nnz
        # Work on a copy of the dictionary's (term id -> document frequency) table.
        self.dfs = dictionary.dfs.copy()
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
        if id2word is None:
            self.id2word = dictionary

    def initialize(self, corpus):
        """Collect document frequencies from `corpus` and compute the idf weights.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Documents in bag-of-words form; only the term ids are read.

        Notes
        -----
        Sets ``self.num_docs``, ``self.num_nnz``, ``self.dfs`` and ``self.idfs``.
        ``self.dfs[termid]`` is incremented once per (termid, value) entry, so it
        equals the number of documents containing the term provided each term id
        appears at most once per document vector.
        """
        logger.info("collecting document frequencies")
        dfs = {}
        # docno starts at -1 so that an empty corpus yields num_docs == 0 below.
        numnnz, docno = 0, -1

        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", docno)
            numnnz += len(bow)  # entries in this document's sparse vector
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1

        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.dfs = dfs
        # and finally compute the idf weights
        # Term ids are zero-based, so the number of features is the largest id
        # plus one (the bare maximum under-reports the count by one).
        n_features = max(dfs) + 1 if dfs else 0
        logger.info(
            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
            self.num_docs, n_features, self.num_nnz
        )
        # wglobal turns each (document frequency, number of documents) pair into a weight.
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)

def precompute_idfs(wglobal, dfs, total_docs):
    """Apply the global weighting function to every document frequency.

    Parameters
    ----------
    wglobal : callable
        Called as ``wglobal(doc_freq, total_docs)`` for each term.
    dfs : dict
        Mapping of term id to document frequency.
    total_docs : int
        Number of documents, passed unchanged to `wglobal`.

    Returns
    -------
    dict
        Mapping of term id to the weight returned by `wglobal`.
    """
    # Not strictly necessary: the weights could be computed on the fly in
    # TfidfModel.__getitem__. Precomputing them just speeds things up a little.
    # dict.items() works on both Python 2 and 3, so no compatibility shim is needed.
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}

def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Default inverse-document-frequency weight of a term.

    Evaluates :math:`idf = add + \log_{log\_base} \frac{totaldocs}{docfreq}`.

    Parameters
    ----------
    docfreq : number
        Document frequency of the term (the divisor; must be non-zero).
    totaldocs : number
        Total number of documents.
    log_base : float, optional
        Base of the logarithm.
    add : float, optional
        Constant added to the logarithm.

    Returns
    -------
    float
        The idf weight.
    """
    doc_ratio = float(totaldocs) / docfreq
    # Change of base: log_b(x) = ln(x) / ln(b).
    return add + np.log(doc_ratio) / np.log(log_base)

    def __getitem__(self, bow, eps=1e-12):
        """Transform a bag-of-words document, or a whole corpus, into tf-idf space.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            A single document, or a corpus, in BoW format.
        eps : float
            Threshold; entries whose idf or final weight does not exceed `eps`
            in absolute value are removed.

        Returns
        -------
        list of (int, float)
            Tf-idf vector, when `bow` is a single document.
        :class:`~gensim.interfaces.TransformedCorpus`
            Tf-idf corpus, when `bow` is a corpus.

        """
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            # The input is a whole corpus: hand it to self._apply and return that result.
            return self._apply(bow)

        # Split the (termid, tf) pairs into two parallel sequences in a single pass.
        pairs = list(bow)
        termid_array = [termid for termid, _ in pairs]
        tf_array = [tf for _, tf in pairs]

        tf_array = self.wlocal(np.array(tf_array))

        # Unknown (new) terms get zero weight and are therefore left out (NOT an
        # infinite/huge weight, as strict application of the IDF formula would dictate).
        idfs = self.idfs
        vector = []
        for termid, local_weight in zip(termid_array, tf_array):
            if abs(idfs.get(termid, 0.0)) > eps:
                vector.append((termid, local_weight * idfs.get(termid)))

        # A boolean `normalize` is replaced, on the instance itself, by the
        # matching callable; any other value is used as the normalizer as given.
        normalizer = self.normalize
        if normalizer is True:
            normalizer = self.normalize = matutils.unitvec
        elif normalizer is False:
            normalizer = self.normalize = utils.identity

        # Normalize to unit length, or with the user-defined function.
        vector = normalizer(vector)

        # The result must stay sparse: drop entries that are (numerically) zero.
        return [(termid, weight) for termid, weight in vector if abs(weight) > eps]

gensim/models/lsimodel.py

class LsiModel:

    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, distributed=False, onepass=True,
                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct an `LsiModel` object.

        At least one of `corpus` and `id2word` must be supplied, to establish the
        dimensionality of the input space. When `corpus` is given, it is passed to
        :meth:`add_documents` at the end of construction.

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
            Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
        num_topics : int, optional
            Number of requested factors (latent dimensions)
        id2word : dict of {int: str}, optional
            ID to word mapping, optional.
        chunksize :  int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            Weight of existing observations relatively to new ones.
        distributed : bool, optional
            If True - distributed mode (parallel execution on several machines) will be used.
        onepass : bool, optional
            Whether the one-pass algorithm should be used for training.
            Pass `False` to force a multi-pass stochastic algorithm.
            Forced to `True` (with a warning) when `distributed` is set.
        power_iters: int, optional
            Number of power iteration steps to be used.
            Increasing the number of power iterations improves accuracy, but lowers performance
        extra_samples : int, optional
            Extra samples to be used besides the rank `k`. Can improve accuracy.
        dtype : type, optional
            Enforces a type for elements of the decomposed matrix.

        Raises
        ------
        ValueError
            If both `corpus` and `id2word` are None.
        RuntimeError
            If `distributed` is set and the dispatcher cannot be set up.

        """
        self.id2word = id2word
        # Coerce the numeric settings to their expected types.
        self.num_topics = int(num_topics)
        self.chunksize = int(chunksize)
        self.decay = float(decay)
        if distributed:
            # Distributed mode always uses the one-pass algorithm.
            if not onepass:
                logger.warning("forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters
        self.dtype = dtype

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            # No mapping given: derive one from the corpus.
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            # Largest id plus one; an empty mapping gives 0.
            self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)

        self.docs_processed = 0
        self.projection = Projection(
            self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
        )

        self.numworkers = 1
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            # NOTE(review): this check looks unreachable, because `onepass` was
            # already forced to True above whenever `distributed` is set - confirm.
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized."
                )
            try:
                # Imported here so that Pyro4 is only needed in distributed mode.
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
                logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
                # The dispatcher receives the raw constructor arguments with
                # distributed=False - presumably so the remote side runs the
                # serial algorithm; verify against the dispatcher implementation.
                dispatcher.initialize(
                    id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
                    power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
                )
                # self.dispatcher is only assigned once initialization has succeeded.
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
            except Exception as err:
                # distributed version was specifically requested, so this is an error state
                # (any failure in the try block, including a failed Pyro4 import, ends up here)
                logger.error("failed to initialize distributed LSI (%s)", err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

        # Train straight away when a corpus was supplied.
        if corpus is not None:
            self.add_documents(corpus)

skicth

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
代码阅读记录

dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts]#每段生成的[编号，词频] tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus] lsi = models.LsiModel(corp...
复制链接

扫一扫