第1关：学会使用 Gensim

最新推荐文章于 2024-09-30 21:37:03 发布

好牛叉

最新推荐文章于 2024-09-30 21:37:03 发布

阅读量1.8k

点赞数 3

文章标签：算法

本文链接：https://blog.csdn.net/qq_37336842/article/details/124809866

版权

该博客介绍了如何使用 gensim 库构建 LDA 和 LSI 主题模型。首先，通过 jieba 分词并按词性过滤、去除停用词，构建词典并用 BOW 模型进行文本向量化；接着，基于 BOW 语料构建了主题数为 1 的 LDA 模型，并输出该主题的关键词。然后，定义了一个 TopicModel 类，在 TF-IDF 加权后的语料上训练 LSI 或 LDA 模型，并计算词与输入文本的主题分布相似度。最后，通过示例展示了如何提取输入文本的关键词。

摘要由CSDN通过智能技术生成

from gensim import corpora, models
import jieba.posseg as jp, jieba
from basic import get_stopword_list
# Read the five input documents, one per line of standard input.
texts = [input() for _ in range(5)]

# Part-of-speech tags that are kept after segmentation.
flags = ('n', 'nr', 'ns', 'nt', 'eng', 'v', 'd')
stopwords = get_stopword_list()

# Segment every document, keeping only tokens whose tag is in `flags`
# and which are not stopwords.
words_ls = []
for text in texts:
    words = []
    for pair in jp.cut(text):
        if pair.flag not in flags or pair.word in stopwords:
            continue
        words.append(pair.word)
    words_ls.append(words)

# Deduplicate the tokens into a gensim dictionary, then turn each
# document into its bag-of-words representation.
dictionary = corpora.Dictionary(words_ls)
corpus = [dictionary.doc2bow(words) for words in words_ls]

# Task: build an LDA model with gensim's `models`, using a single topic.
# ********** Begin *********#
lda = models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=1,
)
# ********** End **********#

# Each entry of print_topics() is indexed at [1] for its description
# string; print the part that follows the first '*' (the word after the
# weight), with no trailing newline.
for topic in lda.print_topics(num_words=1):
    description = topic[1]
    print(description.split('*')[1], end="")

第2关：LSA / LSI 算法

from gensim import corpora, models
import functools
from others import seg_to_list,load_data,word_filter,cmp
import math
class TopicModel(object):
    """Keyword extractor backed by a gensim LSI or LDA topic model.

    The constructor builds a dictionary and a TF-IDF weighted bag-of-words
    corpus from the tokenized documents, trains the selected model on it and
    caches the topic vector of every distinct word in the corpus.
    ``get_simword`` then ranks the words of an input text by how similar
    their topic vector is to the topic vector of the whole text.
    """

    def __init__(self, doc_list, keyword_num, model='LSI', num_topics=4):
        """Vectorize the corpus, train the model and cache word topics.

        Args:
            doc_list: Tokenized documents (a sequence of token lists). It is
                iterated several times, so it must not be a one-shot iterator.
            keyword_num: Maximum number of keywords printed by
                ``get_simword``.
            model: ``'LSI'`` trains an LSI model; any other value trains LDA.
            num_topics: Number of topics of the trained model.
        """
        # Build the vocabulary (token <-> id mapping) with gensim.
        self.dictionary = corpora.Dictionary(doc_list)
        # Task: vectorize with the BOW model and store it in `corpus`.
        # ********** Begin *********#
        corpus = [self.dictionary.doc2bow(doc) for doc in doc_list]
        # ********** End **********#
        # Re-weight every token by TF-IDF to get the weighted vectors.
        self.tfidf_model = models.TfidfModel(corpus)
        self.corpus_tfidf = self.tfidf_model[corpus]
        self.keyword_num = keyword_num
        self.num_topics = num_topics
        # Train the requested model.
        if model == 'LSI':
            self.model = self.train_lsi()
        else:
            self.model = self.train_lda()
        # Cache the topic vector of every distinct word of the corpus.
        word_dic = self.word_dictionary(doc_list)
        self.wordtopic_dic = self.get_wordtopic(word_dic)

    def train_lsi(self):
        """Train and return an LSI model on the TF-IDF weighted corpus."""
        lsi = models.LsiModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=self.num_topics)
        return lsi

    def train_lda(self):
        """Train and return an LDA model on the TF-IDF weighted corpus."""
        lda = models.LdaModel(self.corpus_tfidf, id2word=self.dictionary, num_topics=self.num_topics)
        return lda

    def get_wordtopic(self, word_dic):
        """Return a dict mapping each word to its topic vector.

        Every word is treated as a one-token document: it is converted to
        bag-of-words, TF-IDF weighted and projected through the trained model.

        Args:
            word_dic: Iterable of words.
        """
        wordtopic_dic = {}
        for word in word_dic:
            single_list = [word]
            wordcorpus = self.tfidf_model[self.dictionary.doc2bow(single_list)]
            wordtopic = self.model[wordcorpus]
            wordtopic_dic[word] = wordtopic
        return wordtopic_dic

    @staticmethod
    def _cosine_similarity(l1, l2):
        """Return the cosine similarity of two sparse topic vectors.

        Args:
            l1, l2: Sequences of ``(topic_id, weight)`` pairs.

        Returns:
            ``dot(l1, l2) / (|l1| * |l2|)``, or ``0.0`` when either vector
            has zero norm (including the empty vector).
        """
        # Index by topic id instead of pairing entries by position: the two
        # vectors are not guaranteed to list the same topics (a model may
        # omit topics from its output), and positional pairing would then
        # multiply weights that belong to different topics.
        weights1 = {entry[0]: entry[1] for entry in l1}
        weights2 = {entry[0]: entry[1] for entry in l2}
        # The numerator is the dot product x1 * x2. The previous code summed
        # x1 * x1 here, which made the result sqrt(|l1|^2 / |l2|^2) rather
        # than a cosine.
        dot = sum(
            weight * weights2[topic_id]
            for topic_id, weight in weights1.items()
            if topic_id in weights2
        )
        norm1 = sum(weight * weight for weight in weights1.values())
        norm2 = sum(weight * weight for weight in weights2.values())
        denominator = norm1 * norm2
        if denominator == 0.0:
            return 0.0
        return dot / math.sqrt(denominator)

    # Compare each word's topic vector with the text's topic vector and
    # print the keyword_num most similar words as keywords.
    def get_simword(self, word_list):
        """Print the keywords of ``word_list``.

        Only words that occur both in ``word_list`` and in the training
        vocabulary are scored. The scored ``(word, similarity)`` pairs are
        sorted in reverse order with the project-level ``cmp`` comparator,
        and the first ``keyword_num`` words are printed on one line, each
        followed by ``"/ "``.

        Args:
            word_list: Tokens of the input text.
        """
        sentcorpus = self.tfidf_model[self.dictionary.doc2bow(word_list)]
        senttopic = self.model[sentcorpus]
        # Set membership keeps the filter below O(1) per vocabulary word.
        present = set(word_list)
        sim_dic = {}
        for k, v in self.wordtopic_dic.items():
            if k not in present:
                continue
            sim_dic[k] = self._cosine_similarity(v, senttopic)
        ranked = sorted(sim_dic.items(), key=functools.cmp_to_key(cmp), reverse=True)
        for k, v in ranked[:self.keyword_num]:
            print(k + "/ ", end='')
        print()

    # Vocabulary building and vectorization done by hand, as one would do
    # without the gensim helpers.
    def word_dictionary(self, doc_list):
        """Return the distinct tokens of ``doc_list`` as a list.

        The order of the returned list is unspecified.
        """
        vocabulary = set()
        for doc in doc_list:
            vocabulary.update(doc)
        return list(vocabulary)

    def doc2bowvec(self, word_list):
        """Return a 0/1 presence vector of ``word_list`` over the vocabulary.

        The vector has one entry per dictionary token, in the order of
        ``self.dictionary.token2id``; an entry is 1 when that token occurs
        in ``word_list`` and 0 otherwise.
        """
        # Iterating a gensim Dictionary directly yields integer ids, not
        # tokens, so comparing those against the tokens in `word_list`
        # never matched. Iterate the tokens (keys of token2id) instead.
        present = set(word_list)
        vec_list = [1 if token in present else 0 for token in self.dictionary.token2id]
        return vec_list
def topic_extract(word_list, model, pos=False, keyword_num=10):
    """Train a topic model on the loaded corpus and print keywords.

    Args:
        word_list: Tokens of the text whose keywords are wanted.
        model: Model selector forwarded to ``TopicModel``.
        pos: Flag forwarded to ``load_data``.
        keyword_num: Number of keywords forwarded to ``TopicModel``.
    """
    extractor = TopicModel(load_data(pos), keyword_num, model=model)
    extractor.get_simword(word_list)
if __name__ == '__main__':
    # Read one text from standard input, segment and filter it, then
    # print its keywords using the LSI model.
    text = input()
    pos = True
    filter_list = word_filter(seg_to_list(text, pos), pos)
    topic_extract(filter_list, 'LSI', pos=pos)