Lda模型分析完整代码

咸鱼都能看懂的代码

于 2024-05-17 08:57:45 发布

阅读量368

点赞数 2

文章标签： c# 开发语言

本文链接：https://blog.csdn.net/qq_43644046/article/details/138991541

版权

之前参考的那位博主的代码写得非常好，但是一些非计算机专业的新手无法把代码运行起来，于是在后台私信我。这里统一放一份可以直接运行的完整代码。我也不是专业的，只是恰好用到了，如有错误，多多包涵。大家记得给原博主点个赞：原博主代码链接。

import re
import jieba as jb
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import codecs
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import gensim
from gensim import corpora, models
import matplotlib.pyplot as plt
import matplotlib
if __name__ == '__main__':

    #--------------------------句子拆分------------------------------
    def stopwordslist(filepath):
        """Load a stop-word list from a UTF-8 text file, one entry per line.

        Args:
            filepath: Path of the stop-word file.

        Returns:
            A list holding one whitespace-stripped string per line of the
            file, in file order (a blank line yields an empty string).
        """
        # The context manager closes the handle even if reading fails; the
        # previous inline open() left it open until garbage collection.
        with open(filepath, 'r', encoding='utf-8') as stopword_file:
            return [line.strip() for line in stopword_file]

    # Segment one sentence into words and drop the stop words.
    # Stop-word sets are cached per file path so the file is read only once
    # instead of once per sentence.
    _STOPWORDS_CACHE = {}

    def seg_sentence(sentence, stopwords=None):
        """Segment a sentence with jieba and filter out unwanted tokens.

        Digits and dots are removed first; after segmentation, tokens that
        are a single character long or that appear in the stop-word
        collection are discarded.

        Args:
            sentence: Raw text of one line.
            stopwords: Optional collection of words to drop.  When omitted,
                the words are loaded from '自己搜来的停用词表.txt' on first use
                and reused on later calls.

        Returns:
            The kept tokens as one string in which every token is followed by
            a single space (so a non-empty result ends with a space).
        """
        if stopwords is None:
            stopwords_path = '自己搜来的停用词表.txt'  # path of the stop-word file
            if stopwords_path not in _STOPWORDS_CACHE:
                # A frozenset gives O(1) membership tests; the list returned
                # by stopwordslist() would be scanned linearly for every token.
                _STOPWORDS_CACHE[stopwords_path] = frozenset(stopwordslist(stopwords_path))
            stopwords = _STOPWORDS_CACHE[stopwords_path]

        # Raw string: '\.' in a normal string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        sentence = re.sub(r'[0-9\.]+', '', sentence)
        # jb.add_word('some_word')  # hint: add custom words to jieba's dictionary here
        kept = [
            word for word in jb.cut(sentence.strip())
            # The length test also rejects '\t', which is one character long.
            if len(word) > 1 and word not in stopwords
        ]
        return ''.join(word + ' ' for word in kept)

    # Segment the raw corpus line by line: every line of '感想.txt' is passed
    # through seg_sentence() and written as one line of '感想分词.txt'.
    # Both files are managed by `with`, so they are closed even if
    # segmentation raises part-way through.
    with open('感想.txt', 'r', encoding='utf-8') as inputs, \
            open('感想分词.txt', 'w', encoding='utf-8') as outputs:
        for line in inputs:
            line_seg = seg_sentence(line)  # the return value is a string
            outputs.write(line_seg + '\n')


    # ------------------------ Build the LDA model ------------------------
    # Read the segmented corpus: one document per line, tokens separated by
    # whitespace.
    train = []
    with open('感想分词.txt', 'r', encoding='utf8') as fp:
        for line in fp:
            tokens = line.split()
            # Lines read from a file keep their trailing '\n', so comparing
            # the raw line with '' never skipped anything.  Test the token
            # list instead so documents left empty by filtering are dropped.
            if tokens:
                train.append(tokens)

    dictionary = corpora.Dictionary(train)

    # Bag-of-words representation of every document.
    corpus = [dictionary.doc2bow(text) for text in train]

    # num_topics: number of topics
    # passes: number of training passes
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=100)

    # num_words: number of terms printed for each topic.
    # Each formatted topic string is split on '+' into terms and each term on
    # '*' into (weight, word); the word is printed first, the weight in
    # parentheses.
    for topic in lda.print_topics(num_words=20):
        print(topic[0], ':', sep='')
        listOfTerms = topic[1].split('+')
        for term in listOfTerms:
            listItems = term.split('*')
            print('  ', listItems[1], '(', listItems[0], ')', sep='')

    # ------------------------ Visualisation ------------------------
    # This section is self-contained: it re-reads the segmented corpus and
    # trains its own model before exporting the interactive view.
    train = []
    with open('感想分词.txt', 'r', encoding='utf8') as fp:
        for line in fp:
            tokens = line.split()
            # Lines read from a file keep their trailing '\n', so comparing
            # the raw line with '' never skipped anything.  Test the token
            # list instead so documents left empty by filtering are dropped.
            if tokens:
                train.append(tokens)

    dictionary = corpora.Dictionary(train)

    # Bag-of-words representation of every document.
    corpus = [dictionary.doc2bow(text) for text in train]

    # num_topics: number of topics
    # passes: number of training passes
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=4, passes=100)

    # num_words: number of terms printed for each topic.
    for topic in lda.print_topics(num_words=20):
        print(topic[0], ':', sep='')
        listOfTerms = topic[1].split('+')
        for term in listOfTerms:
            listItems = term.split('*')
            print('  ', listItems[1], '(', listItems[0], ')', sep='')

    # prepare() arguments:
    #   lda:        the trained topic model
    #   corpus:     the bag-of-words documents
    #   dictionary: the vocabulary
    d = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)

    # pyLDAvis.show(d)     # open the visualisation in a browser
    # pyLDAvis.display(d)  # render it in a notebook output cell
    pyLDAvis.save_html(d, 'lda_pass4.html')

    # ------------------------ Perplexity ------------------------
    # Prepare the data
    PATH = "感想分词.txt"  # the corpus that has already been segmented
    with open(PATH, encoding='utf-8', errors='ignore') as segmented_file:
        file_object2 = segmented_file.read().split('\n')
    # One token list per line.  Lines without tokens are skipped: splitting
    # on '\n' always yields a trailing empty string for a file that ends
    # with a newline, which used to become a phantom empty document.
    data_set = [tokens for tokens in (row.split() for row in file_object2) if tokens]
    print(data_set)  # show every tokenised document

    dictionary = corpora.Dictionary(data_set)  # vocabulary built from the documents
    corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words per document
    Lda = gensim.models.ldamodel.LdaModel  # alias of the LDA model class (not an instance)


    # Compute the perplexity of a model trained with a given topic count.
    def perplexity(num_topics):
        """Train an LDA model with ``num_topics`` topics and return its perplexity.

        The model is trained on the enclosing scope's ``corpus`` and
        ``dictionary`` and evaluated on that same corpus.

        Args:
            num_topics: Number of topics to train the model with.

        Returns:
            The perplexity ``2 ** (-bound)``, where ``bound`` is the per-word
            likelihood bound reported by ``log_perplexity``.  Lower is better.
        """
        # passes is the number of training passes over the corpus.
        ldamodel = Lda(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
        # num_words is the number of terms shown for each topic.
        print(ldamodel.print_topics(num_topics=num_topics, num_words=20))
        # Evaluate once: the call runs inference over the whole corpus, and a
        # second call could print a value different from the one returned.
        bound = ldamodel.log_perplexity(corpus)
        print(bound)
        # log_perplexity() yields the per-word likelihood bound, not the
        # perplexity itself; gensim defines perplexity as 2 ** (-bound).
        return 2 ** (-bound)


    # Draw a line chart of the value returned by perplexity() for each
    # candidate number of topics.
    x = range(1, 20)  # candidate topic counts: 1 to 19
    y = list(map(perplexity, x))
    plt.plot(x, y)
    plt.xlabel('主题数目')
    plt.ylabel('困惑度大小')
    # Select the SimHei font (presumably so the Chinese labels render) and
    # turn off the Unicode minus sign for the axis tick labels.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })
    plt.title('主题-困惑度变化情况')
    plt.show()

    # ------------------------ Coherence score ------------------------
    # Prepare the data
    PATH = "感想分词.txt"  # the corpus that has already been segmented

    with open(PATH, encoding='utf-8', errors='ignore') as segmented_file:
        file_object2 = segmented_file.read().split('\n')
    # One token list per line.  Lines without tokens are skipped: splitting
    # on '\n' always yields a trailing empty string for a file that ends
    # with a newline, which used to become a phantom empty document.
    data_set = [tokens for tokens in (row.split() for row in file_object2) if tokens]
    print(data_set)  # show every tokenised document

    dictionary = corpora.Dictionary(data_set)  # vocabulary built from the documents
    corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words per document
    Lda = gensim.models.ldamodel.LdaModel  # alias of the LDA model class (not an instance)


    def coherence(num_topics):
        """Train an LDA model with ``num_topics`` topics and return its coherence.

        The model is trained on the enclosing scope's ``corpus`` and
        ``dictionary``; the score is computed with the 'c_v' measure over
        ``data_set`` and is also printed.
        """
        # passes is the number of training passes over the corpus.
        topic_model = Lda(corpus, num_topics=num_topics, id2word=dictionary, passes=50)
        score = CoherenceModel(
            model=topic_model,
            texts=data_set,
            dictionary=dictionary,
            coherence='c_v',
        ).get_coherence()
        print('\nCoherence Score: ', score)
        return score


    # Draw a chart of the coherence score for each candidate number of topics.
    # range(1, 2) evaluates only num_topics=1; widen it (for example to
    # range(1, 20)) to compare several topic counts.
    x = range(1, 2)  # candidate topic counts
    y = [coherence(i) for i in x]
    # A line plot without markers draws nothing when there is only one data
    # point, so mark every point explicitly.
    plt.plot(x, y, marker='o')
    plt.xlabel('主题数目')
    plt.ylabel('coherence大小')
    plt.rcParams['font.sans-serif'] = ['SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.title('主题-coherence变化情况')
    plt.show()

除此以外，你还需要在同级目录下存放以下四个文件：

其中情感分析.py 可以自己找，我只提供一个参考，让你们知道文件里面是什么：

from textblob import TextBlob

# Wrap the sample sentence in a TextBlob object
text = TextBlob("I am feel sad")

# Read the polarity score of the sentence
sentiment = text.sentiment.polarity

# Print the score, then a label chosen from its sign
print(sentiment)
if sentiment > 0:
    label = "积极的"
elif sentiment == 0:
    label = "中性词"
else:
    label = "消极的"
print(label)

感想.txt 的内容需要自己用爬虫收集，以下只是部分例子，理论上你需要非常多行数据：

甘蓝甜甜的，不像超市的苦有农药味，以后就在这家买了
卷心菜每次必选 。烩点儿饼丝 。凉拌非常好吃 。
卷心菜也非常新鲜 ，全部原拍
太惊喜了，超级新鲜，以后就这家了
包菜之前也拍过，炒出来味道好像甜一些，对于市场打药多的包菜，有机虽然贵很多，但吃着放心。
经常回购这家，包装很好，夏季保鲜有保证。
质量不好
不是很鲜
不错，菜真新鲜
蔬菜收到很新鲜，吃起来口感很好。已是第二次回购！
冬瓜特別好吃 感覺還是吃有機的安全 會一直回購的
再次买了，是新鲜的有机蔬菜，很好吃，有需要还会再来光顾，满意的好评！
菜很新鲜，有机放心，继续回购。

自己搜来的停用词表.txt 里面是分词之后需要过滤掉的停用词（每行一个，并不是用来分割句子的），大概是以下样子，你百度一下就有了：

啊
阿
哎
哎呀
哎哟
唉
俺
俺们
按
按照
吧
吧哒
把
罢了
被
本
本着
比
比方
比如
鄙人
彼
彼此
边
别
别的
别说

最后，感想分词.txt 是运行程序时自动生成的，不是需要你自己搜集的文件。

咸鱼都能看懂的代码

关注

2
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
Lda模型分析完整代码

感想.txt内容需要自己爬虫收集，以下只是部分例子，理论上你是需要非常多行数据的。除此以外，你还需要在同级目录下，存放一下四个文件，
复制链接

扫一扫

Lda模型分析完整代码

“相关推荐”对你有帮助么？