import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

# Fixed seed so the coherence curve is reproducible between runs; without it
# each LdaModel is initialised randomly and the plot changes every time.
RANDOM_STATE = 42

# Load the text data. Empty spreadsheet cells are read as float NaN, which has
# no .split(); drop them and coerce the remaining cells to str.
df = pd.read_excel('新闻情感分析结果.xlsx')
combined_texts = df['Combined Text'].dropna().astype(str).tolist()

# Prepare the document collection.
documents = combined_texts

# Build the bag-of-words model.
# NOTE(review): tokens are obtained by whitespace splitting only, so this
# assumes the column already holds pre-segmented text -- confirm upstream.
texts = [document.split() for document in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


def calculate_topic_coherence(lda_model, texts, dictionary):
    """Return the u_mass topic coherence of a trained LDA model.

    Args:
        lda_model: Trained gensim ``LdaModel`` to evaluate.
        texts: Tokenized documents (one list of tokens per document).
        dictionary: gensim ``Dictionary`` mapping the tokens in ``texts`` to ids.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the ``'u_mass'``
        measure.
    """
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='u_mass',
    )
    return coherence_model.get_coherence()


# Each topic count is trained and scored once.
# NOTE(review): num_runs is not used anywhere in this script; kept only so the
# name stays defined -- confirm whether repeated runs were intended.
num_runs = 10
num_topics_range = range(1, 11)  # topic counts to compare

# Coherence score for each topic count, in the same order as num_topics_range.
topic_coherences = []

# Train one model per topic count and record its coherence.
for num_topics in num_topics_range:
    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=20,
        iterations=100,
        random_state=RANDOM_STATE,
    )
    coherence = calculate_topic_coherence(lda_model, texts, dictionary)
    topic_coherences.append(coherence)

# The labels below are Chinese: put common CJK-capable fonts ahead of the
# defaults so the glyphs render instead of empty boxes. Fonts that are not
# installed are skipped in favour of the next entry.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'PingFang SC',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
] + list(plt.rcParams['font.sans-serif'])
# Some CJK fonts lack the Unicode minus sign; use ASCII '-' so negative tick
# labels stay readable.
plt.rcParams['axes.unicode_minus'] = False

# Plot coherence against the number of topics.
plt.plot(num_topics_range, topic_coherences, marker='o')
plt.xlabel('主题数量')
plt.ylabel('主题一致性')
plt.title('不同主题数量下的主题一致性比较')
plt.show()
计算LDA内部协方差
最新推荐文章于 2024-04-29 18:29:37 发布