LDA主题提取

一枚爱吃大蒜的程序员

于 2024-03-31 17:27:19 发布

阅读量348

点赞数 3

文章标签：人工智能、深度学习、机器学习

本文链接：https://blog.csdn.net/qiqi_ai_/article/details/137204559

版权

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Sample corpus: five short English sentences, each treated as one document.
documents = [
    "This is the first document about technology and artificial intelligence.",
    "The second document is about machine learning and natural language processing.",
    "The third document discusses deep learning and neural networks.",
    "Another document talks about data mining and big data analytics.",
    "The last document covers computer vision and image recognition."
]

# Bag-of-words features: per-document term counts, with scikit-learn's
# built-in English stop-word list filtered out.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# LDA model settings.
num_topics = 2  # number of topics to discover
num_top_words = 5  # number of highest-weighted keywords printed per topic
# A fixed random_state keeps the fitted topics reproducible between runs.
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)

# Fit the LDA model on the document-term count matrix.
lda.fit(X)

# Print the highest-weighted keywords of every topic.
feature_names = vectorizer.get_feature_names_out()
for topic_number, word_weights in enumerate(lda.components_, start=1):
    print(f"Topic {topic_number}:")
    # argsort() orders indices by ascending weight, so walking backwards from
    # the end yields the largest weights first. A single reversed slice is used
    # instead of [-n:][::-1] because it also behaves correctly when
    # num_top_words is 0 ([-0:] would select every word rather than none).
    top_word_ids = word_weights.argsort()[:-num_top_words - 1:-1]
    print([feature_names[word_id] for word_id in top_word_ids])