LDA模型 python代码样例

xingyun0110

已于 2022-05-15 23:09:23 修改

阅读量2.3k

点赞数 2

文章标签：python、自然语言处理、数据挖掘

于 2022-05-15 23:04:43 首次发布

本文链接：https://blog.csdn.net/qq_41568607/article/details/124790221

版权

# -*- coding: utf-8 -*-
import re
import warnings
import jieba
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

warnings.filterwarnings('ignore')  # Ignore all warnings raised from here on, for clearer output
# Data file name; presumably the input dataset read later in the file -- TODO confirm
PATH = "data_lda.csv"

# Stop words: filled below with one stripped entry per line of the file.
stop_words_path = "stop_words.txt"
stop_words = []

# Path of the reserved-words text file.
# File format: word  frequency (optional)  part-of-speech (optional)
# One word per line.
reserved_words_path = "reserved_words.txt"

# The with-statement closes the file on exit, so no explicit close() is needed.
# Iterating the handle streams lines instead of materialising them first.
with open(stop_words_path, encoding='utf-8') as f:
    stop_words.extend(line.strip() for line in f)


# Data cleaning; can be overridden to suit your own needs.
def processing(text):
    """Return ``text`` with bracketed tags, newlines, non-word chars and digits removed.

    The substitutions run in a fixed order:

    1. ``【xx】`` spans (usually not written by the user) are dropped. The
       match is non-greedy and does not cross a newline.
    2. Newline characters are dropped.
    3. Every non-word character (punctuation, whitespace, ...) is dropped;
       underscores count as word characters and are kept.
    4. Every digit is dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: bracketed spans must go before step 3 strips the brackets.
    removal_patterns = (
        "【.+?】",
        "\n",
        r'[\W]',
        r'[\d]',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned


# 对句子进行中文分词
def seg_depart(sentence