# -*- coding: utf-8 -*-
import re
import warnings
import jieba
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
# Suppress every warning process-wide so the script's own output stays readable.
warnings.filterwarnings('ignore')

# CSV file name; the code that reads it is outside this part of the file.
PATH = "data_lda.csv"

# Stop-word file: each line of it becomes one entry of `stop_words`.
stop_words_path = "stop_words.txt"

# Reserved-word file path.
# File format: word  frequency (optional)  part-of-speech tag (optional),
# one word per line.
reserved_words_path = "reserved_words.txt"

# Iterate the file object directly instead of materialising readlines();
# the `with` statement closes the file, so no explicit close() is needed.
# Blank lines are kept as empty-string entries, as before.
with open(stop_words_path, encoding='utf-8') as f:
    stop_words = [line.strip() for line in f]
# Data cleaning; replace or adapt this function to suit your own corpus.
_BRACKETED_RE = re.compile("【.+?】")
# \W covers punctuation and all whitespace (including newlines); \d covers digits.
_NON_TEXT_RE = re.compile(r"[\W\d]")


def processing(text) -> str:
    """Return *text* stripped of bracketed tags, punctuation, whitespace and digits.

    Steps, in order:
      1. Remove every ``【...】`` span that sits on a single line (such spans
         are usually boilerplate rather than user-written content).
      2. Remove every non-word character (punctuation, spaces, newlines) and
         every digit in one pass. Underscores count as word characters and
         are kept.

    Args:
        text: The raw string to clean. Non-string values (``None``, float
            ``NaN``, numbers) are treated as empty text instead of raising
            ``TypeError``.  NOTE(review): pandas yields NaN for empty CSV
            cells; confirm the caller does not rely on the exception.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        return ""
    without_tags = _BRACKETED_RE.sub("", text)
    return _NON_TEXT_RE.sub("", without_tags)
# 对句子进行中文分词
def seg_depart(sentence
LDA模型 python代码样例
于 2022-05-15 23:04:43 首次发布