Without further ado, straight to the code. The script below loads a review corpus, segments it with jieba (keeping nouns only), vectorizes the keywords with CountVectorizer, fits a scikit-learn LDA model, prints the top keywords per topic, and renders an interactive pyLDAvis view.
'''Implementation and visualization of an LDA topic model'''
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn  # in pyLDAvis >= 3.4.0 this module was renamed to pyLDAvis.lda_model
import time
# Helper: load a txt file as a list of stripped lines
def readtxt(filepath, encoding='utf-8'):
    # use a context manager so the file handle is closed
    with open(filepath, mode='r', encoding=encoding) as f:
        words = [line.strip() for line in f]
    return words
# Tokenizer: keep nouns, drop stopwords and single-character tokens
def cut_word(text):
    # Load a user-defined dictionary if needed
    # jieba.load_userdict('user_dict.txt')
    # Load the stopword list (re-read on every call; hoist it to module level for large corpora)
    stopwords = readtxt(r'...\stopwords_cn.txt', encoding='utf-8')
    sentence = ""
    checkarr = ['n']  # POS tags to keep: nouns only
    for word, flag in pseg.lcut(text):
        if (flag in checkarr) and (word not in stopwords) and (len(word) > 1):
            sentence = sentence + word + " "
    return sentence
# Vectorization: bag-of-words count matrix
def word_vectorizer(n_features, max_df=0.5, min_df=3):
    cv = CountVectorizer(strip_accents='unicode',  # strip accents via unicode normalization during preprocessing
                         max_features=n_features,
                         max_df=max_df,  # ignore words whose document frequency is above max_df
                         min_df=min_df)  # ignore words whose document frequency is below min_df
    return cv
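# A minimal alternative sketch (an assumption, not part of the original pipeline):
# TfidfVectorizer is imported above but unused, so if you want to experiment with
# TF-IDF weighting instead of raw counts, a parallel helper could look like this.
# Note that LDA is formulated over term counts, so treat TF-IDF input as an
# experiment rather than a drop-in replacement.
def word_vectorizer_tfidf(n_features, max_df=0.5, min_df=3):
    tv = TfidfVectorizer(strip_accents='unicode',
                         max_features=n_features,
                         max_df=max_df,
                         min_df=min_df)
    return tv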
def lda_model(k, max_iter=50, method='online', learning_offset=50., random_state=0):
    lda = LatentDirichletAllocation(n_components=k, max_iter=max_iter,
                                    learning_method=method,
                                    learning_offset=learning_offset,
                                    random_state=random_state)
    return lda
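# A hedged sketch for picking the number of topics k (select_k is a hypothetical
# helper, not part of the original script): fit one model per candidate k and
# compare perplexity via LatentDirichletAllocation.perplexity (lower is better).
# Evaluating on the training matrix, as here, tends to favor larger k; a held-out
# split gives a fairer comparison.
def select_k(tf_matrix, k_candidates=(2, 4, 6, 8)):
    for k in k_candidates:
        model = lda_model(k)
        model.fit(tf_matrix)
        print('k=%d perplexity=%.2f' % (k, model.perplexity(tf_matrix)))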
# Print the top-p keywords of each topic whose weight exceeds the threshold
def print_keywords(lda, cv, threshold, p):
    weight_matrix = lda.components_
    tf_feature_names = cv.get_feature_names()  # on scikit-learn >= 1.0, prefer get_feature_names_out()
    for topic_id, weights in enumerate(weight_matrix):
        dicts = [(name, weight) for name, weight in zip(tf_feature_names, weights)]
        dicts = sorted(dicts, key=lambda x: x[1], reverse=True)  # sort feature words by weight, descending
        dicts = [word for word in dicts if word[1] > threshold]  # keep only words above the weight threshold
        dicts = dicts[:p]  # keep at most the top p words per topic
        print('Topic %d:' % topic_id, dicts)
if __name__ == '__main__':
    # Load the corpus
    text = readtxt(r'...\data\reviews.txt')
    # Word segmentation
    segged_words = [cut_word(x) for x in text]
    print(segged_words[0])
    # Vectorization
    n_features = 1000  # cap on the number of extracted feature words
    cv = word_vectorizer(n_features)
    tf = cv.fit_transform(segged_words)  # turn the keyword strings into a document-term count matrix
    # Fit the LDA model
    time_start = time.time()
    lda = lda_model(4)
    ldamodel = lda.fit_transform(tf)  # document-topic probability matrix
    time_end = time.time()
    print('time cost', time_end - time_start, 's')
    '''Inspection helpers for the fitted vocabulary'''
    # # Inspect the vocabulary (term -> column index)
    # print(cv.vocabulary_)
    # # Vocabulary size
    # print(len(cv.vocabulary_))
    # print(cv.get_feature_names())
    # # Number of extracted feature words
    # print(len(cv.get_feature_names()))
    # # Per-document frequency of each feature word
    # print(tf)
    # # Dense representation of all vectorized documents
    # print(tf.toarray())
    # # Cumulative frequency of each word across all documents
    # print(tf.toarray().sum(axis=0))
    # # Extract high-frequency words from the cumulative counts
    # # (1) collect the column indices of high-frequency words
    # fre = tf.toarray().sum(axis=0)
    # index_lst = []
    # for i in range(len(fre)):
    #     if fre[i] > 10:
    #         index_lst.append(i)
    # # (2) sort the vocabulary by column index, ascending
    # # (note: vocabulary_ values are column indices, not frequencies)
    # voca = list(cv.vocabulary_.items())
    # sorted_voca = sorted(voca, key=lambda x: x[1], reverse=False)
    # # (3) extract the high-frequency words
    # high_fre_voca = []
    # for i in sorted_voca:
    #     if i[1] in index_lst:
    #         high_fre_voca.append(i[0])
    # print(high_fre_voca)
    '''Inspect the topic probabilities of the training corpus under the fitted model'''
    # # Probability of each document belonging to each topic
    # proba = np.array(ldamodel)
    # print('Per-document topic probabilities:\n', proba)
    # # Compare the probabilities to assign each document its most likely topic
    # max_proba = np.argmax(proba, axis=1)  # index of the max along axis=1 (rows) = most likely topic id
    # print('Assigned topic per document:', max_proba)
    # # Per-topic weight of each feature word
    # weight_matrix = lda.components_  # components_ lives on the estimator, not on fit_transform's output
    # print(weight_matrix)
    # print(len(weight_matrix))
    # Print the top 5 keywords per topic (each with weight greater than 0.6)
    print_keywords(lda, cv, 0.6, 5)
    # Visualization
    d = pyLDAvis.sklearn.prepare(lda, tf, cv)
    pyLDAvis.show(d)
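    # If pyLDAvis.show() cannot open a browser (e.g. on a headless server), a
    # common alternative is to write the view to a standalone HTML file instead;
    # 'lda_vis.html' is an arbitrary output name:
    # pyLDAvis.save_html(d, 'lda_vis.html')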