Without further ado, straight to the code. The script below loads a review corpus, segments it with jieba (keeping nouns only), vectorizes the keywords with CountVectorizer, fits a scikit-learn LDA model, prints the top keywords per topic, and renders an interactive pyLDAvis view.
'''Implementation and visualization of an LDA topic model'''
import pandas as pd
import numpy as np
import jieba
import jieba.posseg as pseg
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis.sklearn  # in pyLDAvis >= 3.4.0 this module was renamed to pyLDAvis.lda_model
import time
# Helper: load a txt file as a list of stripped lines
def readtxt(filepath, encoding='utf-8'):
    # use a context manager so the file handle is closed
    with open(filepath, mode='r', encoding=encoding) as f:
        words = [line.strip() for line in f]
    return words
# Tokenizer: keep nouns, drop stopwords and single-character tokens
def cut_word(text):
    # Load a user-defined dictionary if needed
    # jieba.load_userdict('user_dict.txt')
    # Load the stopword list (re-read on every call; hoist it to module level for large corpora)
    stopwords = readtxt(r'...\stopwords_cn.txt', encoding='utf-8')
    sentence = ""
    checkarr = ['n']  # POS tags to keep: nouns only
    for word, flag in pseg.lcut(text):
        if (flag in checkarr) and (word not in stopwords) and (len(word) > 1):
            sentence = sentence + word + " "
    return sentence
# Vectorization: bag-of-words count matrix
def word_vectorizer(n_features, max_df=0.5, min_df=3):
    cv = CountVectorizer(strip_accents='unicode',  # strip accents via unicode normalization during preprocessing
                         max_features=n_features,
                         max_df=max_df,  # ignore words whose document frequency is above max_df
                         min_df=min_df)  # ignore words whose document frequency is below min_df
    return cv
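# A minimal alternative sketch (an assumption, not part of the original pipeline):
# TfidfVectorizer is imported above but unused, so if you want to experiment with
# TF-IDF weighting instead of raw counts, a parallel helper could look like this.
# Note that LDA is formulated over term counts, so treat TF-IDF input as an
# experiment rather than a drop-in replacement.
def word_vectorizer_tfidf(n_features, max_df=0.5, min_df=3):
    tv = TfidfVectorizer(strip_accents='unicode',
                         max_features=n_features,
                         max_df=max_df,
                         min_df=min_df)
    return tv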
def lda_model(k, max_iter=50, method='online', learning_offset=50., random_state=0):
    lda = LatentDirichletAllocation(n_components=k, max_iter=max_iter,
                                    learning_method=method,
                                    learning_offset=learning_offset,
                                    random_state=random_state)
    return lda
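# A hedged sketch for picking the number of topics k (select_k is a hypothetical
# helper, not part of the original script): fit one model per candidate k and
# compare perplexity via LatentDirichletAllocation.perplexity (lower is better).
# Evaluating on the training matrix, as here, tends to favor larger k; a held-out
# split gives a fairer comparison.
def select_k(tf_matrix, k_candidates=(2, 4, 6, 8)):
    for k in k_candidates:
        model = lda_model(k)
        model.fit(tf_matrix)
        print('k=%d perplexity=%.2f' % (k, model.perplexity(tf_matrix)))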
# Print the top-p keywords of each topic whose weight exceeds the threshold
def print_keywords(lda, cv, threshold, p):
    weight_matrix = lda.components_
    tf_feature_names = cv.get_feature_names()  # on scikit-learn >= 1.0, prefer get_feature_names_out()
    for topic_id, weights in enumerate(weight_matrix):
        dicts = [(name, weight) for name, weight in zip(tf_feature_names, weights)]
        dicts = sorted(dicts, key=lambda x: x[1], reverse=True)  # sort feature words by weight, descending
        dicts = [word for word in dicts if word[1] > threshold]  # keep only words above the weight threshold
        dicts = dicts[:p]  # keep at most the top p words per topic
        print('Topic %d:' % topic_id, dicts)
if __name__ == '__main__':
    # Load the corpus
    text = readtxt(r'...\data\reviews.txt')
    # Word segmentation
    segged_words = [cut_word(x) for x in text]
    print(segged_words[0])
    # Vectorization
    n_features = 1000  # cap on the number of extracted feature words
    cv = word_vectorizer(n_features)
    tf = cv.fit_transform(segged_words)  # turn the keyword strings into a document-term count matrix
    # Fit the LDA model
    time_start = time.time()
    lda = lda_model(4)
    ldamodel = lda.fit_transform(tf)  # document-topic probability matrix
    time_end = time.time()
    print('time cost', time_end - time_start, 's')
    '''Inspection helpers for the fitted vocabulary'''
    # # Inspect the vocabulary (term -> column index)
    # print(cv.vocabulary_)
    # # Vocabulary size
    # print(len(cv.vocabulary_))
    # print(cv.get_feature_names())
    # # Number of extracted feature words
    # print(len(cv.get_feature_names()))
    # # Per-document frequency of each feature word
    # print(tf)
    # # Dense representation of all vectorized documents
    # print(tf.toarray())
    # # Cumulative frequency of each word across all documents
    # print(tf.toarray().sum(axis=0))
    # # Extract high-frequency words from the cumulative counts
    # # (1) collect the column indices of high-frequency words
    # fre = tf.toarray().sum(axis=0)
    # index_lst = []
    # for i in range(len(fre)):
    #     if fre[i] > 10:
    #         index_lst.append(i)
    # # (2) sort the vocabulary by column index, ascending
    # # (note: vocabulary_ values are column indices, not frequencies)
    # voca = list(cv.vocabulary_.items())
    # sorted_voca = sorted(voca, key=lambda x: x[1], reverse=False)
    # # (3) extract the high-frequency words
    # high_fre_voca = []
    # for i in sorted_voca:
    #     if i[1] in index_lst:
    #         high_fre_voca.append(i[0])
    # print(high_fre_voca)
    '''Inspect the topic probabilities of the training corpus under the fitted model'''
    # # Probability of each document belonging to each topic
    # proba = np.array(ldamodel)
    # print('Per-document topic probabilities:\n', proba)
    # # Compare the probabilities to assign each document its most likely topic
    # max_proba = np.argmax(proba, axis=1)  # index of the max along axis=1 (rows) = most likely topic id
    # print('Assigned topic per document:', max_proba)
    # # Per-topic weight of each feature word
    # weight_matrix = lda.components_  # components_ lives on the estimator, not on fit_transform's output
    # print(weight_matrix)
    # print(len(weight_matrix))
    # Print the top 5 keywords per topic (each with weight greater than 0.6)
    print_keywords(lda, cv, 0.6, 5)
    # Visualization
    d = pyLDAvis.sklearn.prepare(lda, tf, cv)
    pyLDAvis.show(d)
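    # If pyLDAvis.show() cannot open a browser (e.g. on a headless server), a
    # common alternative is to write the view to a standalone HTML file instead;
    # 'lda_vis.html' is an arbitrary output name:
    # pyLDAvis.save_html(d, 'lda_vis.html')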