利用python sklearn 库实现LDA主题建模

最新推荐文章于 2024-07-22 09:31:55 发布

qq_39630202

最新推荐文章于 2024-07-22 09:31:55 发布

阅读量1w

点赞数 8

分类专栏： python_NLP 文章标签： LDA

python_NLP 专栏收录该内容

1 篇文章 1 订阅

订阅专栏

利用python sklearn 库实现LDA主题建模

本文介绍了如何使用python中的sklearn机器学习库实现自然语言处理中的LDA主题建模。

1.导入相关模块及数据

本文所使用的数据来源于web of sci 上的论文摘要数据。

import pandas as pd
import numpy as np
import mglearn

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
#from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

abstract = pd.read_csv('final_data.csv')

#train_text = abstract.loc[0:999,]
#train_text = train_text[(['TI', 'JI', 'PY', 'C1', 'AB', 'AU'])]
input_data = abstract['AB']
input_data = list(input_data)
input_data[0]
[output]:

Out[48]:
'This Work Proposes A Real Time Estimator For Needle Tip Deflection And Needle Shape During Needle Insertion Into Soft Tissue  The Estimator Is Based On An Adaptive Quasi Static Mechanics Based Model For Needle Tissue Interactions  The Model Uses Euler Bernoulli Beam Theory To Model The Needle As A Cantilever Beam That Experiences Loads Imposed By The Tissue  The Modeled Needle Tissue Interactions Consist Of A Distributed Load Along The Inserted Needle Portion And Tissue Cutting Related Point Load At The Needle Tip  We Propose A Closed Form Solution To Quantify The Magnitude Of These Needle Tissue Interaction Loads Based On Force And Torque Measured At The Needle Base  The Model Adaptively Adjusts The Shape Of The Distributed Load As The Needle Is Inserted  Experiments Are Carried Out Into Gelatin Phantom And Porcine Tissue To Validate The Deflection Estimate s Performance  The Newly Proposed Model s Performance Is Compared Against A Previously Proposed Quasi Static Model For Needle Deflection Estimation  It Is Shown That The Novel Model Outperforms The Previously Proposed Model '

2.数据预处理

定义数据预处理函数，将原始数据全部变为小写，分词，去除停用词

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def textPreprocessing(text):
    #小写化
    text = [s.lower() for s in text]

    #去除特殊标点
    '''
    for c in string.punctuation:
        text = text.replace(c, ' ')
    '''
    #分词
    wordLst = [nltk.word_tokenize(txt) for txt in text]

    #去除停用词
    filtered = [w for w in wordLst  if w not in stopwords.words('english')]

    return filtered

input_text = textPreprocessing(input_data)
input_text[0]

[output]:
['this',
 'work',
 'proposes',
 'a',
 'real',
 'time',
 'estimator',
 'for',
 'needle',
 'tip',
 'deflection',
 'and',
 'needle',
 'shape',
 'during',
 'needle',
 'insertion',
 'into',
 'soft',
 'tissue',
 'the',
 'estimator',
 'is',
 'based',
 'on',
 'an',
 'adaptive',
 'quasi',
 'static',
 'mechanics',
 'based',
 'model',
 'for',
 'needle',
 'tissue',
 'interactions',
 'the',
 'model',
 'uses',
 'euler',
 'bernoulli',
 'beam',
 'theory',
 'to',
 'model',
 'the',
 'needle',
 'as',
 'a',
 'cantilever',
 'beam',
 'that',
 'experiences',
 'loads',
 'imposed',
 'by',
 'the',
 'tissue',
 'the',
 'modeled',
 'needle',
 'tissue',
 'interactions',
 'consist',
 'of',
 'a',
 'distributed',
 'load',
 'along',
 'the',
 'inserted',
 'needle',
 'portion',
 'and',
 'tissue',
 'cutting',
 'related',
 'point',
 'load',
 'at',
 'the',
 'needle',
 'tip',
 'we',
 'propose',
 'a',
 'closed',
 'form',
 'solution',
 'to',
 'quantify',
 'the',
 'magnitude',
 'of',
 'these',
 'needle',
 'tissue',
 'interaction',
 'loads',
 'based',
 'on',
 'force',
 'and',
 'torque',
 'measured',
 'at',
 'the',
 'needle',
 'base',
 'the',
 'model',
 'adaptively',
 'adjusts',
 'the',
 'shape',
 'of',
 'the',
 'distributed',
 'load',
 'as',
 'the',
 'needle',
 'is',
 'inserted',
 'experiments',
 'are',
 'carried',
 'out',
 'into',
 'gelatin',
 'phantom',
 'and',
 'porcine',
 'tissue',
 'to',
 'validate',
 'the',
 'deflection',
 'estimate',
 's',
 'performance',
 'the',
 'newly',
 'proposed',
 'model',
 's',
 'performance',
 'is',
 'compared',
 'against',
 'a',
 'previously',
 'proposed',
 'quasi',
 'static',
 'model',
 'for',
 'needle',
 'deflection',
 'estimation',
 'it',
 'is',
 'shown',
 'that',
 'the',
 'novel',
 'model',
 'outperforms',
 'the',
 'previously',
 'proposed',
 'model']

3.向量化

将文本数据向量化，作为LDA模型的输入，本文用TFIDF作为向量化的指标。

vect = TfidfVectorizer(max_features=10000, min_df=10, max_df=0.95,
                       stop_words='english')
X = vect.fit_transform(input_text)

TfidfVectorizer.fit_transform 返回的是Tf-idf-weighted document-term matrix，可通过如下方式访问。

feature_names=np.array(vect.get_feature_names())
sorted_by_tfidf = np.argsort(X.max(axis=0).toarray().ravel())
#输出TFIDF排序最小的20与最大的20个term
print(feature_names[sorted_by_tfidf[20:]])
print(feature_names[sorted_by_tfidf[-20:]])

同时可以输出按照逆文档矩阵idf输出在所有文档中出现频率都很高的term。

sorted_by_idf = np.argsort(vect.idf_)
print(feature_names[sorted_by_idf[:20]])

4.LDA主题建模

设置主题数量，学习方式，超参数α，β取默认值。

lda = LatentDirichletAllocation(n_topics=10, learning_method='batch', 
                                max_iter=25, random_state=0)
lda_topics = lda.fit_transform(X)

lda.fit_transform 返回的是document-topic matrix，lda.components_返回的是topic-term matrix

sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())

mglearn.tools.print_topics(topics=range(10), feature_names=feature_names,
                           sorting=sorting, topics_per_chunk=5, n_words=20)

4.结果展示

输出每个topic的top words

def print_top_words(model, feature_names, n_top_words):
    #打印每个主题下权重较高的term
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])

    #打印主题-词语分布矩阵
    print ("#主题-词语分布矩阵: \n" model.components_)

n_top_words=20
feature_names = vect.get_feature_names()
print_top_words(lda, feature_names, n_top_words)

输出Doc-Topic矩阵

lda_topics

收敛效果(perplexity)

lda.perplexity(X)

5.调参过程

可以调整的参数

n_topics: 主题的个数
n_features: feature的个数，即常用词个数
doc_topic_prior:即我们的文档主题先验Dirichlet分布θd的参数α
topic_word_prior:即我们的主题词先验Dirichlet分布βk的参数η
learning_method: 即LDA的求解算法，有’batch’和’online’两种选择
其余sklearn提供的参数：根据LDA求解算法的不同，存在一些其它参数可以调节，参见最后的附录：sklearn LDA API 中文解释。

两种可行的调参方案
一、以n_topics为例，按照perplexity的大小选择最佳模型。当然，topic数目的不同势必会导致perplexity计算的不同，因此perplexity仅能作为参考，topic数目还需要根据实际需求主观指定。n_topics调参代码如下：

n_topics = range(20, 75, 5)
perplexityLst = [1.0]*len(n_topics)

#训练LDA并打印训练时间
lda_models = []
for idx, n_topic in enumerate(n_topics):
    lda = LatentDirichletAllocation(n_topics=n_topic,
                                    max_iter=20,
                                    learning_method='batch',
                                    evaluate_every=200,
#                                    perp_tol=0.1, #default                                       
#                                    doc_topic_prior=1/n_topic, #default
#                                    topic_word_prior=1/n_topic, #default
                                    verbose=0)
    t0 = time()
    lda.fit(tf)
    perplexityLst[idx] = lda.perplexity(tf)
    lda_models.append(lda)
    print "# of Topic: %d, " % n_topics[idx],
    print "done in %0.3fs, N_iter %d, " % ((time() - t0), lda.n_iter_),
    print "Perplexity Score %0.3f" % perplexityLst[idx]

#打印最佳模型
best_index = perplexityLst.index(min(perplexityLst))
best_n_topic = n_topics[best_index]
best_model = lda_models[best_index]
print "Best # of Topic: ", best_n_topic

#绘制不同主题数perplexity的不同
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.plot(n_topics, perplexityLst)
ax.set_xlabel("# of topics")
ax.set_ylabel("Approximate Perplexity")
plt.grid(True)
plt.savefig(os.path.join('lda_result', 'perplexityTrend'+CODE+'.png'))
plt.show()

Output:
Best # of Topic:  25

二、如果想一次性调整所有参数也可以直接利用sklearn作cv，但是这样做的结果一定是，耗时十分长。以下代码仅供参考，可以根据自身的需求进行增减。

from sklearn.model_selection import GridSearchCV
parameters = {'learning_method':('batch', 'online'), 
              'n_topics':range(20, 75, 5),
              'perp_tol': (0.001, 0.01, 0.1),
              'doc_topic_prior':(0.001, 0.01, 0.05, 0.1, 0.2),
              'topic_word_prior':(0.001, 0.01, 0.05, 0.1, 0.2)
              'max_iter':1000}
lda = LatentDirichletAllocation()
model = GridSearch(lda, parameters)
model.fit(tf)

sorted(model.cv_results_.keys())