Latent Dirichlet Allocation (LDA)
1. The probability density function of the Dirichlet distribution is

$$p(\theta \mid \alpha) = \frac{\Gamma\left(\sum_{i=1}^{k} \alpha_i\right)}{\prod_{i=1}^{k} \Gamma(\alpha_i)} \prod_{i=1}^{k} \theta_i^{\alpha_i - 1}$$
where $\sum_{i=1}^{k} \theta_i = 1$, $\theta_i \geq 0$, $\alpha = (\alpha_1, \alpha_2, \cdots, \alpha_k)$, $\alpha_i > 0$, $i = 1, 2, \cdots, k$.
The Dirichlet distribution is the conjugate prior of the multinomial distribution.
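To see the conjugacy concretely (a standard one-line derivation, added here for completeness): if counts $n = (n_1, n_2, \cdots, n_k)$ are observed from a multinomial with parameter $\theta \sim \mathrm{Dir}(\alpha)$, the posterior is again a Dirichlet distribution:

$$p(\theta \mid n, \alpha) \propto \prod_{i=1}^{k} \theta_i^{n_i} \prod_{i=1}^{k} \theta_i^{\alpha_i - 1} = \prod_{i=1}^{k} \theta_i^{\alpha_i + n_i - 1}, \qquad \text{i.e.} \quad \theta \mid n \sim \mathrm{Dir}(\alpha_1 + n_1, \cdots, \alpha_k + n_k).$$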
2. Latent Dirichlet allocation (LDA) is a generative probabilistic model for collections of text. The model assumes that each topic is represented by a multinomial distribution over words and each text by a multinomial distribution over topics, and that the priors on both the word distributions and the topic distributions are Dirichlet distributions. LDA is a probabilistic graphical model and can be drawn in plate notation. In an LDA model, the word distribution of each topic, the topic distribution of each text, and the topic at each position in a text are latent variables; the word at each position in a text is the observed variable.
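The generative story can be sketched in a few lines of NumPy before turning to gensim (a minimal illustration; the sizes K, V, and N are assumptions made up for the sketch, not taken from the text above):

import numpy as np

rng = np.random.default_rng(0)
K, V, N = 2, 35, 8                        # assumed: 2 topics, 35-word vocabulary, 8 words per text
alpha, eta = np.ones(K), np.ones(V)       # symmetric Dirichlet priors

phi = rng.dirichlet(eta, size=K)          # each topic: a multinomial distribution over words
theta = rng.dirichlet(alpha)              # one text: a multinomial distribution over topics
z = rng.choice(K, size=N, p=theta)        # latent: the topic at each position of the text
w = [rng.choice(V, p=phi[k]) for k in z]  # observed: the word at each position
print(z, w)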
from gensim import corpora, models, similarities
from pprint import pprint

# Tokenize each line of the corpus file, lowercase it, and drop a small stop-word list.
stop_list = set('for a of the and to in'.split())
with open('./LDA_test.txt') as f:
    texts = [[word for word in line.strip().lower().split() if word not in stop_list]
             for line in f]
print('Text = ')
pprint(texts)
Text =
[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'management', 'system'],
['system', 'human', 'system', 'engineering', 'testing', 'eps'],
['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
['generation', 'random', 'binary', 'unordered', 'trees'],
['intersection', 'graph', 'paths', 'trees'],
['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
['graph', 'minors', 'survey']]
dictionary = corpora.Dictionary(texts)
print(dictionary)
Dictionary(35 unique tokens: ['abc', 'applications', 'computer', 'human', 'interface']...)
V = len(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
corpus_tfidf = models.TfidfModel(corpus)[corpus]
corpus_tfidf = corpus  # NOTE: overrides the TF-IDF weighting; the raw bag-of-words counts are used below
print('Bag-of-words corpus:')
for c in corpus_tfidf:
    print(c)
Bag-of-words corpus:
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
[(2, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]
[(4, 1), (10, 1), (12, 1), (13, 1), (14, 1)]
[(3, 1), (10, 2), (13, 1), (15, 1), (16, 1)]
[(8, 1), (11, 1), (12, 1), (17, 1), (18, 1), (19, 1), (20, 1)]
[(21, 1), (22, 1), (23, 1), (24, 1), (25, 1)]
[(24, 1), (26, 1), (27, 1), (28, 1)]
[(24, 1), (26, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]
[(9, 1), (26, 1), (30, 1)]
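Each pair above is (token_id, count). To check which token an id refers to, the dictionary can be queried in both directions (a small illustrative snippet; judging from the outputs above, id 26 should map to 'graph'):

print(dictionary.token2id['graph'])  # token -> id
print(dictionary[26])                # id -> token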
print('\nLSI Model:')
# Latent semantic indexing: project each text onto 2 latent dimensions via truncated SVD.
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [doc for doc in lsi[corpus_tfidf]]
pprint(topic_result)
LSI Model:
[[(0, 0.9334981916792661), (1, 0.10508952614085784)],
[(0, 2.031992374687028), (1, -0.047145314121734186)],
[(0, 1.5351342836582094), (1, 0.13488784052204578)],
[(0, 1.9540077194594532), (1, 0.21780498576074558)],
[(0, 1.2902472956004132), (1, -0.0022521437499251062)],
[(0, 0.022783081905507166), (1, -0.7778052604326751)],
[(0, 0.05671567576921101), (1, -1.1827703446704847)],
[(0, 0.1236000332064837), (1, -2.634306860823685)],
[(0, 0.23560627195889272), (1, -0.9407936203668306)]]
print('LSI Topics:')
pprint(lsi.print_topics(num_topics=2, num_words=5))
LSI Topics:
[(0,
'0.579*"system" + 0.376*"user" + 0.270*"eps" + 0.257*"response" + '
'0.257*"time"'),
(1,
'-0.480*"graph" + -0.464*"trees" + -0.361*"minors" + -0.266*"quasi" + '
'-0.266*"ordering"')]
# Cosine similarity between every pair of texts in the LSI space.
# For corpora too large for memory, similarities.Similarity can be used instead.
similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])
print('Similarity:')
pprint(list(similarity))
Similarity:
[array([ 1. , 0.9908607 , 0.9997008 , 0.9999994 , 0.9935261 ,
-0.08272626, -0.06414512, -0.06517283, 0.13288835], dtype=float32),
array([0.9908607 , 0.99999994, 0.9938636 , 0.99100804, 0.99976987,
0.0524564 , 0.07105229, 0.070025 , 0.2653665 ], dtype=float32),
array([ 0.9997008 , 0.9938636 , 0.99999994, 0.999727 , 0.99600756,
-0.05832579, -0.03971674, -0.04074576, 0.15709123], dtype=float32),
array([ 0.9999994 , 0.99100804, 0.999727 , 1. , 0.9936501 ,
-0.08163348, -0.06305084, -0.06407862, 0.13397504], dtype=float32),
array([0.9935261 , 0.99976987, 0.99600756, 0.9936501 , 0.99999994,
0.03102366, 0.04963995, 0.04861134, 0.24462426], dtype=float32),
array([-0.08272626, 0.0524564 , -0.05832579, -0.08163348, 0.03102366,
0.99999994, 0.99982643, 0.9998451 , 0.97674036], dtype=float32),
array([-0.06414512, 0.07105229, -0.03971674, -0.06305084, 0.04963995,
0.99982643, 1. , 0.9999995 , 0.9805657 ], dtype=float32),
array([-0.06517283, 0.070025 , -0.04074576, -0.06407862, 0.04861134,
0.9998451 , 0.9999995 , 1. , 0.9803632 ], dtype=float32),
array([0.13288835, 0.2653665 , 0.15709123, 0.13397504, 0.24462426,
0.97674036, 0.9805657 , 0.9803632 , 1. ], dtype=float32)]
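A common follow-up is to fold a new query text into the LSI space and rank the indexed texts against it; a minimal sketch reusing the dictionary, lsi, and similarity objects built above (the query string is made up for illustration):

query = 'human computer interaction'                   # hypothetical query
query_bow = dictionary.doc2bow(query.lower().split())  # bag-of-words vector for the query
query_lsi = lsi[query_bow]                             # project into the 2-D LSI space
sims = similarity[query_lsi]                           # cosine similarity to every indexed text
print(sorted(enumerate(sims), key=lambda item: -item[1]))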
print('\nLDA Model:')
num_topics = 2
# alpha='auto' and eta='auto' let gensim learn asymmetric Dirichlet priors from the data.
lda = models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                      alpha='auto', eta='auto', minimum_probability=0.001, passes=10)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print('Document-Topic:\n')
pprint(doc_topic)
LDA Model:
Document-Topic:
[[(0, 0.9754764), (1, 0.024523618)],
[(0, 0.019281428), (1, 0.98071855)],
[(0, 0.026525376), (1, 0.9734746)],
[(0, 0.022324322), (1, 0.9776757)],
[(0, 0.019260732), (1, 0.9807393)],
[(0, 0.026561616), (1, 0.9734383)],
[(0, 0.9586078), (1, 0.041392185)],
[(0, 0.97852516), (1, 0.021474862)],
[(0, 0.9459338), (1, 0.054066237)]]
for doc_topic in lda.get_document_topics(corpus_tfidf):
    print(doc_topic)
[(0, 0.9754717), (1, 0.024528308)]
[(0, 0.019281477), (1, 0.9807185)]
[(0, 0.026525391), (1, 0.9734746)]
[(0, 0.02232408), (1, 0.9776759)]
[(0, 0.019261276), (1, 0.9807387)]
[(0, 0.026561601), (1, 0.97343844)]
[(0, 0.95860773), (1, 0.041392237)]
[(0, 0.97852516), (1, 0.021474792)]
[(0, 0.94593143), (1, 0.0540686)]
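To reduce these distributions to a single label per text, take the most probable topic (a small follow-on sketch using the lda model above):

for i, doc in enumerate(lda.get_document_topics(corpus_tfidf)):
    top_topic, prob = max(doc, key=lambda pair: pair[1])
    print('text %d -> topic %d (p=%.3f)' % (i, top_topic, prob))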
for topic_id in range(num_topics):
    print('Topic', topic_id)
    # pprint(lda.get_topic_terms(topicid=topic_id))
    pprint(lda.show_topic(topic_id))
# Cosine similarity between every pair of texts in the LDA topic space.
similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
print('Similarity:')
pprint(list(similarity))
# Hierarchical Dirichlet process (HDP): a nonparametric extension of LDA that
# infers the number of topics from the data.
hdp = models.HdpModel(corpus_tfidf, id2word=dictionary)
topic_result = [doc for doc in hdp[corpus_tfidf]]
print('\n\nUSE WITH CARE--\nHDP Model:')
pprint(topic_result)
print('HDP Topics:')
print(hdp.print_topics(num_topics=2, num_words=5))
Topic 0
[('graph', 0.08828391),
('trees', 0.06360026),
('minors', 0.062823996),
('interface', 0.038095064),
('quasi', 0.038075138),
('iv', 0.038073055),
('widths', 0.03807044),
('well', 0.038069926),
('ordering', 0.03806954),
('machine', 0.038058978)]
Topic 1
[('system', 0.09442629),
('user', 0.07338805),
('eps', 0.05244716),
('time', 0.05240226),
('response', 0.052366935),
('survey', 0.03377131),
('human', 0.031527326),
('computer', 0.03152715),
('interface', 0.03149664),
('testing', 0.03147464)]
Similarity:
[array([1. , 0.0447779 , 0.05235166, 0.04794651, 0.04475633,
0.05239008, 0.99983776, 0.9999949 , 0.9994896 ], dtype=float32),
array([0.0447779 , 0.99999994, 0.9999713 , 0.99999493, 1. ,
0.9999709 , 0.06276947, 0.04158859, 0.07667071], dtype=float32),
array([0.05235166, 0.9999713 , 1. , 0.9999903 , 0.99997115,
1. , 0.07033537, 0.04916349, 0.08422884], dtype=float32),
array([0.04794651, 0.99999493, 0.9999903 , 1. , 0.99999493,
0.9999901 , 0.06593491, 0.04475765, 0.07983299], dtype=float32),
array([0.04475633, 1. , 0.99997115, 0.99999493, 1. ,
0.99997085, 0.06274792, 0.04156702, 0.07664918], dtype=float32),
array([0.05239008, 0.9999709 , 1. , 0.9999901 , 0.99997085,
1. , 0.07037374, 0.04920191, 0.08426717], dtype=float32),
array([0.99983776, 0.06276947, 0.07033537, 0.06593491, 0.06274792,
0.07037374, 1.0000001 , 0.9997751 , 0.9999029 ], dtype=float32),
array([0.9999949 , 0.04158859, 0.04916349, 0.04475765, 0.04156702,
0.04920191, 0.9997751 , 1. , 0.9993825 ], dtype=float32),
array([0.9994896 , 0.07667071, 0.08422884, 0.07983299, 0.07664918,
0.08426717, 0.9999029 , 0.9993825 , 1.0000001 ], dtype=float32)]
USE WITH CARE--
HDP Model:
[[(0, 0.3385901515551263),
(1, 0.024332489738812427),
(2, 0.01780382602946349),
(3, 0.580447299144083),
(4, 0.010138001225330678)],
[(0, 0.03424188658940884),
(1, 0.5566573349965529),
(2, 0.017788427153251667),
(3, 0.3524864311382352),
(4, 0.010137693156735962)],
[(0, 0.04482326829662072),
(1, 0.8621806661859474),
(2, 0.023707155474764582),
(3, 0.017520703315317156),
(4, 0.0135172358417456),
(5, 0.010159495048391856)],
[(0, 0.04080311605433921),
(1, 0.8794631547975355),
(2, 0.02034191778157657),
(3, 0.015019088645223198),
(4, 0.011586181375702368)],
[(0, 0.9062162755483415),
(1, 0.024048766797768573),
(2, 0.01776234625322287),
(3, 0.013146929595269315),
(4, 0.010137458581858801)],
[(0, 0.04912263171176724),
(1, 0.8578150640645474),
(2, 0.023775971444330066),
(3, 0.017518497838668406),
(4, 0.013516869497629425),
(5, 0.010159489600541211)],
[(0, 0.8497682393036221),
(1, 0.038671217789192094),
(2, 0.02841938868741942),
(3, 0.021020044830198512),
(4, 0.01621994530220803),
(5, 0.012191393097068709)],
[(0, 0.29836838530266463),
(1, 0.021663909359736146),
(2, 0.6337747834905785),
(3, 0.011680816826612034)],
[(0, 0.8119777649636178),
(1, 0.04837369201237244),
(2, 0.03570541856736731),
(3, 0.02629055822567128),
(4, 0.020276110364093925),
(5, 0.015239242133459785),
(6, 0.010984096790510562)]]
HDP Topics:
[(0, '0.116*unordered + 0.078*perceived + 0.065*well + 0.059*time + 0.058*abc'), (1, '0.139*opinion + 0.092*paths + 0.086*widths + 0.053*random + 0.045*quasi')]
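Finally, topic models are usually sanity-checked with quantitative diagnostics; a minimal sketch of two standard gensim measures for the lda model above (computed on the training corpus only because this toy corpus is too small to hold out a test split):

from gensim.models import CoherenceModel

print('log perplexity:', lda.log_perplexity(corpus_tfidf))  # per-word likelihood bound; closer to 0 is better
cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
print('c_v coherence:', cm.get_coherence())                 # higher usually means more interpretable topics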