将文章分为十类:
def loadCorpusFromFile(self, fn, stopwords):
    """Build the vocabulary and document-term count matrix from a corpus file.

    Args:
        fn: path to a UTF-8 text file; each physical line is treated as one
            document when building the count matrix.
        stopwords: iterable of tokens dropped during the first segmentation pass.

    Side effects:
        Sets self.vocab (ordered list of unique terms) and
        self.ppCountMatrix (ndarray of shape (n_lines, len(vocab))).
    """
    # Read once inside `with` so the handle is closed even on error; the
    # original kept the file open for a later seek(0) re-read.
    with open(fn, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Pass 1: split the whole text on Chinese full stops, segment each
    # sentence in jieba's search-engine mode, drop caller stopwords, and
    # join surviving tokens with single spaces.
    kept_tokens = []
    for sentence in "".join(lines).split("。"):
        for token in jieba.cut_for_search(sentence):
            if token not in stopwords:
                kept_tokens.append(token)
    text = " ".join(kept_tokens)

    # Pass 2: re-segment with the user dictionary loaded and the instance
    # stop-word list applied to derive the vocabulary (first-seen order).
    jieba.load_userdict('stopwords\\userdict.txt')
    seg_list = [w for w in jieba.cut(text) if w not in self.stop_words]
    self.vocab = []
    for word in ' '.join(seg_list).split(" "):
        if word != u' ' and word not in self.vocab:
            self.vocab.append(word)

    # Map each term to its column once; calling list.index() per word in
    # the counting loop below would make this pass quadratic.
    index_of = {word: i for i, word in enumerate(self.vocab)}

    # Pass 3: one count vector per input line (document).
    count_matrix = []
    for line in lines:
        # np.int was removed in NumPy 1.20; the builtin int is equivalent.
        count = np.zeros(len(self.vocab), dtype=int)
        segs = [w for w in jieba.cut(line.strip())
                if w not in self.stop_words and len(w) > 1]
        for word in ' '.join(segs).split(" "):
            col = index_of.get(word)
            if col is not None:
                count[col] += 1
        count_matrix.append(count)
    self.ppCountMatrix = np.array(count_matrix)
    print("load corpus from %s success!" % fn)
进行迭代训练:
def fitModel(self, n_iter=1500, _alpha=0.1, _eta=0.01):
    """Train an LDA model on the previously loaded count matrix.

    Args:
        n_iter: number of Gibbs-sampling iterations.
        _alpha: document-topic Dirichlet prior.
        _eta: topic-word Dirichlet prior.
    """
    # Fixed random_state keeps repeated runs reproducible.
    params = dict(
        n_topics=self.n_topic,
        n_iter=n_iter,
        alpha=_alpha,
        eta=_eta,
        random_state=1,
    )
    self.model = lda.LDA(**params)
    self.model.fit(self.ppCountMatrix)
分析每个句子属于哪一类topic:
def no_printDoc_Topic(self):
    """Count how many documents fall into each topic.

    Returns:
        A list of length self.n_topic whose entry k is the number of
        documents whose most probable topic (argmax of doc_topic_) is k.
    """
    counts = [0] * self.n_topic
    for doc_idx in range(len(self.ppCountMatrix)):
        top_topic = self.model.doc_topic_[doc_idx].argmax()
        counts[top_topic] += 1
    return counts
def no_printDoc_Topic_juzi(self, juzikey):
    """Flag the documents that belong to topic `juzikey`.

    Args:
        juzikey: topic index to match against each document's top topic.

    Returns:
        A list of 0/1 flags, one per document: 1 when the document's most
        probable topic equals juzikey, else 0.
    """
    doc_count = len(self.ppCountMatrix)
    return [
        1 if self.model.doc_topic_[i].argmax() == juzikey else 0
        for i in range(doc_count)
    ]
求出topic的所有句子,并且进行分词、去掉停用词、进行词性标注之后输出名词:
def load_stopwordslist(path):
    """Read a stop-word file (one word per line) into a list.

    Args:
        path: path to a UTF-8 encoded text file.

    Returns:
        List of stripped lines; blank lines become '' entries, matching the
        original behavior.
    """
    # `with` guarantees the handle is closed; the original leaked it.
    with codecs.open(path, 'r', encoding='utf8') as f:
        return [line.strip() for line in f.readlines()]
def gengxinzhengwen(path):
    """Normalize the article at `path` into one sentence per line.

    Strips newlines from the source, splits the text on the Chinese full
    stop '。', and writes each resulting fragment on its own line to
    'cut_imgs_txt\\zhengwen2.txt'.

    Args:
        path: path to the UTF-8 source article.
    """
    # Read inside `with` so the input handle is closed (original leaked it);
    # join is linear, unlike the original's repeated `+=` concatenation.
    with open(path, 'r', encoding='utf-8') as f:
        full_text = "".join(line.replace('\n', '') for line in f)
    with open('cut_imgs_txt\\zhengwen2.txt', 'w', encoding='utf-8') as out:
        for sentence in full_text.split("。"):
            out.write(sentence)
            out.write('\n')
def cut_gengxinzhengwen(path, stop):
    """Segment each line of `path` with jieba and drop stop words.

    The surviving tokens of every line are concatenated (no separators) and
    written to 'cut_imgs_txt\\zhengwen3.txt'.

    Args:
        path: UTF-8 input file, one sentence per line.
        stop: container of tokens to discard.
    """
    # Read inside `with` so the input handle is closed (original leaked it).
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open('cut_imgs_txt\\zhengwen3.txt', 'w', encoding='utf-8') as out:
        for line in lines:
            # Precise-mode segmentation, same as the original cut_all=False.
            kept = [tok for tok in jieba.cut(line, cut_all=False)
                    if tok not in stop]
            out.write("".join(kept))
def duiyingtopdejuzijihe(path, values):
    """Copy the flagged lines of `path` to 'cut_imgs_txt\\zhengwen4.txt'.

    Args:
        path: UTF-8 input file, one sentence per line.
        values: per-line 0/1 flags (as produced by no_printDoc_Topic_juzi);
            only lines whose flag is non-zero are written out.
    """
    # Read inside `with` so the input handle is closed (original leaked it).
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    with open('cut_imgs_txt\\zhengwen4.txt', 'w', encoding='utf-8') as out:
        # zip pairs each line with its flag, replacing the manual counter
        # and the redundant else/continue branch.
        for line, flag in zip(lines, values):
            if flag != 0:
                out.write(line)
def mingcishuchu(path, stop):
    """Print the unique nouns of the file at `path`, space-separated.

    Uses jieba's POS tagger (flag 'n' marks nouns), keeps first-seen order,
    and skips words listed in `stop`.

    Args:
        path: UTF-8 input file.
        stop: container of words to discard.
    """
    # Read inside `with` so the handle is closed (original leaked it);
    # f.read() is equivalent to joining readlines() and avoids the
    # quadratic `+=` concatenation.
    with open(path, 'r', encoding='utf-8') as f:
        value = f.read()
    jieba.load_userdict('stopwords\\userdict.txt')
    nouns = []
    for word in pseg.lcut(value):
        if word.word not in stop and word.flag == 'n':
            if word.word not in nouns:
                nouns.append(word.word)
    # Trailing space matches the original string-building loop.
    print("".join(w + " " for w in nouns))
核心代码:
class LDA_v20161130():
    """Wrapper around the `lda` package for Chinese text topic modeling.

    Handles corpus loading (jieba segmentation + count matrix), model
    training, and reporting/saving of topic-word and doc-topic results.
    """

    def __init__(self, topics=2):
        """Initialize an untrained model.

        Args:
            topics: number of LDA topics to train with.
        """
        self.n_topic = topics
        self.corpus = None
        # Ordered list of unique vocabulary terms (set by loadCorpusFromFile).
        self.vocab = None
        # Document-term count matrix, shape (n_docs, len(vocab)).
        self.ppCountMatrix = None
        # Default punctuation stop words; replaceable via setStopWords().
        self.stop_words = [u',', u'。', u'、', u'(', u')', u'·', u'!', u' ', u':', u'“', u'”', u'\n']
        self.model = None

    def loadCorpusFromFile(self, fn, stopwords):
        """Build the vocabulary and document-term count matrix from `fn`.

        Args:
            fn: path to a UTF-8 text file; each physical line is one document.
            stopwords: tokens dropped during the first segmentation pass.

        Side effects:
            Sets self.vocab and self.ppCountMatrix.
        """
        # Read once inside `with`; the original kept the handle open for a
        # later seek(0) re-read and only closed it at the end.
        with open(fn, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Pass 1: split on Chinese full stops, segment in search-engine
        # mode, drop caller stopwords, join survivors with single spaces.
        kept_tokens = []
        for sentence in "".join(lines).split("。"):
            for token in jieba.cut_for_search(sentence):
                if token not in stopwords:
                    kept_tokens.append(token)
        text = " ".join(kept_tokens)

        # Pass 2: re-segment with the user dictionary and the instance
        # stop-word list to derive the vocabulary (first-seen order).
        jieba.load_userdict('stopwords\\userdict.txt')
        seg_list = [w for w in jieba.cut(text) if w not in self.stop_words]
        self.vocab = []
        for word in ' '.join(seg_list).split(" "):
            if word != u' ' and word not in self.vocab:
                self.vocab.append(word)

        # Map each term to its column once; calling list.index() per word
        # below would make the counting pass quadratic.
        index_of = {word: i for i, word in enumerate(self.vocab)}

        # Pass 3: one count vector per input line (document).
        count_matrix = []
        for line in lines:
            # np.int was removed in NumPy 1.20; builtin int is equivalent.
            count = np.zeros(len(self.vocab), dtype=int)
            segs = [w for w in jieba.cut(line.strip())
                    if w not in self.stop_words and len(w) > 1]
            for word in ' '.join(segs).split(" "):
                col = index_of.get(word)
                if col is not None:
                    count[col] += 1
            count_matrix.append(count)
        self.ppCountMatrix = np.array(count_matrix)
        print("load corpus from %s success!" % fn)

    def setStopWords(self, word_list):
        """Replace the instance stop-word list used during segmentation."""
        self.stop_words = word_list

    def fitModel(self, n_iter=1500, _alpha=0.1, _eta=0.01):
        """Train the LDA model on self.ppCountMatrix.

        Args:
            n_iter: number of Gibbs-sampling iterations.
            _alpha: document-topic Dirichlet prior.
            _eta: topic-word Dirichlet prior.
        """
        # Fixed random_state keeps repeated runs reproducible.
        self.model = lda.LDA(n_topics=self.n_topic, n_iter=n_iter,
                             alpha=_alpha, eta=_eta, random_state=1)
        self.model.fit(self.ppCountMatrix)

    def printTopic_Word(self, n_top_word):
        """Print the top words of the most populated topic.

        Args:
            n_top_word: how many top words to print; 0 prints all words
                except the lowest-ranked one (preserving the original slice).
        """
        # BUG FIX: the original referenced an undefined global `values`;
        # it is the per-topic document count that no_printDoc_Topic returns,
        # so compute it here.
        values = self.no_printDoc_Topic()
        best_topic = values.index(max(values))
        for i, topic_dist in enumerate(self.model.topic_word_):
            if i != best_topic:
                continue
            if n_top_word != 0:
                topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-(n_top_word + 1):-1]
            else:
                topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-1]
            print("Topic:", i, "\t")
            for word in topic_words:
                print(word, end=" ")
            print('\n')

    def no_printDoc_Topic(self):
        """Return a list where entry k counts documents whose top topic is k."""
        counts = [0] * self.n_topic
        for i in range(len(self.ppCountMatrix)):
            counts[self.model.doc_topic_[i].argmax()] += 1
        return counts

    def no_printDoc_Topic_juzi(self, juzikey):
        """Return per-document 0/1 flags: 1 when the top topic is `juzikey`."""
        return [
            1 if self.model.doc_topic_[i].argmax() == juzikey else 0
            for i in range(len(self.ppCountMatrix))
        ]

    def printVocabulary(self):
        """Print every vocabulary term, one per line."""
        print("vocabulary:")
        for word in self.vocab:
            print(word)
        print('\n')

    def saveVocabulary(self, fn):
        """Write the vocabulary to `fn`, one term per line (UTF-8)."""
        # `with` guarantees the handle is closed on error paths too.
        with codecs.open(fn, 'w', 'utf-8') as f:
            for word in self.vocab:
                f.write("%s\n" % word)

    def saveTopic_Words(self, fn, n_top_word=-1):
        """Write the top words of every topic to `fn`.

        Args:
            fn: output path (UTF-8).
            n_top_word: words per topic; -1 means the whole vocabulary.
        """
        if n_top_word == -1:
            n_top_word = len(self.vocab)
        with codecs.open(fn, 'w', 'utf-8') as f:
            for i, topic_dist in enumerate(self.model.topic_word_):
                topic_words = np.array(self.vocab)[np.argsort(topic_dist)][:-(n_top_word + 1):-1]
                f.write("Topic:%d\t" % i)
                for word in topic_words:
                    f.write("%s " % word)
                f.write("\n")

    def saveDoc_Topic(self, fn):
        """Write each document's top topic and topic distribution to `fn`."""
        with codecs.open(fn, 'w', 'utf-8') as f:
            for i in range(len(self.ppCountMatrix)):
                f.write("Doc %d:((top topic:%s) topic distribution:%s)\n" % (
                    i, self.model.doc_topic_[i].argmax(), self.model.doc_topic_[i]))
运行结果: