TF-IDF and scikit-learn LDA code for topic clustering, labeling each text

# -*- coding: utf-8 -*-

import jieba

# Register custom proper nouns so jieba does not split them during segmentation.
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('京州', True)
jieba.suggest_freq('桓温', True)
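If the list of custom words grows, jieba can also load them from a dictionary file instead of repeated suggest_freq calls; a minimal sketch, where userdict.txt is a hypothetical file with one word per line, optionally followed by a frequency and a POS tag:

# userdict.txt is a hypothetical user dictionary file.
jieba.load_userdict('userdict.txt')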
cors = []
files = ['1.txt', '2.txt', '3.txt']
for i in files:
    with open(i, 'r', encoding='utf-8') as f:
        tmp = f.read()
    # Segment the raw text and join the tokens with spaces so that
    # scikit-learn's whitespace-based tokenizers can handle the result.
    document_cut = jieba.cut(tmp)
    result = ' '.join(document_cut)
    cors.append(result)
    # Also save the segmented text alongside the original file.
    with open("trans" + i, 'w', encoding='utf-8') as f:
        f.write(result)
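The CountVectorizer call further down hints at a stop_words=stpwrdlst argument but never builds the list; a minimal sketch, where stop_words.txt is a hypothetical file with one stopword per line:

# stop_words.txt is a hypothetical stopword file, one word per line.
with open('stop_words.txt', 'r', encoding='utf-8') as f:
    stpwrdlst = [line.strip() for line in f if line.strip()]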

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [cors[0], cors[1]]
vector = TfidfVectorizer()
tfidf = vector.fit_transform(corpus)
print(tfidf)  # sparse matrix: (doc_index, word_index)  tf-idf weight



# Use get_feature_names() instead on scikit-learn < 1.0.
wordlist = vector.get_feature_names_out()  # every word in the bag-of-words vocabulary
# tf-idf matrix: element a[i][j] is the tf-idf weight of word j in document i.
weightlist = tfidf.toarray()
# Print the tf-idf weight of each word in each document: the outer loop
# iterates over the documents, the inner loop over the vocabulary.
for i in range(len(weightlist)):
    print("------- tf-idf weights for document", i, "-------")
    for j in range(len(wordlist)):
        print(wordlist[j], weightlist[i][j])
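Dumping every weight gets unwieldy once the vocabulary grows; a minimal sketch that keeps only the highest-weighted words per document (top_k = 10 is an arbitrary choice):

import numpy as np

top_k = 10
for i, row in enumerate(weightlist):
    top_idx = np.argsort(row)[::-1][:top_k]  # word indices, descending weight
    print("Document", i, "top words:",
          [(wordlist[j], round(float(row[j]), 4)) for j in top_idx])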



from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

corpus = [cors[0], cors[1], cors[2]]
cntVector = CountVectorizer()  # optionally: CountVectorizer(stop_words=stpwrdlst)
cntTf = cntVector.fit_transform(corpus)
print(cntTf)  # sparse term-count matrix: (doc_index, word_index)  count

# n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21.
lda = LatentDirichletAllocation(n_components=2, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
docres = lda.fit_transform(cntTf)

print(len(lda.components_[1]))  # vocabulary size (98 for this corpus)
print(docres)  # document-topic distribution, shape (n_docs, n_topics)
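To actually label each text, as the title promises, take the argmax of each row of docres; to see what each topic is about, sort each row of lda.components_ (the per-topic word pseudo-counts). A minimal sketch, assuming scikit-learn >= 1.0 for get_feature_names_out:

# Top 10 words per topic: lda.components_ has shape (n_topics, n_words).
feature_names = cntVector.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [feature_names[j] for j in topic.argsort()[::-1][:10]]
    print("Topic", topic_idx, ":", " ".join(top_words))

# Each row of docres is one document's topic distribution; argmax labels it.
labels = docres.argmax(axis=1)
for i, label in enumerate(labels):
    print("Document", i, "-> topic", label)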

