LDA

A minimal LDA topic-modeling walkthrough with gensim: tokenize a handful of sample documents, drop stop words, stem, build a dictionary and bag-of-words corpus, train an LdaModel, and finally score a new sentence against the corpus through a TF-IDF similarity index.

# -*- coding: utf-8 -*-
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models, similarities

def main():
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]
    # print(doc_set)


    # Tokenize on word characters, drop English stop words, and stem with Porter
    tokenizer = RegexpTokenizer(r'\w+')
    p_stemmer = PorterStemmer()
    en_stop = get_stop_words('en')

    texts = []
    for raw in doc_set:
        raw = raw.lower()
        tokens = tokenizer.tokenize(raw)
        # print(tokens)

        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    # print(texts)

    # Build the id <-> token dictionary and the bag-of-words corpus:
    # doc2bow turns each document into a list of (token_id, token_count) pairs
    dictionary = corpora.Dictionary(texts)
    print(dictionary)
    print(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(corpus)
    """
    num_topics: 必须。LDA 模型要求用户决定应该生成多少个主题。由于我们的文档集很小,所以我们只生成三个主题。
    id2word:必须。LdaModel 类要求我们之前的 dictionary  id 都映射成为字符串。
    passes:可选。模型遍历语料库的次数。遍历的次数越多,模型越精确。但是对于非常大的语料库,遍历太多次会花费很长的时间。
    """
    ldamodel=LdaModel(corpus,num_topics=2,id2word=dictionary,passes=20)
    # print ldamodel.print_topics(num_topics=3, num_words=4)
    # Branch 1: build a TF-IDF model on the same bag-of-words corpus
    tfidf = models.TfidfModel(corpus)
    print(tfidf)
    corpus_tfidf = tfidf[corpus]
    print(corpus_tfidf)
    # Similarity index over the tf-idf corpus; num_features must cover the whole
    # vocabulary, so use the dictionary size rather than a hard-coded 600
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf,
                                         num_features=len(dictionary))
    print(similarity)
    # """使用tf-idf 模型得出该评论集的tf-idf 模型"""
    # corpus_tfidf = tfidf[corpus]
    new_sensence = "My mother spends a lot of time driving my brother around to baseball practice"
    tokens = tokenizer.tokenize(new_sensence.lower())
    tokens1 = [i for i in tokens if not i in en_stop]
    new_sen = [p_stemmer.stem(i) for i in tokens1]
    test_corpus_1 = dictionary.doc2bow(new_sen)
    vec_tfidf = tfidf[test_corpus_1]
    print vec_tfidf
    id2token={value:key for key,value in dictionary.token2id.items()}
    print id2token
    for (key,freq) in vec_tfidf:
        print id2token[key],freq
if __name__ == '__main__':
    main()
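
The listing builds the similarity index and the tf-idf vector of the new sentence but never queries one against the other, and the sentence's topic mixture is never printed. Below is a minimal sketch of both steps, meant to sit at the end of main() right after the vec_tfidf loop; it reuses similarity, vec_tfidf, ldamodel and test_corpus_1 from above, and the exact scores will vary between runs because LDA training is randomized:

    # Rank the five training documents by tf-idf cosine similarity to the new sentence;
    # the Similarity index returns one score per document it was built from
    sims = similarity[vec_tfidf]
    for doc_id, score in sorted(enumerate(sims), key=lambda x: -x[1]):
        print(doc_id, score)

    # Topic mixture of the new sentence under the trained LDA model,
    # returned as a list of (topic_id, probability) pairs
    print(ldamodel[test_corpus_1])

Because the query sentence is essentially doc_b, document index 1 should come back with a similarity at or very close to 1.0.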