# -*- coding: utf-8 -*-
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models, similarities


def main():
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    # Compile the sample documents into a list.
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    tokenizer = RegexpTokenizer(r'\w+')
    p_stemmer = PorterStemmer()
    en_stop = get_stop_words('en')

    # Preprocess each document: lowercase, tokenize, drop stop words, stem.
    texts = []
    for raw in doc_set:
        tokens = tokenizer.tokenize(raw.lower())
        stopped_tokens = [i for i in tokens if i not in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)

    dictionary = corpora.Dictionary(texts)
    print(dictionary)
    print(dictionary.token2id)

    # Bag-of-words model: each document becomes a sparse vector of (token_id, count) pairs.
    corpus = [dictionary.doc2bow(text) for text in texts]
    print(corpus)

    # num_topics: required. LDA asks the user to decide how many topics to generate;
    #   since our document set is small, we only generate two topics here.
    # id2word: required. LdaModel uses the dictionary built above to map ids back to word strings.
    # passes: optional. The number of passes the model makes over the corpus. More passes
    #   make the model more accurate, but for a very large corpus they take a long time.
    ldamodel = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    # print(ldamodel.print_topics(num_topics=2, num_words=4))

    # Branch 1: build a TF-IDF model and a similarity index over the corpus.
    tfidf = models.TfidfModel(corpus)
    print(tfidf)
    corpus_tfidf = tfidf[corpus]
    print(corpus_tfidf)
    # 'Similarity-tfidf-index' is the prefix for the index shard files; num_features
    # must be at least the dictionary size (600 is a safe upper bound here).
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf, num_features=600)
    print(similarity)

    # Run a new sentence through the same preprocessing pipeline, then into TF-IDF space.
    new_sentence = "My mother spends a lot of time driving my brother around to baseball practice"
    tokens = tokenizer.tokenize(new_sentence.lower())
    tokens1 = [i for i in tokens if i not in en_stop]
    new_sen = [p_stemmer.stem(i) for i in tokens1]
    test_corpus_1 = dictionary.doc2bow(new_sen)
    vec_tfidf = tfidf[test_corpus_1]
    print(vec_tfidf)

    # Invert token2id so each TF-IDF weight can be printed next to a readable word.
    id2token = {value: key for key, value in dictionary.token2id.items()}
    print(id2token)
    for (key, freq) in vec_tfidf:
        print(id2token[key], freq)


if __name__ == '__main__':
    main()
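The script builds the Similarity index but never actually queries it. Below is a minimal sketch of that missing step, assuming it is appended at the end of main() so that similarity, vec_tfidf, and doc_set are still in scope; indexing the Similarity object with a TF-IDF vector returns one cosine-similarity score per indexed document.

# Score the new sentence's TF-IDF vector against every indexed document.
sims = similarity[vec_tfidf]
for doc_id, score in sorted(enumerate(sims), key=lambda pair: -pair[1]):
    print(doc_id, score, doc_set[doc_id])

Since the new sentence is identical to doc_b, document 1 should come back with a score of (almost) 1.0.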
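To inspect what the two-topic LDA model actually learned, and which topic mixture it assigns to the unseen sentence, something like the following can also go inside main(). print_topics and get_document_topics are standard LdaModel methods, though the exact weights will vary from run to run since the model is randomly initialized.

# Each topic is shown as a weighted combination of stemmed words.
for topic_id, words in ldamodel.print_topics(num_topics=2, num_words=4):
    print(topic_id, words)

# Topic distribution inferred for the new sentence's bag-of-words vector.
print(ldamodel.get_document_topics(test_corpus_1))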