In [1]:
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.WARNING)
logging.root.level = logging.WARNING
In [2]:
from sklearn import datasets
news_dataset = datasets.fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
In [3]:
# The data attribute contains the list of raw text documents
documents = news_dataset.data
print("In the dataset there are", len(documents), "textual documents")
print("And this is the first one:\n", documents[0])
In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
In [5]:
def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

print("After the tokenizer, the previous document becomes:\n", tokenize(documents[0]))
In [6]:
processed_docs = [tokenize(doc) for doc in documents]
word_count_dict = gensim.corpora.Dictionary(processed_docs)
print "In the corpus there are", len(word_count_dict), "unique tokens"
In [7]:
word_count_dict.filter_extremes(no_below=20, no_above=0.1)  # keep tokens that appear in at least 20 documents and in at most 10% of the documents
In [8]:
print "After filtering, in the corpus there are only", len(word_count_dict), "unique tokens"
In [9]:
bag_of_words_corpus = [word_count_dict.doc2bow(pdoc) for pdoc in processed_docs]
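LdaModel works directly on these raw counts, but other gensim models (e.g. LSI) usually prefer TF-IDF weights. An optional sketch of that re-weighting step, not used below:
In [ ]:
# Re-weight the raw counts with TF-IDF (optional, the LDA model below uses the plain counts)
tfidf_model = gensim.models.TfidfModel(bag_of_words_corpus)
print(tfidf_model[bag_of_words_corpus[0]][:5])  # first few (token_id, weight) pairs of document 0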
In [10]:
bow_doc1 = bag_of_words_corpus[0]
print("Bag of words representation of the first document (each tuple is (token_id, count)):\n", bow_doc1)
print()
for i in range(5):
    print("In the document, token_id {} (word \"{}\") appears {} time[s]".format(bow_doc1[i][0], word_count_dict[bow_doc1[i][0]], bow_doc1[i][1]))
print("...")
In [11]:
# LDA, single core
lda_model = gensim.models.LdaModel(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)
# LDA, multicore (by default it uses n_cores - 1 worker processes)
# lda_model = gensim.models.LdaMulticore(bag_of_words_corpus, num_topics=10, id2word=word_count_dict, passes=5)
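Training on the full corpus takes a while, so it is worth persisting the result; a minimal sketch using gensim's save()/load() (the file name is just a placeholder):
In [ ]:
# Save the trained model to disk and reload it in a later session
lda_model.save('lda_20newsgroups.model')  # placeholder path
# lda_model = gensim.models.LdaModel.load('lda_20newsgroups.model')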
In [12]:
_ = lda_model.print_topics(-1)  # -1 means all topics; assigning to _ suppresses the raw cell output
In [13]:
for index, score in sorted(lda_model[bag_of_words_corpus[0]], key=lambda tup: -tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 10)))
In [14]:
news_dataset.target_names[news_dataset.target[0]]
Out[14]:
In [16]:
unseen_document = "In my spare time I either play badmington or drive my car"
print "The unseen document is composed by the following text:", unseen_document
print
bow_vector = word_count_dict.doc2bow(tokenize(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
print "Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5))
In [17]:
print "Log perplexity of the model is", lda_model.log_perplexity(bag_of_words_corpus)