%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from time import time
from sklearn.datasets import load_files
# Load the training split of the newsgroup corpus from disk and start the
# timer that the later "done in ... seconds" report reads.
print("loading train dataset ..." )
t = time()  # start timestamp; the elapsed time is printed after the summary
# Only the directory path is passed, so every other load_files option keeps
# its default.
news_train = load_files('datasets/mlcomp/379/train' )
# Bare expressions for interactive inspection: in a notebook only the last
# expression of a cell is echoed, so when these share one cell only
# target_names is displayed.
# NOTE(review): data / target are presumably the raw documents and their
# integer category labels - confirm against the load_files documentation.
news_train.data
news_train.target
# Category names of the corpus; the summary printed afterwards reports
# len(news_train.target_names) as the number of categories.
news_train.target_names
loading train dataset ...
['alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc']
# Report how many documents and categories were loaded, then how long the
# load took (measured from the `t` timestamp taken before load_files ran).
n_documents = len(news_train.data)
n_categories = len(news_train.target_names)
print("summary: {0} documents in {1} categories." .format(n_documents, n_categories))
# The clock is read only after the summary has been printed, so the reported
# duration includes that print as well.
elapsed_seconds = time() - t
print("done in {0} seconds" .format(elapsed_seconds))
summary: 13180 documents in 20 categories.
done in 3.2623984813690186 seconds
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the training documents into a sparse TF-IDF feature matrix.
print("vectorizing train dataset ...")
t = time()  # reset the start timestamp for the vectorizing step
# load_files was called without an encoding, so the documents are presumably
# still raw bytes; the vectorizer is told to decode them as latin-1.
vectorizer = TfidfVectorizer(encoding='latin-1')
# fit_transform takes any iterable of documents, so the list is passed as-is
# instead of being re-wrapped in a generator expression.
X_train = vectorizer.fit_transform(news_train.data)
# Echo the first document's row to inspect the matrix width and how many
# non-zero terms that document has.
X_train[0]
vectorizing train dataset ...
<1x130274 sparse matrix of type '<class 'numpy.float64'>'
with 108 stored elements in Compressed Sparse Row format>