from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
# Toy corpus: each string is one "item" (document). The spelling mistakes
# inside the strings ("doument", etc.) are part of the sample data and are
# intentionally left unchanged.
items = ['this is the first document, this really is',
'nothing will stop this from been the second doument, second is not a bad order',
'I wonder if three documents would be ok as an example, example like this is stupid',
'ok I think four documents is enough, I I I I think so.']

# Use the TF-IDF weights directly as the item profile.
# Term-count matrix: rows = items (documents), columns = features (terms).
vectorizer = CountVectorizer(min_df=1)
counts = vectorizer.fit_transform(items)

# Re-weight raw counts into TF-IDF. The resulting sparse matrix keeps the
# same orientation as `counts`: rows = items (documents), columns = features
# (terms). (The original comment claimed the transpose, which was incorrect.)
transformer = TfidfTransformer()
# Bug fix: the original line was truncated to `transformer.fit_transf`,
# which would raise AttributeError; the intended call is fit_transform.
tfidf = transformer.fit_transform(counts)