# -*- coding:utf-8 -*-
__author__ = "dongluyu"
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
import jieba
import sys
# Python 2-only idiom: reload(sys) makes sys.setdefaultencoding reachable again
# so that UTF-8 can be set as the default codec for implicit str/unicode
# conversions performed later in this script.
reload(sys)
sys.setdefaultencoding('utf8')
def readFile(filename):
    """Return the complete content of ``filename`` as one string.

    The file is opened read-only (no write permission is required) and is
    closed before the function returns, even if reading raises.

    Args:
        filename: Path of the file to read.

    Returns:
        The whole file content; an empty string for an empty file.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    # The previous implementation returned from inside the line loop, which
    # yielded only the first line, skipped close(), and gave None for an
    # empty file.
    with open(filename, 'r') as fopen:
        return fopen.read()
def getPath(rootPath):
    """Collect the paths of every entry found one directory below ``rootPath``.

    Each entry of ``rootPath`` is treated as a directory: its path is
    printed, it is listed, and each of its entries is joined onto that path.

    Args:
        rootPath: Directory whose sub-directories are scanned.

    Returns:
        A list of joined paths, in the order produced by ``os.listdir``.

    Raises:
        OSError: If ``rootPath`` or one of its entries cannot be listed
            (for example because the entry is not a directory).
    """
    filenames = []
    # Use the parameter itself; the body used to read a module-level
    # ``root_path`` name and therefore only worked when run as the script.
    for dir_name in os.listdir(rootPath):
        dir_path = os.path.join(rootPath, dir_name)
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement.
        print(dir_path)
        for file_name in os.listdir(dir_path):
            filenames.append(os.path.join(dir_path, file_name))
    return filenames
if __name__ == "__main__":
root_path = r'D:\Sougou'
filenames = getPath(root_path)
corpus = []
for filename in filenames:
content = readFile(filename)
words = jieba.cut(content)
corpus.append(' '.join(words))
vectorizer=CountVectorizer()
transformer=TfidfTransformer()
tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
word=vectorizer.get_feature_names()#
print len(word)
weight=tfidf.toarray()
f = open('D:/tfidf.txt','w')
for i in range(len(weight)):
print u"-------这里输出第",i,u"类文本的词语tf-idf权重------"
for j in range(len(word)):
print word[j],weight[i][j]
f.write(str(weight[i][j]))
f.write('\n')
f.write('\n')
print "finish"
# TF-IDF
# 最新推荐文章于 2018-10-06 20:46:31 发布  (web-page footer: "latest recommended article published 2018-10-06 20:46:31")