import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
class TF_IDE():
    """TF-IDF keyword extraction for numbered Chinese .txt documents.

    Pipeline: read documents -> tokenize with jieba (full mode) ->
    fit a TfidfVectorizer (with a stop-word list loaded from disk) ->
    print per-document keyword rankings, then score a held-out test
    document against the fitted vectorizer.

    NOTE(review): all file paths are hard-coded absolute Windows paths;
    instantiating the class immediately runs the whole pipeline (I/O +
    printing) via __init__ -> test().
    """

    def import_file(self, a, b, file_path0):
        """Read files ``<file_path0><i>.txt`` for i in [a, b).

        Args:
            a: first file index (inclusive).
            b: last file index (exclusive — range semantics).
            file_path0: path prefix the numeric index is appended to.

        Returns:
            list[str]: the full text of each file, in index order.
        """
        text_list = []
        for i in range(a, b):
            file_path = str(file_path0) + str(i) + ".txt"
            with open(file_path, encoding="UTF-8") as f:
                text_list.append(f.read())
        return text_list

    def text_to_jieba(self, text):
        """Tokenize each document with jieba (cut_all=True, i.e. full mode)
        and join the tokens with spaces so TfidfVectorizer can split them.

        Args:
            text: iterable of raw document strings.

        Returns:
            list[str]: one space-separated token string per document.
        """
        return [" ".join(jieba.cut(doc, cut_all=True)) for doc in text]

    def import_stop_words(self):
        """Load the stop-word list from stop-word files 1-4 (one word per line).

        Returns:
            list[str]: flattened list of stop words (may contain empty
            strings from trailing newlines — harmless to TfidfVectorizer).
        """
        stop_words_list = []
        for content in self.import_file(1, 5, r'D:\源代码检测\中文停用词表\stopwords-master\stop_words'):
            stop_words_list.extend(content.split("\n"))
        return stop_words_list

    def Tfidf(self):
        """Fit TF-IDF on corpus documents 1-5 and print keyword rankings.

        For each of the 5 documents, prints its columns (terms) sorted by
        that document's TF-IDF weight, descending.

        Returns:
            tuple: (fitted TfidfVectorizer, feature-name array) so callers
            can transform new documents against the same vocabulary.
        """
        vectorizer = TfidfVectorizer(stop_words=self.import_stop_words())
        X = vectorizer.fit_transform(self.text_to_jieba(self.import_file(1, 6, r"D:\源代码检测\特征提取文档\text军训心得"))).toarray()
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # new API and fall back for older installations.
        try:
            X_feature_names = vectorizer.get_feature_names_out()
        except AttributeError:
            X_feature_names = vectorizer.get_feature_names()
        X_pd = pd.DataFrame(X, columns=X_feature_names)
        for i in range(0, 5):
            # axis=1 with by=i sorts the columns (terms) by row i's weights.
            X_sort = X_pd.sort_values(by=i, axis=1, ascending=False)
            print("第", str(i + 1), "篇文档的关键词排序:\n", X_sort.iloc[i, :])
        return vectorizer, X_feature_names

    def test(self):
        """Score test document 1 against the fitted vectorizer and print
        its terms sorted by TF-IDF weight, descending.
        """
        test = self.text_to_jieba(self.import_file(1, 2, r"D:\源代码检测\特征提取文档\textzz"))
        # Fit once and reuse — the original called self.Tfidf() twice,
        # re-reading every file, refitting, and printing rankings twice.
        vectorizer, feature_names = self.Tfidf()
        X_test = vectorizer.transform(test).toarray()
        test_pd = pd.DataFrame(X_test, columns=feature_names)
        test_sorted = test_pd.sort_values(by=0, axis=1, ascending=False)
        print(test_sorted)

    def __init__(self):
        # Preserved behavior: constructing the object runs the pipeline.
        self.test()
if __name__=='__main__':
    # Instantiation alone runs the full pipeline: __init__ calls test(),
    # which reads the corpus from disk and prints keyword rankings.
    TF_IDE()