# Continued from the previous post: Chinese text feature extraction.
#coding=utf-8
from sklearn.feature_extraction.text import TfidfVectorizer
import jieba
def cut_word(v):
    """Segment the text *v* with jieba and return its tokens joined by spaces."""
    tokens = jieba.cut(v)
    return " ".join(tokens)
# Try text feature extraction using the TF-IDF approach.
def tf_context():
    """Demonstrate TF-IDF feature extraction on a sample Chinese sentence.

    Each sentence is segmented with ``cut_word`` so that its words are
    separated by spaces, then vectorized with ``TfidfVectorizer``. The dense
    TF-IDF matrix and the learned feature names are printed to stdout.

    Returns:
        None. All results are printed.
    """
    data = ["忠领他们到朱老明那里站在大柏树坟前说你看看这个地势怎么样我们的人要是从城里过来经过大渡口或是小渡口沿着千里堤"]

    # Chinese text has no spaces between words, so segment it first and hand
    # the vectorizer space-separated tokens.
    data_new = [cut_word(sent) for sent in data]

    tfv = TfidfVectorizer(stop_words=["的"])
    result = tfv.fit_transform(data_new)

    print(result.toarray())
    # Blank separator between the matrix and the feature names; the original
    # printed the literal text "/n" instead of a newline.
    print("\n")

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when available and fall back on older releases.
    if hasattr(tfv, "get_feature_names_out"):
        feature_names = list(tfv.get_feature_names_out())
    else:
        feature_names = tfv.get_feature_names()
    print(feature_names)
# Script entry point: run the TF-IDF demo only when executed directly.
if __name__ == "__main__":
    tf_context()