import jieba
from sklearn.feature_extraction.text import CountVectorizer

# Demo: term-frequency matrix for Chinese text.
# CountVectorizer splits on whitespace, so Chinese sentences must be
# pre-segmented with jieba before vectorizing.
corpus = [
"我喜欢使用Python编程语言",
"Python是一种强大的编程语言",
"Python编程语言被广泛使用"
]
# Segment each sentence and rejoin tokens with spaces so CountVectorizer
# can tokenize them.
corpus_seg = [" ".join(jieba.cut(sentence)) for sentence in corpus]
# Build the vocabulary and the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus_seg)
# Print the vocabulary and the term-frequency matrix.
# NOTE: get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; get_feature_names_out() is the supported replacement (returns an
# ndarray of feature names, sorted alphabetically).
print(vectorizer.get_feature_names_out())
print(X.toarray())
# Example output (feature order is alphabetical; rows are documents):
# ['python', '一种', '使用', '编程', '语言', '强大', '广泛', '喜欢', '被']
# [[1 0 1 1 1 0 0 1 0]
#  [1 1 0 1 1 1 0 0 0]
#  [1 0 0 1 1 0 1 0 1]]
# (removed stray blog-scrape artifacts — dates/view counters that were not valid Python)