from sklearn.feature_extraction.text import CountVectorizer
# Sample corpus of English sentences; each list entry is one document.
data = [
    "I have a dream that one day this nation will rise up and live out the true meaning of its creed",
    "We hold these truths to be self-evident, that all men are created equal",
    # The two adjacent literals below are concatenated into a single document.
    (
        "I have a dream that one day on the red hills of Georgia, "
        "the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood"
    ),
    "I have a dream that one day even the state of Mississippi",
    " a state sweltering with the heat of injustice",
    "sweltering with the heat of oppression",
    "will be transformed into an oasis of freedom and justice",
    "I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character",
    "I have a dream today",
]

# Text feature extraction with the CountVectorizer model.
# Step 1: instantiate the vectorizer, registering "is" as a stop word.
# NOTE: none of the documents above contains the standalone word "is",
# so the setting only demonstrates how the parameter is passed.
c_transfer = CountVectorizer(stop_words=["is"])

# Step 2: call fit_transform to learn the vocabulary and count the terms.
c_trans_data = c_transfer.fit_transform(data)

# Print the feature names (the learned vocabulary).
print(c_transfer.get_feature_names_out())

# Print the resulting sparse matrix.
print(c_trans_data)
输出结果如下图所示:
3.中文文本特征提取
准备一段中文文本(data.txt),以《水浒传》中"风雪山神庙"的情节为例: