import jieba
from sklearn.feature_extraction.text import CountVectorizer  # word-count (bag-of-words) vectorizer

# Build the corpus (English). English is whitespace-delimited, so no
# segmentation step is needed before vectorizing.
content = ['This is the first document.', 'This is the second second document.', 'And the third one.', 'Is this the first document? i x y']
# Create the vectorizer instance.
con_vet = CountVectorizer()
# Learn the vocabulary and count token occurrences per document.
x = con_vet.fit_transform(content)
print(x)            # sparse matrix: (doc index, token index)  count
print(x.toarray())  # convert the sparse matrix to a dense array
# Retrieve the learned vocabulary.
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
names = con_vet.get_feature_names_out()
print(names)  # the extracted tokens
# Build the corpus (Chinese). Chinese text has no word boundaries, so each
# sentence is segmented with jieba before vectorizing.
content = ["今天阳光真好","我要去看北京天安门","逛完天安门之后我要去王府井","吃烤蝎子与烤蜈蚣","晚上去后海游个泳"]
content_list = []
for tmp in content:
    # Precise-mode segmentation (cut_all=False, jieba's default mode).
    res = jieba.cut(tmp, cut_all=False)
    # Join the segmented tokens so CountVectorizer can split them back apart.
    res_str = ','.join(res)
    content_list.append(res_str)
# Create the vectorizer; stop_words drops uninformative tokens.
con_vet = CountVectorizer(stop_words=['我要','之后'])
# Learn the vocabulary and count token occurrences per document.
x = con_vet.fit_transform(content_list)
print(x)            # sparse matrix: (doc index, token index)  count
print(x.toarray())  # dense array form
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
names = con_vet.get_feature_names_out()
print(names)  # the extracted tokens
# --- TF-IDF: weight tokens by importance instead of raw counts ---
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the corpus (English).
content = ['This is the first document.','This is the second second document.','And the third one.','Is this the first document? i x y']
# Create the vectorizer instance.
# min_df=1 would require a token to appear in at least one document;
# stop_words removes uninformative tokens from the vocabulary.
tf_vet = TfidfVectorizer(stop_words=['is','and'])
# Learn the vocabulary and compute the TF-IDF weight of each token.
x = tf_vet.fit_transform(content)
print(x)            # sparse matrix: (doc index, token index)  TF-IDF weight
print(x.toarray())  # dense array form
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
names = tf_vet.get_feature_names_out()
print(names)  # the extracted tokens