from sklearn.feature_extraction import DictVectorizer
def dictvec():
    """
    Extract features from a list of dicts with ``DictVectorizer``.

    Fits a dense (``sparse=False``) ``DictVectorizer`` on three sample
    city/temperature records, then prints the feature names, the records
    recovered by ``inverse_transform``, and the transformed data.

    :return: None
    """
    # Named ``vectorizer`` rather than ``dict`` so the builtin is not shadowed.
    vectorizer = DictVectorizer(sparse=False)

    # Fit on the sample records and transform them in one step.
    data = vectorizer.fit_transform([
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ])

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    # .tolist() keeps the printed value a plain list, as before.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)

    print(vectorizer.inverse_transform(data))
    print(data)
    return None


if __name__ == '__main__':
    dictvec()
# 数据特征抽取
# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate CountVectorizer
vector = CountVectorizer()

# Call fit_transform to feed in the documents and convert them
res = vector.fit_transform([
    'life is short,i like python',
    'life is too long,i dislike python',
])

# Print the results.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate it. .tolist() keeps the printed
# value a plain list, as before.
if hasattr(vector, 'get_feature_names_out'):
    print(vector.get_feature_names_out().tolist())
else:
    print(vector.get_feature_names())
print(res.toarray())
# 文本特征抽取
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
def countvec():
    """
    Extract count features from text with ``CountVectorizer``.

    Fits a ``CountVectorizer`` on two sample sentences (Chinese words
    pre-separated by spaces), then prints the vocabulary and the
    per-document count matrix.

    :return: None
    """
    # Instantiate the vectorizer.
    cv = CountVectorizer()

    # Call fit_transform on the sample documents.
    # English sample input, kept for comparison:
    # data = cv.fit_transform(['life is short,i like python', 'life is too long,i dislike python'])
    data = cv.fit_transform(["人生 苦短,我 喜欢 python", "人生漫长,不用 python"])

    # The distinct words found across the documents; these are effectively
    # the features of the documents.
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    # .tolist() keeps the printed value a plain list, as before.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out().tolist()
    else:
        feature_names = cv.get_feature_names()
    print(feature_names)

    # For each document, count how often each vocabulary word occurs;
    # single-character tokens are not counted.
    # toarray() converts the result into array form.
    print(data.toarray())
    return None


if __name__ == '__main__':
    countvec()