字典特征抽取:当数据集中的类别特征较多时,可先将数据转换为字典列表,再利用 DictVectorizer 进行特征抽取。
# 导入模块
from sklearn.feature_extraction import DictVectorizer
# Sample records: each dict carries one string-valued feature ('city') and
# one numeric feature ('temperature').
data = [
    {'city': 'BeiJing', 'temperature': 100},
    {'city': 'ShangHai', 'temperature': 60},
    {'city': 'ShenZheng', 'temperature': 30},
]
# sparse=True asks for a sparse matrix: only the positions and values of the
# non-zero entries are represented, which saves memory and improves
# efficiency.
transer = DictVectorizer(sparse=True)
data_new = transer.fit_transform(data)
# Print the encoded matrix and then its type, one per line.
print(data_new, type(data_new), sep='\n')
(0, 0) 1.0
(0, 3) 100.0
(1, 1) 1.0
(1, 3) 60.0
(2, 2) 1.0
(2, 3) 30.0
<class 'scipy.sparse.csr.csr_matrix'>
# sparse=False: fit_transform returns a dense array in which the zero
# entries are written out explicitly, instead of a sparse matrix that lists
# only the non-zero positions.
transer = DictVectorizer(sparse=False)
data_new = transer.fit_transform(data)
# Print the encoded array and then its type, one per line.
print(data_new, type(data_new), sep='\n')
[[ 1. 0. 0. 100.]
[ 0. 1. 0. 60.]
[ 0. 0. 1. 30.]]
<class 'numpy.ndarray'>
# Show the column names of the encoded matrix, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() converts it back to a plain list of str and the printed form
# stays the same as before.
try:
    feature_names = transer.get_feature_names_out().tolist()
except AttributeError:
    # scikit-learn < 1.0 only provides the legacy method.
    feature_names = transer.get_feature_names()
print(feature_names)
['city=BeiJing', 'city=ShangHai', 'city=ShenZheng', 'temperature']