一、鸢尾花数据集
# 鸢尾花数据集学习
import sklearn.model_selection
from sklearn.datasets import load_iris
# Load the iris dataset once at module level; datasets_demo() reads this object.
iris = load_iris()
# Uncomment any of the lines below to inspect the loaded object:
#print(iris) # the whole dataset object
###print(iris.feature_names) # feature names
##print(iris.target_names) # label names
#print(iris.data, iris.data.shape) # feature data array and its shape
# 数据集划分
def datasets_demo():
    """Split the iris data into training and test subsets and print the features.

    Reads the module-level ``iris`` object: ``iris.data`` is used as the
    feature array (x) and ``iris.target`` as the labels (y).

    ``test_size=0.2`` asks for 20 percent of the samples to go to the test
    subset. ``random_state=22`` is the random seed: the same seed gives the
    same split, a different seed gives a different split.

    :return: None
    """
    split = sklearn.model_selection.train_test_split(
        iris.data,
        iris.target,
        test_size=0.2,
        random_state=22,
    )
    # Only the feature subsets are printed; the label subsets are unused here.
    train_features, test_features, _train_labels, _test_labels = split
    # Training features first, then test features, each followed by its shape.
    for subset in (train_features, test_features):
        print(subset, subset.shape)
# Run the dataset-splitting demo only when this file is executed as a script.
if __name__ == "__main__":
    datasets_demo()
二、字典特征提取
# 字典特征提取
from sklearn.feature_extraction import DictVectorizer
def dict_demo():
    """Demonstrate dictionary feature extraction with ``DictVectorizer``.

    Builds three sample records (a city name plus a temperature), converts
    them with ``fit_transform`` and prints both the converted data and the
    feature names reported by the vectorizer.

    :return: None
    """
    records = [
        {'city': '北京', 'temperature': 100},
        {'city': '上海', 'temperature': 60},
        {'city': '深圳', 'temperature': 30},
    ]
    # sparse=False is requested explicitly; constructing the vectorizer with
    # no arguments would return a sparse matrix instead.
    vectorizer = DictVectorizer(sparse=False)
    encoded = vectorizer.fit_transform(records)
    print("data_new:\n", encoded)
    print("特征名字:\n", vectorizer.get_feature_names_out())
# Run the dictionary feature-extraction demo only when executed as a script.
if __name__ == "__main__":
    dict_demo()
三、文本特征提取
# 文本特征提取
from sklearn.feature_extraction.text import CountVectorizer
def count_demo():
    """Demonstrate text feature extraction with ``CountVectorizer``.

    Fits the vectorizer on two English sentences, then prints the converted
    data as an array followed by the feature names.

    :return: None
    """
    sentences = [
        'life is short,i like like python',
        'life is too long,i dislike python',
    ]
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(sentences)
    # toarray() converts the result into a two-dimensional array for printing.
    print("data_new:\n", counts.toarray())
    print("特征名字:\n", vectorizer.get_feature_names_out())
文本特征提取(中文)
def count_demo_cn():
    """Demonstrate ``CountVectorizer`` text feature extraction on Chinese text.

    Fits the vectorizer on a single Chinese sentence, then prints the
    converted data as an array followed by the feature names.

    :return: None
    """
    # Alternative sample whose words are already separated by spaces:
    #data = ['文本 或者 包 含 文本 字符串 的',
    #        '可迭代 对象 返回值 返回 sparse 矩 阵']
    sentences = ['将模型拟合到一些数据并不意味着它将在看不见的数据上很好地预测']
    vectorizer = CountVectorizer()
    counts = vectorizer.fit_transform(sentences)
    # toarray() converts the result into a two-dimensional array for printing.
    print("data_new:\n", counts.toarray())
    print("特征名字:\n", vectorizer.get_feature_names_out())
# Run the Chinese text feature-extraction demo only when executed as a script.
if __name__ == "__main__":
    count_demo_cn()