# sklearn 数据集
# 鸢尾花分类的数据集
from sklearn.datasets import load_iris
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_20newsgroups
"""
return Bunch( data=data, // 特征值
target=target, // 目标值
frame=frame,
target_names=target_names, // 目标值的名称
DESCR=fdescr, // 整体的描述
feature_names=feature_names,// 特征值的名称
filename=iris_csv_filename)
"""
#
# lr = load_iris()
#
# # lr Bunch的对象
# print(lr.feature_names)
# print("lr的特征值: ", lr.data)
"""
萼片的长度 萼片的宽度 花瓣的长度 花瓣的宽度 类别
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] ['setosa' 'versicolor' 'virginica']
[4.9 3. 1.4 0.2] 0
[4.7 3.2 1.3 0.2] 1
[4.6 3.1 1.5 0.2] 2
"""
#
# print(lr.target_names)
# print(lr.target)
"""
['setosa' 'versicolor' 'virginica']
0 1 2
"""
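# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# the example below only works with older scikit-learn versions.
# lb = load_boston()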
#
# # lb 就是波士顿房价的对象
#
# # 特征值
# print(lb.feature_names)
# print(lb.data)
#
#
# # 目标值
# print(lb.target)
"""
数据集划分
    分割比例:
        训练集: 70%  测试集: 30%
    划分函数:
        from sklearn.model_selection import train_test_split
    train_test_split 原型:
        def train_test_split(*arrays,
                             test_size=None,
                             train_size=None,
                             random_state=None,
                             shuffle=True,
                             stratify=None):
    参数:
        *arrays: 特征值, 目标值
        test_size: 划分的测试集比例, 例如 0.3
        random_state: 随机数种子
    返回值:
        x: 特征值  y: 目标值  train: 训练集  test: 测试集
        x_train, x_test, y_train, y_test
"""
from sklearn.model_selection import train_test_split
def split_iris():
    """Split the iris dataset into training and test subsets.

    Loads the iris data, holds out 30% of the samples for testing with a
    fixed random seed, then prints the number of training samples and the
    first five training rows.

    :return: None
    """
    # Load the iris dataset and pull out the feature matrix and labels.
    iris = load_iris()
    features, labels = iris.data, iris.target

    # 70% train / 30% test; random_state=1 is passed as the shuffle seed.
    splits = train_test_split(features, labels, test_size=0.3, random_state=1)
    x_train, x_test, y_train, y_test = splits

    print("x_train的长度:", len(x_train))
    print("x_train: \n", x_train[:5])
def split_news(test_size=0.3, random_state=None):
    """Split the 20 newsgroups dataset into training and test subsets.

    Loads the dataset with ``fetch_20newsgroups()`` (default arguments),
    splits it with ``train_test_split`` and prints the training documents
    followed by their labels.

    :param test_size: share of the samples held out for the test set;
        forwarded unchanged to ``train_test_split``. Defaults to 0.3, the
        value that was previously hard-coded.
    :param random_state: seed forwarded unchanged to ``train_test_split``.
        The default ``None`` keeps the original behaviour (no fixed seed);
        pass an int, as ``split_iris`` does, to get a reproducible split.
    :return: None
    """
    # Fetch the news dataset.
    # NOTE(review): presumably this downloads the corpus on first use and
    # therefore needs network access -- confirm against the sklearn docs.
    news = fetch_20newsgroups()

    # Features are taken from ``news.data`` and targets from ``news.target``.
    x = news.data
    y = news.target

    # Split the dataset; the ratio and seed come from the caller.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    print(x_train)
    print(y_train)
    return None
# Run the 20 newsgroups split demo. There is no __main__ guard, so this call
# also executes whenever the module is imported.
split_news()