第八课 机器学习简单介绍
第八节 数据集准备及划分
# 引入包
import pandas as pd
import numpy as np
准备数据集
# Load the iris dataset from a local CSV file into a DataFrame.
# NOTE(review): path is relative to the working directory — confirm ./dataset/iris.csv exists.
iris_data = pd.read_csv('./dataset/iris.csv')
# Preview the first five rows (notebook-style display).
iris_data.head()
sepal_length sepal_width petal_length petal_width species label
0 5.1 3.5 1.4 0.2 setosa 1
1 4.9 3.0 1.4 0.2 setosa 1
2 4.7 3.2 1.3 0.2 setosa 1
3 4.6 3.1 1.5 0.2 setosa 1
4 5.0 3.6 1.4 0.2 setosa 1
# 获取特征
# Build the feature matrix X from the four measurement columns.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
X = iris_data[feature_columns].values
print(X.shape)
(150, 4)
# 获取标签
# Extract the integer class labels (1/2/3) as the target vector y.
y = iris_data.label.values
print(y)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3
3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
3 3]
划分数据集
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; a fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=10,
)
print('原数据集的样本个数:', X.shape[0])
print('训练集的样本个数:', X_train.shape[0])
print('测试集的样本个数:', X_test.shape[0])
原数据集的样本个数: 150
训练集的样本个数: 112
测试集的样本个数: 38
选择模型
# 选择K近邻距离算法
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbors classifier; n_neighbors=5 is the library default,
# spelled out here for clarity.
knn_model = KNeighborsClassifier(n_neighbors=5)
训练模型
# 在训练集上进行训练
knn_model.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
测试模型
# 在测试集上测试模型
y_pred = knn_model.predict(X_test)
y_pred
array([2, 3, 1, 2, 1, 2, 3, 2, 1, 2, 2, 3, 2, 1, 1, 3, 2, 1, 1, 1, 3, 3,
3, 1, 2, 1, 2, 2, 2, 3, 2, 2, 3, 3, 3, 1, 3, 3], dtype=int64)
print('真实值:', y_test)
print('预测值:', y_pred)
真实值: [2 3 1 2 1 2 2 2 1 2 2 3 2 1 1 3 2 1 1 1 3 3 3 1 2 1 2 2 2 3 2 2 3 3 3 1 3
3]
预测值: [2 3 1 2 1 2 3 2 1 2 2 3 2 1 1 3 2 1 1 1 3 3 3 1 2 1 2 2 2 3 2 2 3 3 3 1 3
3]
# 模型准确率
from sklearn.metrics import accuracy_score

# Accuracy = fraction of test samples whose predicted label matches the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('准确率:', acc)
准确率: 0.9736842105263158
机器学习-数据集准备及划分
最新推荐文章于 2024-05-26 07:59:09 发布