导入数据集
import pandas as pd
import numpy as np
# Load the wine data set. The file has no header row (header=None), so the
# column names are assigned explicitly below.
# The path is a raw string on purpose: in a normal literal, \p, \m and \w are
# invalid escape sequences (flagged with a warning by current Python), and a
# folder name starting with e.g. "n" or "t" would silently corrupt the path.
df_wine = pd.read_csv(r'H:\python\machine-learning\wine.data', header=None)
df_wine.columns = [
    'class label',
    'alcohol',
    'malic acid',
    'ash',
    'alcalinity of ash',
    'magnesium',
    'total phenols',
    'flavanoids',
    'nonflavanoid phenols',
    'proanthocyanins',
    'color intensity',
    'hue',
    'od280/od315 of diluted wines',
    'proline',
]
# Show the distinct class labels, then preview the first rows of the frame.
print('class labels', np.unique(df_wine['class label']))
print(df_wine.head())
输出:
class labels [1 2 3]
class label alcohol malic acid ash alcalinity of ash magnesium ... nonflavanoid phenols proanthocyanins color intensity hue od280/od315 of diluted wines proline
0 1 14.23 1.71 2.43 15.6 127 ... 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 ... 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 ... 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 ... 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 ... 0.39 1.82 4.32 1.04 2.93 735
划分数据集(训练集与测试集)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Load the wine data set (no header row in the file) and name its columns.
# The path is a raw string on purpose: in a normal literal, \p, \m and \w are
# invalid escape sequences (flagged with a warning by current Python), and a
# folder name starting with e.g. "n" or "t" would silently corrupt the path.
df_wine = pd.read_csv(r'H:\python\machine-learning\wine.data', header=None)
df_wine.columns = [
    'class label',
    'alcohol',
    'malic acid',
    'ash',
    'alcalinity of ash',
    'magnesium',
    'total phenols',
    'flavanoids',
    'nonflavanoid phenols',
    'proanthocyanins',
    'color intensity',
    'hue',
    'od280/od315 of diluted wines',
    'proline',
]
# Columns 1-13 are the features (x); column 0, the class label, is the
# target (y).
x = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values
# Hold out 30% of the samples for testing and train on the remaining 70%.
# stratify=y keeps the class proportions the same in both subsets, and
# random_state=0 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0, stratify=y
)