包含全部示例的代码仓库见 GitHub
1 导入库
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
2 数据准备
# The raw iris file has no header row, so the column names are supplied here:
# four measurement columns followed by the class label ('cla').
column_names = [
    'sepal_length_cm', 'sepal_width_cm',
    'petal_length_cm', 'petal_width_cm', 'cla',
]
data = pd.read_csv('./dataset/iris.data', header=None, names=column_names)
data  # bare expression: shows the loaded frame as the cell output
# output
sepal_length_cm sepal_width_cm petal_length_cm petal_width_cm cla
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
... ... ... ... ... ...
145 6.7 3.0 5.2 2.3 Iris-virginica
146 6.3 2.5 5.0 1.9 Iris-virginica
147 6.5 3.0 5.2 2.0 Iris-virginica
148 6.2 3.4 5.4 2.3 Iris-virginica
149 5.9 3.0 5.1 1.8 Iris-virginica
# List the distinct class labels present in the 'cla' column.
data['cla'].unique()
# output
array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)
# Split the frame positionally: every column except the last is a feature,
# and the last column (the class label) is the target.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

# Neither test_size nor random_state is passed, so the default split ratio is
# used and the partition is re-drawn on every run; the scores shown further
# down will therefore vary between runs.
splits = train_test_split(x, y)
x_train, x_test, y_train, y_test = splits
(x_train.shape, x_test.shape)
# output
((112, 4), (38, 4))
3 模型构建
# Build a random forest with default hyper-parameters and train it on the
# training split; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier().fit(x_train, y_train)
# Accuracy on the data the model was trained on.
model.score(X=x_train, y=y_train)
# output
1.0
# Accuracy on the held-out test split.
model.score(X=x_test, y=y_test)
# output
0.8421052631578947
# Predicted class label for each row of the test split.
model.predict(X=x_test)
# output
array(['Iris-versicolor', 'Iris-setosa', 'Iris-versicolor',
'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',
'Iris-virginica', 'Iris-virginica', 'Iris-setosa',
'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',
'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa',
'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',
'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',
'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',
'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor'],
dtype=object)