# 代码来自邹博老师的机器学习课程,其中讲解了将鸢尾花 label 转换为 int、plt.pcolormesh() 的用法以及决策树的可视化等方法。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pydotplus
# English names of the four iris measurements:
# sepal length, sepal width, petal length, petal width.
iris_feature_E = [
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
]

# Chinese names of the same four measurements, listed in the same order
# as iris_feature_E.
iris_feature = [
    '花萼长度',
    '花萼宽度',
    '花瓣长度',
    '花瓣宽度',
]

# Names of the three iris classes.
iris_class = [
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
]
if __name__ == '__main__':
mpl.rcParams['font.family'] = 'SimHei'
mpl.rcParams['axes.unicode_minus'] = False
data = pd.read_csv('./iris.data', header=None)
x = data[[i for i in range(4)]]
y = pd.Categorical(data[4]).codes # 将obejet类型转化为int分类,得到numpy格式数据
# 为了可视化,仅使用前两列特征
x = x.iloc[:, :2]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.7, random_state=1)
'''
决策树参数估计
1、分类用entropy或者gini,回归用mse
2、min_samples_split=10,如果该节点包含的样本数目大于10,则(有可能)对其分支 --看这一次
3、min_samples_leaf=10:若将某节点分支后,得到的每个子节点样本数目都大于10,则完成分支;否则不进行分支--看上一次
4、max_depth=1:树的深度,很重要
'''
model = DecisionTreeClassifier(criterion='entropy')
model.fit(x_train,