import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
from sklearn.model_selection import train_test_split
# Load the iris dataset once as a Bunch so the feature matrix, the labels and
# the column names all come from the same object.  The boolean flag of
# ``load_iris`` is keyword-only in current scikit-learn, so the former
# positional call ``load_iris(True)`` is avoided.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Column names passed as ``feature_names`` to the tree plots later in the file.
feature_names = iris.feature_names
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1024
)
# Workflow: clean the data (i.e. obtain X, y), engineer features, train a
# model, then tune the model's parameters.
# scikit-learn ships every algorithm already wrapped, so it can be used
# directly following the pattern below.
from sklearn.metrics import accuracy_score

# Decision tree that chooses its splits with the entropy criterion.
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
# Print the test accuracy (as the later experiments do) so the score is shown
# even when this line is not the last expression of a notebook cell.
print(accuracy_score(y_test, y_))
# Draw the fitted tree twice, each on its own 12x9-inch figure: first with the
# default (unfilled) nodes, then with ``filled=True`` to colour the nodes.
for use_fill in (False, True):
    plt.figure(figsize=(12, 9))
    _ = tree.plot_tree(clf, filled=use_fill)
def shannon_entropy(counts):
    """Return the Shannon entropy, in bits, of a list of class counts.

    Computes ``sum(p * log2(1 / p))`` with ``p = count / total``.  Classes
    with a zero count contribute nothing, and an empty or all-zero input
    yields ``0.0`` instead of a division by zero.
    """
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if total <= 0:
        return 0.0
    probs = counts[counts > 0] / total
    return float(np.sum(probs * np.log2(1.0 / probs)))


# Hand checks of the entropy formula.
# NOTE(review): the counts presumably mirror node values shown in the tree
# plot (120 samples split 39/42/39, then 81 samples split 42/39) - confirm.
entropy_three_class = shannon_entropy([39, 42, 39])
entropy_two_class = shannon_entropy([42, 39])
# Print explicitly: a bare expression is only echoed when it is the last line
# of a notebook cell, and is silently discarded otherwise.
print(entropy_three_class)
print(entropy_two_class)
# Use a larger 18x12-inch figure for the labelled tree.
fig_size = (18, 12)
plt.figure(figsize=fig_size)
# Coloured nodes, with the iris column names supplied as feature names.
_ = tree.plot_tree(
    clf,
    feature_names=feature_names,
    filled=True,
)
# Write the current figure to a JPEG in the working directory.
plt.savefig('./tree.jpg')
from sklearn.metrics import accuracy_score

# Limit the tree to depth 1, fit it on the training split and report the
# accuracy on the held-out test split.
clf = DecisionTreeClassifier(max_depth=1, criterion='entropy')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
print(accuracy_score(y_test, y_))

# Plot the depth-1 tree with coloured nodes and named features.
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, feature_names=feature_names, filled=True)
# 树的深度加深后,准确率随之提升
from sklearn.metrics import accuracy_score

# Make the tree shallower (pruning the tree): limit it to depth 2.
clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
# Accuracy of the depth-2 tree on the held-out test split.
print(accuracy_score(y_test, y_))

# Plot the depth-2 tree with coloured nodes and named features.
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, feature_names=feature_names, filled=True)
# 裂分标准
# 哪个值最大,所含信息量就越大;优先选择波动大的属性作为裂分条件
# 属性列表
# 由上可以看出,波动性只是一个参考标准