import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
from sklearn.model_selection import train_test_split
# Load the iris dataset and hold out 20% of the samples for evaluation.
iris = datasets.load_iris()
X, y = iris['data'], iris['target']
feature_names = iris.feature_names
# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1024
)
# Workflow: clean the data (X, y) -> feature engineering -> train the
# model -> tune its hyper-parameters.  scikit-learn wraps every algorithm
# behind the same fit/predict API, so the estimator is used directly:
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)

from sklearn.metrics import accuracy_score

# Fraction of test samples classified correctly.
accuracy_score(y_test, y_)
# Output: 1.0  (test-set accuracy of the entropy tree)
39/120*np.log2(120/39)+42/120*np.log2(120/42)+39/120*np.log2(120/39)
1.5840680553754911
42/81*np.log2(81/42)+ 39/81*np.log2(81/39)
0.99901027088048133
# Enlarge the canvas so the tree diagram stays readable.
plt.figure(figsize=(18, 12))
# Draw the fitted tree; filled=True colours each node by majority class.
_ = tree.plot_tree(clf, filled=True, feature_names=feature_names)
plt.savefig('./tree.jpg')
# %%time  -- IPython cell magic that times the cell; only valid inside a notebook
# Shallower tree: Gini impurity criterion, depth capped at 2.
# (The original comment said "depth 1", which contradicted max_depth=2.)
clf = DecisionTreeClassifier(criterion='gini', max_depth=2)
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)

from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, y_))
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, filled=True, feature_names=feature_names)
# Output: 0.966666666667  (test-set accuracy of the gini tree)
# Wall time: 45.9 ms
# Gini impurity: $\sum_{i=0}^{n} p(x_i)\,(1 - p(x_i))$
# 39/120*(1- 39/120)
39/120*(1 -39/120)*2+42/120*(1-42/120)
0.66625