# 1. 决策树使用与原理 (Decision trees: usage and principles)
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import tree
from sklearn.model_selection import train_test_split
# Load the iris dataset and pull out the pieces used by the cells below.
iris = datasets.load_iris()
X, y = iris.data, iris.target
feature_names = iris['feature_names']

# Hold out 20% of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1024,
)
from sklearn.metrics import accuracy_score

# Fit a decision tree that selects its splits with the entropy criterion.
clf = DecisionTreeClassifier(criterion='entropy').fit(X_train, y_train)

# Predict the held-out samples and evaluate the fraction classified correctly.
y_ = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)
def shannon_entropy(counts):
    """Return the Shannon entropy, in bits, of a class-count distribution.

    Args:
        counts: Non-negative per-class sample counts, e.g. ``[39, 42, 39]``.

    Returns:
        The sum of ``p * log2(1 / p)`` over the classes with a non-zero
        count, where ``p = count / total``. Returns ``0.0`` when there are
        no samples at all.
    """
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0
    # Empty classes contribute nothing; dropping them avoids dividing by zero.
    probs = counts[counts > 0] / total
    return float(np.sum(probs * np.log2(1.0 / probs)))


# Entropy of 120 samples split 39 / 42 / 39 across three classes
# (presumably the root node of the tree fitted above -- confirm against the plot).
shannon_entropy([39, 42, 39])
# Entropy of 81 samples split 42 / 39 across two classes. The original
# evaluated this identical expression twice; once is enough.
shannon_entropy([42, 39])
# Draw the fitted tree, limiting the drawing to max_depth=1.
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(
    clf,
    max_depth=1,
    feature_names=feature_names,
    filled=True,
)

# Inspect the training matrix and its per-column standard deviation.
X_train
np.std(X_train, axis=0)
%%time
clf = DecisionTreeClassifier(criterion='entropy',max_depth=5)
clf.fit(X_train,y_train)
y_ = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_))
plt.figure(figsize=(18,12))
_ = tree.plot_tree(clf,filled=True,feature_names = feature_names)
%%time
# 1.9 + 3.3 = 5.2
# 5.2 / 2 = 2.6  (the midpoint of 1.9 and 3.3)
from sklearn.metrics import accuracy_score

# Sorted values of the third feature column (index 2) of the training data.
np.sort(X_train[:, 2])

# Same depth cap as before, but splits are now chosen by Gini impurity.
clf = DecisionTreeClassifier(max_depth=5, criterion='gini').fit(X_train, y_train)
y_ = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_))

# Plot the whole fitted tree using the iris feature names.
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, feature_names=feature_names, filled=True)
# 2. 随机森林 (Random forests)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Load the wine dataset and display the returned container.
wine = datasets.load_wine()
wine

# Feature matrix and class labels, followed by a look at the matrix dimensions.
X, y = wine.data, wine.target
X.shape
from sklearn.metrics import accuracy_score

# Unseeded 80/20 split: a different partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Random forest with default settings, evaluated on the held-out part.
clf = RandomForestClassifier().fit(X_train, y_train)
y_ = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_)
# Single decision tree on the same split, as a baseline for the forest.
dt_clf = DecisionTreeClassifier().fit(X_train, y_train)
dt_clf.score(X=X_test, y=y_test)
# Average the decision-tree test accuracy over many random 80/20 splits,
# so that one lucky or unlucky split does not decide the result.
n_runs = 100
score = 0
for i in range(n_runs):
    # The loop body had lost its indentation; it is restored here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)
    # Dividing each run by n_runs makes the final sum the mean accuracy.
    score += dt_clf.score(X_test, y_test) / n_runs
# Printed once after the loop: only the completed sum is the average
# (placement assumed, since the original indentation is missing).
print('决策树多次运行准确率:', score)
# Average the random-forest test accuracy over many random 80/20 splits,
# so that one lucky or unlucky split does not decide the result.
n_runs = 100
score = 0
for i in range(n_runs):
    # The loop body had lost its indentation; it is restored here.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    # Dividing each run by n_runs makes the final sum the mean accuracy.
    score += clf.score(X_test, y_test) / n_runs
# Printed once after the loop: only the completed sum is the average
# (placement assumed, since the original indentation is missing).
print('随机森林多次运行准确率:', score)