模型原型
class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best',
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, class_weight=None, presort=False)
参数
- criterion:指定切分质量的评分准则
- 'gini':切分时评价准则是Gini系数
- 'entropy':切分时评价准则是熵
- splitter
- max_depth
- min_samples_split
- min_samples_leaf
- min_weight_fraction_leaf
- max_features
- random_state
- max_leaf_nodes
- class_weight
- presort
属性
- classes_:分类的标签值
- feature_importances_
- max_features_
- n_classes_:分类的数量
- n_features_
- n_outputs_
- tree_
方法
- fit(X,y[,sample_weight,check_input,…])
- predict(X[,check_input])
- predict_log_proba(X)
- predict_proba(X)
- score(X,y[,sample_weight])
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation
产生随机的数据集
def creat_data(n):
    """Generate a noisy sine-curve dataset and return a 75/25 split.

    (Name kept as ``creat_data`` (sic) for backward compatibility with
    existing callers.)

    Parameters
    ----------
    n : int
        Number of samples to generate.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The result of ``train_test_split`` with ``random_state=1``.
    """
    # train_test_split lives in sklearn.model_selection since 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split

    np.random.seed(0)
    X = 5 * np.random.rand(n, 1)
    y = np.sin(X).ravel()
    # Perturb every 5th sample. len(y[::5]) == ceil(n/5); the original
    # int(n/5) under-counted when n is not a multiple of 5, which made
    # the += below fail with a broadcasting error.
    noise_num = len(y[::5])
    y[::5] += 3 * (0.5 - np.random.rand(noise_num))
    return train_test_split(X, y, test_size=0.25, random_state=1)
加载数据集
def load_data():
    """Load the iris dataset and return a stratified 75/25 train/test split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Result of ``train_test_split`` with ``random_state=0``; ``stratify``
        keeps the class proportions identical in both splits.
    """
    # train_test_split lives in sklearn.model_selection since 0.18;
    # the old sklearn.cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split

    iris = datasets.load_iris()
    X_train = iris.data
    y_train = iris.target
    return train_test_split(X_train, y_train, test_size=0.25,
                            random_state=0, stratify=y_train)
使用DecisionTreeClassifier
def test_DecisionTreeClassifier(*data):
    """Fit a default DecisionTreeClassifier and print train/test accuracy.

    :param data: X_train, X_test, y_train, y_test (unpacked positionally).
    """
    X_train, X_test, y_train, y_test = data
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    print("Training score:%f" % (model.score(X_train, y_train)))
    print("Testing score:%f" % (model.score(X_test, y_test)))


# Driver: evaluate on the iris split.
X_train, X_test, y_train, y_test = load_data()
test_DecisionTreeClassifier(X_train, X_test, y_train, y_test)
评价准则criterion对分类性能的影响
def test_DecisionTreeClassifier_criterion(*data):
    """Compare the 'gini' and 'entropy' split criteria on the same data.

    :param data: X_train, X_test, y_train, y_test (unpacked positionally).
    """
    X_train, X_test, y_train, y_test = data
    # Targets come from a regression-style dataset, so they are cast to
    # int to pose the task as classification.
    for crit in ('gini', 'entropy'):
        model = DecisionTreeClassifier(criterion=crit)
        model.fit(X_train, y_train.astype('int'))
        print('criterion:%s' % crit)
        print('Training score:%f' % (model.score(X_train,
                                                 y_train.astype('int'))))
        print('Testing score:%f' % (model.score(X_test, y_test.astype('int'))))


# Driver: 100 synthetic samples.
X_train, X_test, y_train, y_test = creat_data(100)
test_DecisionTreeClassifier_criterion(X_train, X_test, y_train, y_test)
决策树深度的影响
def test_DecisionTreeClassifier_depth(*data, maxdepth):
    """Plot train/test accuracy as a function of the tree's max_depth.

    :param data: X_train, X_test, y_train, y_test (unpacked positionally).
    :param maxdepth: exclusive upper bound on the depths tried (1..maxdepth-1).
    """
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)
    train_acc = []
    test_acc = []
    for d in depths:
        model = DecisionTreeClassifier(max_depth=d)
        # Cast targets to int: the synthetic data is regression-style.
        model.fit(X_train, y_train.astype('int'))
        train_acc.append(model.score(X_train, y_train.astype('int')))
        test_acc.append(model.score(X_test, y_test.astype('int')))

    # Plot both learning curves on a single axis.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, train_acc, label='train score', marker='o')
    ax.plot(depths, test_acc, label='testing score', marker='*')
    ax.set_xlabel('maxdepth')
    ax.set_ylabel('score')
    ax.set_title('Decision Tree Classification')
    ax.legend(framealpha=0.5, loc='best')
    plt.show()


# Driver: sweep depths 1..99 on 100 synthetic samples.
X_train, X_test, y_train, y_test = creat_data(100)
test_DecisionTreeClassifier_depth(X_train, X_test, y_train, y_test,
                                  maxdepth=100)