import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets,cross_validation,ensemble
加载数据集
def load_data_regression():
    """Load the diabetes dataset and split it into train and test parts.

    Returns:
        Whatever ``cross_validation.train_test_split`` returns for the
        diabetes data and target with ``test_size=0.25`` and
        ``random_state=0``. Callers in this file unpack it as
        ``X_train, X_test, y_train, y_test``.

    NOTE(review): ``sklearn.cross_validation`` is reported as removed in
    scikit-learn 0.20 in favour of ``sklearn.model_selection`` -- confirm
    against the installed version.
    """
    dataset = datasets.load_diabetes()
    features, targets = dataset.data, dataset.target
    return cross_validation.train_test_split(
        features, targets, test_size=0.25, random_state=0
    )
def load_data_classification():
    """Load the handwritten-digits dataset and split it into train and test parts.

    Returns:
        Whatever ``cross_validation.train_test_split`` returns for the
        digits data and target with ``test_size=0.25`` and
        ``random_state=0``. Callers in this file unpack it as
        ``X_train, X_test, y_train, y_test``.

    NOTE(review): ``sklearn.cross_validation`` is reported as removed in
    scikit-learn 0.20 in favour of ``sklearn.model_selection`` -- confirm
    against the installed version.
    """
    dataset = datasets.load_digits()
    features, labels = dataset.data, dataset.target
    return cross_validation.train_test_split(
        features, labels, test_size=0.25, random_state=0
    )
RandomForestClassifier随机森林分类器
模型原型
class sklearn.ensemble.RandomForestClassifier(n_estimators=10,criterion='gini',max_depth=None,min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0.0,max_features='auto',max_leaf_nodes=None,bootstrap=True,oob_score=False, n_jobs=1,random_state=None,verbose=0,warm_start=False,class_weight=None)
参数
- n_estimators:随机森林中决策树的数量
- criterion
- max_depth
- min_samples_split
- min_samples_leaf
- min_weight_fraction_leaf
- max_features
- max_leaf_nodes
- bootstrap
- oob_score:如果为True,则使用包外样本来计算泛化误差
- n_jobs
- random_state
- verbose
- warm_start
- class_weight
属性
- estimators_:所有训练过的基础分类器
- classes_
- n_classes_
- n_features_
- n_outputs_
- feature_importances_
- oob_score_:训练数据使用包外估计时的得分
方法
- fit(X,y[,sample_weight])
- predict(X)
- predict_log_proba(X)
- predict_proba(X)
- score(X,y[,sample_weight])
使用RandomForestClassifier类
def test_RandomForestClassifier(*data):
    """Fit a default RandomForestClassifier and print its train/test scores.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training labels and test labels.
    """
    train_X, test_X, train_y, test_y = data
    forest = ensemble.RandomForestClassifier()
    forest.fit(train_X, train_y)
    # Score and report each split in turn, training first.
    train_score = forest.score(train_X, train_y)
    print('Training Score:%f' % train_score)
    test_score = forest.score(test_X, test_y)
    print('Testing Score:%f' % test_score)
# Split the digits data once; the later classifier experiments in this file
# reuse these module-level names.
X_train, X_test, y_train, y_test = load_data_classification()
test_RandomForestClassifier(X_train, X_test, y_train, y_test)
森林中决策树的个数的影响
def test_RandomForestClassifier_num(*data):
    """Plot RandomForestClassifier scores against the number of trees.

    One classifier is fitted for every odd ``n_estimators`` value from 1 to
    99; its training and test scores are collected and both curves are drawn
    on a single axes, which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training labels and test labels.
    """
    train_X, test_X, train_y, test_y = data
    tree_counts = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Collect (train, test) score pairs, then split them into two curves.
    score_pairs = []
    for count in tree_counts:
        forest = ensemble.RandomForestClassifier(n_estimators=count)
        forest.fit(train_X, train_y)
        score_pairs.append(
            (forest.score(train_X, train_y), forest.score(test_X, test_y))
        )
    training_scores, testing_scores = zip(*score_pairs)

    ax.plot(tree_counts, training_scores, label='Training Score')
    ax.plot(tree_counts, testing_scores, label='Testing Score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()
# Run the forest-size experiment for the classifier on the current split.
test_RandomForestClassifier_num(X_train, X_test, y_train, y_test)
max_depth参数的影响
def test_RandomForestClassifier_max_depth(*data):
    """Plot RandomForestClassifier scores against ``max_depth``.

    One classifier is fitted for every ``max_depth`` from 1 to 19; training
    and test scores are plotted on a single axes, which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training labels and test labels.
    """
    train_X, test_X, train_y, test_y = data
    depths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    def _fit_and_score(depth):
        # Fit one forest at this depth; return (train score, test score).
        forest = ensemble.RandomForestClassifier(max_depth=depth)
        forest.fit(train_X, train_y)
        return forest.score(train_X, train_y), forest.score(test_X, test_y)

    training_scores = []
    testing_scores = []
    for depth in depths:
        on_train, on_test = _fit_and_score(depth)
        training_scores.append(on_train)
        testing_scores.append(on_test)

    curves = (
        (training_scores, 'Training Score'),
        (testing_scores, 'Testing Score'),
    )
    for values, legend_label in curves:
        ax.plot(depths, values, label=legend_label)
    ax.set_xlabel('max_depth')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()
# Run the max_depth experiment for the classifier on the current split.
test_RandomForestClassifier_max_depth(X_train, X_test, y_train, y_test)
max_features参数的影响
def test_RandomForestClassifier_max_features(*data):
    """Plot RandomForestClassifier scores against ``max_features``.

    ``max_features`` is swept over ``np.linspace(0.01, 1.0)``; a classifier
    is fitted for each value and its training and test scores are plotted on
    a single axes, which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training labels and test labels.
    """
    train_X, test_X, train_y, test_y = data
    fractions = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Collect (train, test) score pairs, then split them into two curves.
    score_pairs = []
    for fraction in fractions:
        forest = ensemble.RandomForestClassifier(max_features=fraction)
        forest.fit(train_X, train_y)
        score_pairs.append(
            (forest.score(train_X, train_y), forest.score(test_X, test_y))
        )
    training_scores, testing_scores = zip(*score_pairs)

    ax.plot(fractions, training_scores, label='Training Score')
    ax.plot(fractions, testing_scores, label='Testing Score')
    ax.set_xlabel('max_feature')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()
# Run the max_features experiment for the classifier on the current split.
test_RandomForestClassifier_max_features(X_train, X_test, y_train, y_test)
RandomForestRegressor随机森林回归器
模型原型
class sklearn.ensemble.RandomForestRegressor(n_estimators=10,criterion='mse',max_depth=None,min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0.0,max_features='auto',max_leaf_nodes=None,bootstrap=True,oob_score=False, n_jobs=1,random_state=None,verbose=0,warm_start=False)
参数
- n_estimators:随机森林中决策树的数量
- criterion
- max_depth
- min_samples_split
- min_samples_leaf
- min_weight_fraction_leaf
- max_features
- max_leaf_nodes
- bootstrap
- oob_score:如果为True,则使用包外样本来计算泛化误差
- n_jobs
- random_state
- verbose
- warm_start
属性
- estimators_:所有训练过的基础回归器
- n_features_
- n_outputs_
- feature_importances_
- oob_score_:训练数据使用包外估计时的得分
- oob_prediction_
方法
- fit(X,y[,sample_weight])
- predict(X)
- score(X,y[,sample_weight])
使用RandomForestRegressor类
def test_RandomForestRegressor(*data):
    """Fit a default RandomForestRegressor and print its train/test scores.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training targets and test targets.
    """
    train_X, test_X, train_y, test_y = data
    forest = ensemble.RandomForestRegressor()
    forest.fit(train_X, train_y)
    # Score and report each split in turn, training first.
    train_score = forest.score(train_X, train_y)
    print('Training Score:%f' % train_score)
    test_score = forest.score(test_X, test_y)
    print('Testing Score:%f' % test_score)
# Rebind the module-level split to the diabetes data; the later regressor
# experiments in this file reuse these names.
X_train, X_test, y_train, y_test = load_data_regression()
test_RandomForestRegressor(X_train, X_test, y_train, y_test)
森林中决策树的个数的影响
def test_RandomForestRegressor_num(*data):
    """Plot RandomForestRegressor scores against the number of trees.

    One regressor is fitted for every odd ``n_estimators`` value from 1 to
    99; its training and test scores are collected and both curves are drawn
    on a single axes (y-range -1 to 1), which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training targets and test targets.
    """
    train_X, test_X, train_y, test_y = data
    tree_counts = np.arange(1, 100, step=2)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Collect (train, test) score pairs, then split them into two curves.
    score_pairs = []
    for count in tree_counts:
        forest = ensemble.RandomForestRegressor(n_estimators=count)
        forest.fit(train_X, train_y)
        score_pairs.append(
            (forest.score(train_X, train_y), forest.score(test_X, test_y))
        )
    training_scores, testing_scores = zip(*score_pairs)

    ax.plot(tree_counts, training_scores, label='Training Score')
    ax.plot(tree_counts, testing_scores, label='Testing Score')
    ax.set_xlabel('estimator num')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(-1, 1)
    plt.suptitle('RandomForestRegressor')
    plt.show()
# Run the forest-size experiment for the regressor on the current split.
test_RandomForestRegressor_num(X_train, X_test, y_train, y_test)
max_depth参数的影响
def test_RandomForestRegressor_max_depth(*data):
    """Plot RandomForestRegressor scores against ``max_depth``.

    One regressor is fitted for every ``max_depth`` from 1 to 19; training
    and test scores are plotted on a single axes, which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training targets and test targets.
    """
    train_X, test_X, train_y, test_y = data
    depths = np.arange(1, 20)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    def _fit_and_score(depth):
        # Fit one forest at this depth; return (train score, test score).
        forest = ensemble.RandomForestRegressor(max_depth=depth)
        forest.fit(train_X, train_y)
        return forest.score(train_X, train_y), forest.score(test_X, test_y)

    training_scores = []
    testing_scores = []
    for depth in depths:
        on_train, on_test = _fit_and_score(depth)
        training_scores.append(on_train)
        testing_scores.append(on_test)

    curves = (
        (training_scores, 'Training Score'),
        (testing_scores, 'Testing Score'),
    )
    for values, legend_label in curves:
        ax.plot(depths, values, label=legend_label)
    ax.set_xlabel('max_depth')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('RandomForestRegressor')
    plt.show()
# Run the max_depth experiment for the regressor on the current split.
test_RandomForestRegressor_max_depth(X_train, X_test, y_train, y_test)
max_features参数的影响
def test_RandomForestRegressor_max_features(*data):
    """Plot RandomForestRegressor scores against ``max_features``.

    ``max_features`` is swept over ``np.linspace(0.01, 1.0)``; a regressor
    is fitted for each value and its training and test scores are plotted on
    a single axes, which is then shown.

    Args:
        *data: Four positional values, unpacked in order as training
            features, test features, training targets and test targets.
    """
    train_X, test_X, train_y, test_y = data
    fractions = np.linspace(0.01, 1.0)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # Collect (train, test) score pairs, then split them into two curves.
    score_pairs = []
    for fraction in fractions:
        forest = ensemble.RandomForestRegressor(max_features=fraction)
        forest.fit(train_X, train_y)
        score_pairs.append(
            (forest.score(train_X, train_y), forest.score(test_X, test_y))
        )
    training_scores, testing_scores = zip(*score_pairs)

    ax.plot(fractions, training_scores, label='Training Score')
    ax.plot(fractions, testing_scores, label='Testing Score')
    ax.set_xlabel('max_feature')
    ax.set_ylabel('score')
    ax.legend(loc='lower right')
    ax.set_ylim(0, 1.05)
    plt.suptitle('RandomForestRegressor')
    plt.show()
# Run the max_features experiment for the regressor on the current split.
test_RandomForestRegressor_max_features(X_train, X_test, y_train, y_test)