集成学习——Random Forest

最新推荐文章于 2021-08-19 14:18:54 发布

小小蒲公英

最新推荐文章于 2021-08-19 14:18:54 发布

阅读量431

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/weixin_39777626/article/details/79859844

版权

机器学习专栏收录该内容

44 篇文章 1 订阅

订阅专栏

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets,cross_validation,ensemble

加载数据集

# Diabetes dataset (used for the regression examples)
def load_data_regression():
    """Load the diabetes dataset and split it into train and test parts.

    Returns:
        The result of ``train_test_split`` applied to the dataset's ``data``
        and ``target`` arrays, which callers unpack as
        ``X_train, X_test, y_train, y_test``. A quarter of the samples is
        held out for testing and the split is seeded with ``random_state=0``.
    """
    # ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and
    # removed in 0.20; ``sklearn.model_selection`` is its replacement. Only
    # fall back to the old module on releases that predate the new one.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    diabetes = datasets.load_diabetes()
    return train_test_split(
        diabetes.data, diabetes.target, test_size=0.25, random_state=0
    )

# Handwritten digit recognition dataset (Digit Dataset)
def load_data_classification():
    """Load the digits dataset and split it into train and test parts.

    Returns:
        The result of ``train_test_split`` applied to the dataset's ``data``
        and ``target`` arrays, which callers unpack as
        ``X_train, X_test, y_train, y_test``. A quarter of the samples is
        held out for testing and the split is seeded with ``random_state=0``.
    """
    # ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and
    # removed in 0.20; ``sklearn.model_selection`` is its replacement. Only
    # fall back to the old module on releases that predate the new one.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    digits = datasets.load_digits()
    return train_test_split(
        digits.data, digits.target, test_size=0.25, random_state=0
    )

RandomForestClassifier随机森林分类器

模型原型

class sklearn.ensemble.RandomForestClassifier(n_estimators=10,criterion='gini',max_depth=None,min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0.0,max_features='auto',max_leaf_nodes=None,bootstrap=True,oob_score=False, n_jobs=1,random_state=None,verbose=0,warm_start=False,class_weight=None)
参数

n_estimators:随机森林中决策树的数量
criterion
max_depth
min_samples_split
min_samples_leaf
min_weight_fraction_leaf
max_features
max_leaf_nodes
bootstrap
oob_score:如果为True，则使用包外样本来计算泛化误差
n_jobs
random_state
verbose
warm_start
class_weight

属性

estimators_:所有训练过的基础分类器
classes_
n_classes_
n_features_
n_outputs_
feature_importances_
oob_score_:训练数据使用包外估计时的得分

方法

fit(X,y[,sample_weight])
predict(X)
predict_log_proba(X)
predict_proba(X)
score(X,y[,sample_weight])

使用RandomForestClassifier类

def test_RandomForestClassifier(*data):
    """Fit a default random forest classifier and print its two scores.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    forest = ensemble.RandomForestClassifier()
    forest.fit(train_X, train_y)

    # Score on the data the model was fitted on, then on the held-out data.
    train_score = forest.score(train_X, train_y)
    print('Training Score:%f' % train_score)
    test_score = forest.score(test_X, test_y)
    print('Testing Score:%f' % test_score)

# Build the digits split once; the classifier experiments reuse these
# module-level names.
X_train, X_test, y_train, y_test = load_data_classification()
test_RandomForestClassifier(
    X_train,
    X_test,
    y_train,
    y_test,
)

森林中决策树的个数的影响

def test_RandomForestClassifier_num(*data):
    """Plot random forest classifier scores as ``n_estimators`` varies.

    Trains one ``RandomForestClassifier`` for every odd tree count from 1
    to 99 and draws the training and testing scores on a single axes.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    tree_counts = np.arange(1, 100, step=2)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for n_trees in tree_counts:
        forest = ensemble.RandomForestClassifier(n_estimators=n_trees)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(tree_counts, values, label=label)
    axes.set_xlabel('estimator num')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()

# Sweep the number of trees using the current module-level split.
test_RandomForestClassifier_num(
    X_train,
    X_test,
    y_train,
    y_test,
)

max_depth参数的影响

def test_RandomForestClassifier_max_depth(*data):
    """Plot random forest classifier scores as ``max_depth`` varies.

    Trains one ``RandomForestClassifier`` per depth from 1 to 19 and draws
    the training and testing scores on a single axes.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    depth_grid = np.arange(1, 20)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for depth in depth_grid:
        forest = ensemble.RandomForestClassifier(max_depth=depth)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(depth_grid, values, label=label)
    axes.set_xlabel('max_depth')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()

# Sweep the tree depth limit using the current module-level split.
test_RandomForestClassifier_max_depth(
    X_train,
    X_test,
    y_train,
    y_test,
)

max_features参数的影响

def test_RandomForestClassifier_max_features(*data):
    """Plot random forest classifier scores as ``max_features`` varies.

    Trains one ``RandomForestClassifier`` per value in an evenly spaced grid
    from 0.01 to 1.0 and draws the training and testing scores on a single
    axes.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    fraction_grid = np.linspace(0.01, 1.0)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for fraction in fraction_grid:
        forest = ensemble.RandomForestClassifier(max_features=fraction)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(fraction_grid, values, label=label)
    axes.set_xlabel('max_feature')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(0, 1.05)
    plt.suptitle('RandomForestClassifier')
    plt.show()

# Sweep the max_features setting using the current module-level split.
test_RandomForestClassifier_max_features(
    X_train,
    X_test,
    y_train,
    y_test,
)

RandomForestRegressor随机森林回归器

模型原型

class sklearn.ensemble.RandomForestRegressor(n_estimators=10,criterion='mse',max_depth=None,min_samples_split=2, min_samples_leaf=1,min_weight_fraction_leaf=0.0,max_features='auto',max_leaf_nodes=None,bootstrap=True,oob_score=False, n_jobs=1,random_state=None,verbose=0,warm_start=False)
参数

n_estimators:随机森林中决策树的数量
criterion
max_depth
min_samples_split
min_samples_leaf
min_weight_fraction_leaf
max_features
max_leaf_nodes
bootstrap
oob_score:如果为True，则使用包外样本来计算泛化误差
n_jobs
random_state
verbose
warm_start

属性

estimators_:所有训练过的基础回归器
n_features_
n_outputs_
feature_importances_
oob_score_:训练数据使用包外估计时的得分
oob_prediction_

方法

fit(X,y[,sample_weight])
predict(X)
score(X,y[,sample_weight])

使用RandomForestRegressor类

def test_RandomForestRegressor(*data):
    """Fit a default random forest regressor and print its two scores.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    forest = ensemble.RandomForestRegressor()
    forest.fit(train_X, train_y)

    # Score on the data the model was fitted on, then on the held-out data.
    train_score = forest.score(train_X, train_y)
    print('Training Score:%f' % train_score)
    test_score = forest.score(test_X, test_y)
    print('Testing Score:%f' % test_score)

# Rebind the module-level split to the diabetes data; the regressor
# experiments reuse these names.
X_train, X_test, y_train, y_test = load_data_regression()
test_RandomForestRegressor(
    X_train,
    X_test,
    y_train,
    y_test,
)

森林中决策树的个数的影响

def test_RandomForestRegressor_num(*data):
    """Plot random forest regressor scores as ``n_estimators`` varies.

    Trains one ``RandomForestRegressor`` for every odd tree count from 1 to
    99 and draws the training and testing scores on a single axes whose
    y-range is fixed to ``[-1, 1]``.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    tree_counts = np.arange(1, 100, step=2)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for n_trees in tree_counts:
        forest = ensemble.RandomForestRegressor(n_estimators=n_trees)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(tree_counts, values, label=label)
    axes.set_xlabel('estimator num')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(-1, 1)
    plt.suptitle('RandomForestRegressor')
    plt.show()

# Sweep the number of trees using the current module-level split.
test_RandomForestRegressor_num(
    X_train,
    X_test,
    y_train,
    y_test,
)

max_depth参数的影响

def test_RandomForestRegressor_max_depth(*data):
    """Plot random forest regressor scores as ``max_depth`` varies.

    Trains one ``RandomForestRegressor`` per depth from 1 to 19 and draws
    the training and testing scores on a single axes.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    depth_grid = np.arange(1, 20)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for depth in depth_grid:
        forest = ensemble.RandomForestRegressor(max_depth=depth)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(depth_grid, values, label=label)
    axes.set_xlabel('max_depth')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(0, 1.05)
    plt.suptitle('RandomForestRegressor')
    plt.show()

# Sweep the tree depth limit using the current module-level split.
test_RandomForestRegressor_max_depth(
    X_train,
    X_test,
    y_train,
    y_test,
)

max_features参数的影响

def test_RandomForestRegressor_max_features(*data):
    """Plot random forest regressor scores as ``max_features`` varies.

    Trains one ``RandomForestRegressor`` per value in an evenly spaced grid
    from 0.01 to 1.0 and draws the training and testing scores on a single
    axes.

    Args:
        *data: Exactly four positional values, unpacked as
            ``X_train, X_test, y_train, y_test``.
    """
    train_X, test_X, train_y, test_y = data
    fraction_grid = np.linspace(0.01, 1.0)
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)

    train_curve, test_curve = [], []
    for fraction in fraction_grid:
        forest = ensemble.RandomForestRegressor(max_features=fraction)
        forest.fit(train_X, train_y)
        train_curve.append(forest.score(train_X, train_y))
        test_curve.append(forest.score(test_X, test_y))

    curves = ((train_curve, 'Training Score'), (test_curve, 'Testing Score'))
    for values, label in curves:
        axes.plot(fraction_grid, values, label=label)
    axes.set_xlabel('max_feature')
    axes.set_ylabel('score')
    axes.legend(loc='lower right')
    axes.set_ylim(0, 1.05)
    plt.suptitle('RandomForestRegressor')
    plt.show()

# Sweep the max_features setting using the current module-level split.
test_RandomForestRegressor_max_features(
    X_train,
    X_test,
    y_train,
    y_test,
)