极限森林(Extra-Trees,极端随机树)
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

# A decision tree splits on the feature/threshold with the largest information gain.
# Extra-Trees (Extremely Randomized Trees) add two sources of randomness:
#   1. random samples, 2. random split conditions (not necessarily the best split).
# Like a random forest, a random subset of candidate features is used, but instead
# of searching for the most discriminative threshold, a threshold is drawn at
# random for each candidate feature and the best of these random thresholds is
# picked as the splitting rule.

# NOTE: passing True positionally was deprecated in sklearn 0.23 and removed in
# 1.2 — use the keyword form.
X, y = datasets.load_wine(return_X_y=True)

# Compare 6-fold CV accuracy of a single tree, a random forest and extra-trees.
clf = DecisionTreeClassifier()
cross_val_score(clf, X, y, cv=6, scoring="accuracy").mean()

forest = RandomForestClassifier(n_estimators=100)
cross_val_score(forest, X, y, cv=6, scoring="accuracy").mean()

extra = ExtraTreesClassifier(n_estimators=100)
cross_val_score(extra, X, y, cv=6, scoring="accuracy").mean()
结果:
0.8653256704980842
0.9777777777777779
0.9833333333333334
梯度提升树的使用
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Gradient-boosted trees on iris. `datasets` was imported earlier in the file.
# Keyword form required: positional return_X_y was removed in sklearn 1.2.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
结果:
0.9666666666666667
import numpy as np
import matplotlib.pyplot as plt
# Regression is classification taken to the limit:
# with enough distinct classes, classification becomes regression.
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn import tree

# Features X: time spent online and shopping amount.
# Targets y: ages 14 (grade 10), 16 (grade 12), 24 (fresh graduate), 26 (two years working).
X = np.array([[800, 3],
              [1200, 1],
              [1800, 4],
              [2500, 2]])
y = np.array([14, 16, 24, 26])

# Fit a small boosted ensemble and look at its in-sample predictions.
gbdt = GradientBoostingRegressor(n_estimators=10)
gbdt.fit(X, y)
gbdt.predict(X)
结果:
array([16.09207064, 17.39471376, 22.60528624, 23.90792936])
# Visualize the first boosting stage (stage 0, single-output regression).
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage0 = gbdt.estimators_[0, 0]
_ = tree.plot_tree(stage0, filled=True, feature_names=["消费", "上网"])
friedman_mse = ((y[:2] - y[:2].mean())**2).mean() = 1
value是14,16,24,26和20的差,即残差,残差越小——>越好——>越准确
# Visualize the second boosting stage (stage 1), fitted on the residuals.
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage1 = gbdt.estimators_[1, 0]
_ = tree.plot_tree(stage1, filled=True, feature_names=["消费", "上网"])
# learning_rate = 0.1
# Residuals after the first tree; one shrinkage step removes 10% of them.
gbdt1 = np.array([-6, -4, 6, 4])
# gradient boosting, learning rate 0.1
shrink = gbdt1 * 0.1
gbdt1 - shrink
结果:
array([-5.4, -3.6, 5.4, 3.6])
# learning_rate = 0.1
# Residuals after the second tree; apply the same 10% shrinkage step.
gbdt2 = np.array([-5.4, -3.6, 5.4, 3.6])
# gradient boosting, learning rate 0.1
shrink = gbdt2 * 0.1
gbdt2 - shrink
结果:
array([-4.86, -3.24, 4.86, 3.24])
# Visualize the third boosting stage (stage 2).
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage2 = gbdt.estimators_[2, 0]
_ = tree.plot_tree(stage2, filled=True, feature_names=["消费", "上网"])
最后一棵树
# Visualize the final boosting stage of the ensemble.
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
last_stage = gbdt.estimators_[-1, 0]
_ = tree.plot_tree(last_stage, filled=True, feature_names=["消费", "上网"])
# learning_rate = 0.1
# Residuals before the last stage; one more 10% shrinkage step.
gbdt3 = np.array([-2.325, -1.55, 2.325, 1.55])
# gradient boosting, learning rate 0.1
shrink = gbdt3 * 0.1
gbdt3 - shrink
结果:
array([-2.0925,-1.395,2.0925,1.395])
array([-2.0925,-1.395,1.395,2.0925])
用 14,16,24,26 逐项减去上一行对应样本的残差(下减上)
16.0925,17.395,22.605,23.9075
gbdt.predict(X)
结果:
array([16.09207064, 17.39471376, 22.60528624, 23.90792936])
梯度上升与梯度下降
下降——沿梯度反方向做减法,迭代求最小值;上升——沿梯度方向做加法,迭代求最大值
import numpy as np
import matplotlib.pyplot as plt

# Convex objective: f(x) = (x-3)^2 + 2.5x - 7.5
f = lambda x: (x - 3) ** 2 + 2.5 * x - 7.5
f
# derivative = gradient; setting it to zero gives the analytic minimum:
#   2*(x-3) + 2.5 = 0  =>  x = 1.75
x = np.linspace(-2, 5, 100)
y = f(x)
plt.plot(x, y)
import numpy as np
import matplotlib.pyplot as plt

# Convex objective whose minimum we will find numerically below:
# f(x) = (x-3)^2 + 2.5x - 7.5
def f(x):
    return (x - 3) ** 2 + 2.5 * x - 7.5
f
# derivative = gradient
x = np.linspace(-2, 5, 100)
y = f(x)
plt.plot(x, y)
learning_rate = 0.1
# Derivative function of f: d(x) = 2*(x-3) + 2.5, zero at x = 1.75.
d = lambda x: 2 * (x - 3) + 2.5
# Random starting guess for the minimizer.
min_value = np.random.randint(-3, 5, size=1)[0]
print("---------------", min_value)
# Track the previous value so we can stop once the update falls below `tol`.
min_value_last = min_value + 0.1
tol = 0.0001
count = 0
while True:
    if np.abs(min_value - min_value_last) < tol:
        break
    # remember the previous iterate
    min_value_last = min_value
    # gradient-descent update: step against the gradient
    min_value = min_value - learning_rate * d(min_value)
    print("+++++++++++++++++%d" % (count), min_value)
    count = count + 1
print("****************", min_value)
结果:
----------------- 4
+++++++++++++++++0 3.55
+++++++++++++++++1 3.19
+++++++++++++++++2 2.902
+++++++++++++++++3 2.6716
+++++++++++++++++4 2.48728
+++++++++++++++++5 2.339824
+++++++++++++++++6 2.2218592
+++++++++++++++++7 2.12748736
+++++++++++++++++8 2.051989888
+++++++++++++++++9 1.9915919104
+++++++++++++++++10 1.94327352832
+++++++++++++++++11 1.904618822656
+++++++++++++++++12 1.8736950581248
+++++++++++++++++13 1.84895604649984
+++++++++++++++++14 1.829164837199872
+++++++++++++++++15 1.8133318697598977
+++++++++++++++++16 1.8006654958079182
+++++++++++++++++17 1.7905323966463347
+++++++++++++++++18 1.7824259173170678
+++++++++++++++++19 1.7759407338536541
+++++++++++++++++20 1.7707525870829233
+++++++++++++++++21 1.7666020696663387
+++++++++++++++++22 1.763281655733071
+++++++++++++++++23 1.760625324586457
+++++++++++++++++24 1.7585002596691655
+++++++++++++++++25 1.7568002077353324
+++++++++++++++++26 1.755440166188266
+++++++++++++++++27 1.7543521329506127
+++++++++++++++++28 1.7534817063604902
+++++++++++++++++29 1.7527853650883922
+++++++++++++++++30 1.7522282920707137
+++++++++++++++++31 1.751782633656571
+++++++++++++++++32 1.7514261069252568
+++++++++++++++++33 1.7511408855402055
+++++++++++++++++34 1.7509127084321645
+++++++++++++++++35 1.7507301667457316
+++++++++++++++++36 1.7505841333965853
+++++++++++++++++37 1.7504673067172682
+++++++++++++++++38 1.7503738453738147
***************** 1.7503738453738147
import numpy as np
import matplotlib.pyplot as plt
# Concave objective: f2(x) = -(x-3)^2 + 2.5x - 7.5, maximum at x = 4.25.
f2 = lambda x: -(x - 3) ** 2 + 2.5 * x - 7.5
# Gradient ascent; `result` records every iterate for plotting later.
result = []
# derivative function: d2(x) = -2*(x-3) + 2.5
d2 = lambda x: -2 * (x - 3) + 2.5
learning_rate = 0.1
# Random starting guess; gradient ascent will walk it to the optimum.
# Too large a learning rate can make the iteration diverge ("explode").
max_value = np.random.randint(2, 8, size=1)[0]
# max_value = 1000
result.append(max_value)
print('-------------------', max_value)
# Track the previous value; stop when the update is below `precision`.
max_value_last = max_value + 0.001
# tolerance: once the change is within 1e-4, the task is done
precision = 0.0001
count = 0
while True:
    if np.abs(max_value - max_value_last) < precision:
        break
    # remember the previous iterate
    max_value_last = max_value
    # gradient-ascent update: step along the gradient
    max_value = max_value + learning_rate * d2(max_value)
    result.append(max_value)
    count += 1
    print('+++++++++++++++++++++%d' % (count), max_value)
print('**********************', max_value)
# Plot the objective curve with the recorded ascent iterates on top of it.
plt.figure(figsize=(12, 9))
x = np.linspace(4, 8, 100)
y = f2(x)
plt.plot(x, y)
result = np.array(result)
plt.plot(result, f2(result), '*')
结果:
------------------- 5
+++++++++++++++++++++1 4.85
+++++++++++++++++++++2 4.7299999999999995
+++++++++++++++++++++3 4.6339999999999995
+++++++++++++++++++++4 4.5572
+++++++++++++++++++++5 4.49576
+++++++++++++++++++++6 4.4466079999999994
+++++++++++++++++++++7 4.407286399999999
+++++++++++++++++++++8 4.37582912
+++++++++++++++++++++9 4.350663296
+++++++++++++++++++++10 4.3305306368
+++++++++++++++++++++11 4.31442450944
+++++++++++++++++++++12 4.301539607552
+++++++++++++++++++++13 4.2912316860416
+++++++++++++++++++++14 4.2829853488332805
+++++++++++++++++++++15 4.276388279066625
+++++++++++++++++++++16 4.2711106232533
+++++++++++++++++++++17 4.26688849860264
+++++++++++++++++++++18 4.263510798882112
+++++++++++++++++++++19 4.260808639105689
+++++++++++++++++++++20 4.2586469112845515
+++++++++++++++++++++21 4.256917529027641
+++++++++++++++++++++22 4.255534023222113
+++++++++++++++++++++23 4.254427218577691
+++++++++++++++++++++24 4.2535417748621525
+++++++++++++++++++++25 4.252833419889722
+++++++++++++++++++++26 4.252266735911777
+++++++++++++++++++++27 4.251813388729422
+++++++++++++++++++++28 4.251450710983538
+++++++++++++++++++++29 4.251160568786831
+++++++++++++++++++++30 4.250928455029465
+++++++++++++++++++++31 4.250742764023572
+++++++++++++++++++++32 4.250594211218858
+++++++++++++++++++++33 4.250475368975087
+++++++++++++++++++++34 4.2503802951800695
********************** 4.2503802951800695