极限森林(Extra-Trees,极端随机树)
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

# A decision tree splits on the feature/threshold with the largest information gain.
# Extra-Trees (Extremely Randomized Trees) add two sources of randomness:
#   1. random samples, 2. random split conditions (not necessarily the best split).
# Like a random forest, a random subset of candidate features is used, but instead
# of searching for the most discriminative threshold, a threshold is drawn at
# random for each candidate feature and the best of these random thresholds is
# picked as the splitting rule.

# NOTE: passing True positionally was deprecated in sklearn 0.23 and removed in
# 1.2 — use the keyword form.
X, y = datasets.load_wine(return_X_y=True)

# Compare 6-fold CV accuracy of a single tree, a random forest and extra-trees.
clf = DecisionTreeClassifier()
cross_val_score(clf, X, y, cv=6, scoring="accuracy").mean()

forest = RandomForestClassifier(n_estimators=100)
cross_val_score(forest, X, y, cv=6, scoring="accuracy").mean()

extra = ExtraTreesClassifier(n_estimators=100)
cross_val_score(extra, X, y, cv=6, scoring="accuracy").mean()
结果:
0.8653256704980842
0.9777777777777779
0.9833333333333334
梯度提升树的使用
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Gradient-boosted trees on iris. `datasets` was imported earlier in the file.
# Keyword form required: positional return_X_y was removed in sklearn 1.2.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
gbdt.score(X_test, y_test)
结果:
0.9666666666666667
import numpy as np
import matplotlib.pyplot as plt
# Regression is classification taken to the limit:
# with enough distinct classes, classification becomes regression.
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn import tree

# Features X: time spent online and shopping amount.
# Targets y: ages 14 (grade 10), 16 (grade 12), 24 (fresh graduate), 26 (two years working).
X = np.array([[800, 3],
              [1200, 1],
              [1800, 4],
              [2500, 2]])
y = np.array([14, 16, 24, 26])

# Fit a small boosted ensemble and look at its in-sample predictions.
gbdt = GradientBoostingRegressor(n_estimators=10)
gbdt.fit(X, y)
gbdt.predict(X)
结果:
array([16.09207064, 17.39471376, 22.60528624, 23.90792936])
# Visualize the first boosting stage (stage 0, single-output regression).
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage0 = gbdt.estimators_[0, 0]
_ = tree.plot_tree(stage0, filled=True, feature_names=["消费", "上网"])
friedman_mse = ((y[:2] - y[:2].mean())**2).mean() = 1
value是14,16,24,26和20的差,即残差,残差越小——>越好——>越准确
# Visualize the second boosting stage (stage 1), fitted on the residuals.
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage1 = gbdt.estimators_[1, 0]
_ = tree.plot_tree(stage1, filled=True, feature_names=["消费", "上网"])
# learning_rate = 0.1
# Residuals after the first tree; one shrinkage step removes 10% of them.
gbdt1 = np.array([-6, -4, 6, 4])
# gradient boosting, learning rate 0.1
shrink = gbdt1 * 0.1
gbdt1 - shrink
结果:
array([-5.4, -3.6, 5.4, 3.6])
# learning_rate = 0.1
# Residuals after the second tree; apply the same 10% shrinkage step.
gbdt2 = np.array([-5.4, -3.6, 5.4, 3.6])
# gradient boosting, learning rate 0.1
shrink = gbdt2 * 0.1
gbdt2 - shrink
结果:
array([-4.86, -3.24, 4.86, 3.24])
# Visualize the third boosting stage (stage 2).
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
stage2 = gbdt.estimators_[2, 0]
_ = tree.plot_tree(stage2, filled=True, feature_names=["消费", "上网"])
最后一棵树
# Visualize the final boosting stage of the ensemble.
plt.rcParams["font.sans-serif"] = "KaiTi"
plt.figure(figsize=(9, 6))
last_stage = gbdt.estimators_[-1, 0]
_ = tree.plot_tree(last_stage, filled=True, feature_names=["消费", "上网"])
# learning_rate = 0.1
# Residuals before the last stage; one more 10% shrinkage step.
gbdt3 = np.array([-2.325, -1.55, 2.325, 1.55])
# gradient boosting, learning rate 0.1
shrink = gbdt3 * 0.1
gbdt3 - shrink
结果:
array([-2.0925,-1.395,2.0925,1.395])
array([-2.0925,-1.395,1.395,2.0925])
用 14,16,24,26 逐项减去上一行对应样本的残差(下减上)
16.0925,17.395,22.605,23.9075
gbdt.predict(X)
结果:
array([16.09207064, 17.39471376, 22.60528624, 23.90792936])
梯度上升与梯度下降
下降——沿梯度反方向做减法,迭代求最小值;上升——沿梯度方向做加法,迭代求最大值
import numpy as np
import matplotlib.pyplot as plt

# Convex objective: f(x) = (x-3)^2 + 2.5x - 7.5
f = lambda x: (x - 3) ** 2 + 2.5 * x - 7.5
f
# derivative = gradient; setting it to zero gives the analytic minimum:
#   2*(x-3) + 2.5 = 0  =>  x = 1.75
x = np.linspace(-2, 5, 100)
y = f(x)
plt.plot(x, y)
import numpy as np
import matplotlib.pyplot as plt

# Convex objective whose minimum we will find numerically below:
# f(x) = (x-3)^2 + 2.5x - 7.5
def f(x):
    return (x - 3) ** 2 + 2.5 * x - 7.5
f
# derivative = gradient
x = np.linspace(-2, 5, 100)
y = f(x)
plt.plot(x, y)
learning_rate = 0.1
# Derivative function of f: d(x) = 2*(x-3) + 2.5, zero at x = 1.75.
d = lambda x: 2 * (x - 3) + 2.5
# Random starting guess for the minimizer.
min_value = np.random.randint(-3, 5, size=1)[0]
print("---------------", min_value)
# Track the previous value so we can stop once the update falls below `tol`.
min_value_last = min_value + 0.1
tol = 0.0001
count = 0
while True:
    if np.abs(min_value - min_value_last) < tol:
        break
    # remember the previous iterate
    min_value_last = min_value
    # gradient-descent update: step against the gradient
    min_value = min_value - learning_rate * d(min_value)
    print("+++++++++++++++++%d" % (count), min_value)
    count = count + 1
print("****************", min_value)
结果:
----------------- 4
+++++++++++++++++0 3.55
+++++++++++++++++1 3.19
+++++++++++++++++2 2.902
+++++++++++++++++3 2.6716
+++++++++++++++++4 2.48728
+++++++++++++++++5 2.339824
+++++++++++++++++6 2.2218592
+++++++++++++++++7 2.12748736
+++++++++++++++++8 2.051989888
+++++++++++++++++9 1.9915919104
+++++++++++++++++10 1.94327352832
+++++++++++++++++11 1.904618822656
+++++++++++++++++12 1.8736950581248
+++++++++++++++++13 1.84895604649984
+++++++++++++++++14 1.829164837199872
+++++++++++++++++15 1.8133318697598977
+++++++++++++++++16 1.8006654958079182
+++++++++++++++++17 1.7905323966463347
+++++++++++++++++18 1.7824259173170678
+++++++++++++++++19 1.7759407338536541
+++++++++++++++++20 1.7707525870829233
+++++++++++++++++21 1.7666020696663387
+++++++++++++++++22 1.763281655733071
+++++++++++++++++23 1.760625324586457
+++++++++++++++++24 1.7585002596691655
+++++++++++++++++25 1.7568002077353324
+++++++++++++++++26 1.755440166188266
+++++++++++++++++27 1.7543521329506127
+++++++++++++++++28 1.7534817063604902
+++++++++++++++++29 1.7527853650883922
+++++++++++++++++30 1.7522282920707137
+++++++++++++++++31 1.751782633656571
+++++++++++++++++32 1.7514261069252568
+++++++++++++++++33 1.7511408855402055
+++++++++++++++++34 1.7509127084321645
+++++++++++++++++35 1.7507301667457316
+++++++++++++++++36 1.7505841333965853
+++++++++++++++++37 1.7504673067172682
+++++++++++++++++38 1.7503738453738147
***************** 1.7503738453738147
import numpy as np
import matplotlib.pyplot as plt
# Concave objective: f2(x) = -(x-3)^2 + 2.5x - 7.5, maximum at x = 4.25.
f2 = lambda x: -(x - 3) ** 2 + 2.5 * x - 7.5
# Gradient ascent; `result` records every iterate for plotting later.
result = []
# derivative function: d2(x) = -2*(x-3) + 2.5
d2 = lambda x: -2 * (x - 3) + 2.5
learning_rate = 0.1
# Random starting guess; gradient ascent will walk it to the optimum.
# Too large a learning rate can make the iteration diverge ("explode").
max_value = np.random.randint(2, 8, size=1)[0]
# max_value = 1000
result.append(max_value)
print('-------------------', max_value)
# Track the previous value; stop when the update is below `precision`.
max_value_last = max_value + 0.001
# tolerance: once the change is within 1e-4, the task is done
precision = 0.0001
count = 0
while True:
    if np.abs(max_value - max_value_last) < precision:
        break
    # remember the previous iterate
    max_value_last = max_value
    # gradient-ascent update: step along the gradient
    max_value = max_value + learning_rate * d2(max_value)
    result.append(max_value)
    count += 1
    print('+++++++++++++++++++++%d' % (count), max_value)
print('**********************', max_value)
# Plot the objective curve with the recorded ascent iterates on top of it.
plt.figure(figsize=(12, 9))
x = np.linspace(4, 8, 100)
y = f2(x)
plt.plot(x, y)
result = np.array(result)
plt.plot(result, f2(result), '*')
结果:
------------------- 5
+++++++++++++++++++++1 4.85
+++++++++++++++++++++2 4.7299999999999995
+++++++++++++++++++++3 4.6339999999999995
+++++++++++++++++++++4 4.5572
+++++++++++++++++++++5 4.49576
+++++++++++++++++++++6 4.4466079999999994
+++++++++++++++++++++7 4.407286399999999
+++++++++++++++++++++8 4.37582912
+++++++++++++++++++++9 4.350663296
+++++++++++++++++++++10 4.3305306368
+++++++++++++++++++++11 4.31442450944
+++++++++++++++++++++12 4.301539607552
+++++++++++++++++++++13 4.2912316860416
+++++++++++++++++++++14 4.2829853488332805
+++++++++++++++++++++15 4.276388279066625
+++++++++++++++++++++16 4.2711106232533
+++++++++++++++++++++17 4.26688849860264
+++++++++++++++++++++18 4.263510798882112
+++++++++++++++++++++19 4.260808639105689
+++++++++++++++++++++20 4.2586469112845515
+++++++++++++++++++++21 4.256917529027641
+++++++++++++++++++++22 4.255534023222113
+++++++++++++++++++++23 4.254427218577691
+++++++++++++++++++++24 4.2535417748621525
+++++++++++++++++++++25 4.252833419889722
+++++++++++++++++++++26 4.252266735911777
+++++++++++++++++++++27 4.251813388729422
+++++++++++++++++++++28 4.251450710983538
+++++++++++++++++++++29 4.251160568786831
+++++++++++++++++++++30 4.250928455029465
+++++++++++++++++++++31 4.250742764023572
+++++++++++++++++++++32 4.250594211218858
+++++++++++++++++++++33 4.250475368975087
+++++++++++++++++++++34 4.2503802951800695
********************** 4.2503802951800695