机器学习:boosting复习

非原创,代码来源葁sir

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
# Ada的回归& GBDT的回归
from sklearn.datasets import load_boston
# 波士顿房价
from sklearn.neighbors import KNeighborsRegressor
# Load the Boston housing dataset and pull out the feature matrix, the
# target vector and the feature (column) names.
boston = load_boston()
data, target, feature_names = boston.data, boston.target, boston.feature_names

# Hold out 20% of the samples for testing; random_state is fixed so the
# split is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=1
)
# Wrap the training features in a DataFrame so the columns carry their names.
X_train = DataFrame(X_train, columns=feature_names)

# Plain KNN regressor, used as the baseline for comparison.
knn = KNeighborsRegressor()
X_train
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.141500.06.910.00.4486.1696.65.72093.0233.017.9383.375.81
10.1544525.05.130.00.4536.14529.27.81488.0284.019.7390.686.86
216.811800.018.100.00.7005.27798.11.426124.0666.020.2396.9030.81
30.056460.012.830.00.4376.23253.75.01415.0398.018.7386.4012.34
48.792120.018.100.00.5845.56570.62.063524.0666.020.23.6517.16
..........................................
3990.0354880.03.640.00.3925.87619.19.22031.0315.016.4395.189.25
4000.091640.010.810.00.4136.0657.85.28734.0305.019.2390.915.52
4015.872050.018.100.00.6936.40596.01.676824.0666.020.2396.9019.37
4020.330450.06.200.00.5076.08661.53.65198.0307.017.4376.7510.88
4030.080140.05.960.00.4995.85041.53.93425.0279.019.2396.908.77

404 rows × 13 columns

X_train.describe().T
countmeanstdmin25%50%75%max
CRIM404.03.6974559.1467430.006320.0825980.2344053.59492788.9762
ZN404.011.52722823.2882840.000000.0000000.00000020.000000100.0000
INDUS404.011.0775006.8484120.460005.1900009.12500018.10000027.7400
CHAS404.00.0792080.2703980.000000.0000000.0000000.0000001.0000
NOX404.00.5530260.1168950.385000.4480000.5350000.6240000.8710
RM404.06.2687920.6892293.561005.8767506.1790006.6265008.7800
AGE404.067.93564428.5631862.9000043.25000076.80000093.825000100.0000
DIS404.03.8261112.1209991.129602.1053503.2986005.14147512.1265
RAD404.09.4702978.6802371.000004.0000005.00000024.00000024.0000
TAX404.0403.257426169.030480187.00000277.000000329.000000666.000000711.0000
PTRATIO404.018.4386142.16946912.6000017.22500019.00000020.20000022.0000
B404.0357.15368891.5416470.32000376.092500391.575000396.157500396.9000
LSTAT404.012.7785407.2164031.730007.09250011.46500017.10250037.9700
X_train.min() # no negative values, so min-max scaling can be used to compress the features into [0, 1]
CRIM         0.00632
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.73000
dtype: float64
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: compress every feature into the [0, 1] range.
mms = MinMaxScaler()
# Fit the scaler on the training split only, so that no test-set statistics
# leak into the preprocessing.
data = mms.fit_transform(X_train)
# Apply the SAME fitted transformation to the test split. Without this step
# the models below are trained on [0, 1] features but evaluated on raw-scale
# features, which makes every test-set error meaningless (hugely inflated).
# X_test is wrapped in a DataFrame with the training column names on the way
# in and on the way out so that feature names stay consistent with X_train.
# NOTE: the test-set numbers recorded further down in this notebook were
# produced without this step and will change once it is applied.
X_test = DataFrame(
    data=mms.transform(DataFrame(data=X_test, columns=feature_names)),
    columns=feature_names,
)

feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')
# Wrap the scaled training array back into a DataFrame, restoring the feature column names.
X_train = pd.DataFrame(data=data,columns=feature_names)
X_train
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
00.0015190.000.2364370.00.1296300.4997130.0381050.4175090.0869570.0877860.5638300.9658830.112583
10.0016650.250.1711880.00.1399180.4951140.2708550.6079170.3043480.1851150.7553190.9843160.141556
20.1888900.000.6466280.00.6481480.3287990.9804330.0269621.0000000.9141220.8085111.0000000.802428
30.0005640.000.4534460.00.1069960.5117840.5231720.3532360.1739130.4026720.6489360.9735240.292770
40.0987500.000.6466280.00.4094650.3839820.6972190.0849241.0000000.9141220.8085110.0083970.425773
..........................................
3990.0003280.800.1165690.00.0144030.4435720.1668380.7357260.0000000.2442750.4042550.9956630.207506
4000.0009590.000.3793990.00.0576130.4797850.0504630.3780790.1304350.2251910.7021280.9848960.104581
4010.0659290.000.6466280.00.6337450.5449320.9588050.0497591.0000000.9141220.8085111.0000000.486755
4020.0036430.000.2104110.00.2510290.4838090.6035020.2293650.3043480.2290080.5106380.9491910.252483
4030.0008300.000.2016130.00.2345680.4385900.3975280.2550360.1739130.1755730.7021281.0000000.194260

404 rows × 13 columns

knn.fit(X_train,y_train)
KNeighborsRegressor()
# Which metric should judge a regression model: score? MAE? MSE? (MSE is used below)
from sklearn.metrics import mean_squared_error
# Training-set MSE of the baseline KNN model
mean_squared_error(y_train,knn.predict(X_train))
15.028524752475246
mean_squared_error(y_test,knn.predict(X_test))
204.1549686274509

实验:adaboosting

# AdaBoost ensemble built from up to 100 KNN base regressors.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
# was removed in 1.4, whereas the first positional slot is the base learner
# in both old and new versions.
aba = AdaBoostRegressor(KNeighborsRegressor(), n_estimators=100)
aba.fit(X_train, y_train)
AdaBoostRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100)
mean_squared_error(y_train,aba.predict(X_train))
4.755455445544554
mean_squared_error(y_test,aba.predict(X_test))
127.99668627450978
# Test-set MSE of the ensemble after each boosting stage: staged_predict
# yields one prediction vector per stage, and each is scored against y_test.
err_list = [
    mean_squared_error(y_test, staged_pred)
    for staged_pred in aba.staged_predict(X_test)
]
# Report the per-stage errors in order.
for stage, stage_mse in enumerate(err_list):
    print('C{}:ERROR:{}'.format(stage, stage_mse))
C0:ERROR:199.32693725490194
C1:ERROR:199.32693725490194
C2:ERROR:220.0897333333333
C3:ERROR:199.7595607843137
C4:ERROR:207.813894117647
C5:ERROR:207.82781176470584
C6:ERROR:192.8428588235294
C7:ERROR:190.94257647058825
C8:ERROR:192.6760862745098
C9:ERROR:189.0082431372549
C10:ERROR:188.7143254901961
C11:ERROR:184.35898823529416
C12:ERROR:179.3270117647059
C13:ERROR:178.614568627451
C14:ERROR:166.1954274509804
C15:ERROR:169.03231372549018
C16:ERROR:158.3203725490196
C17:ERROR:158.4866470588235
C18:ERROR:142.9087137254902
C19:ERROR:158.1416
C20:ERROR:142.4846666666667
C21:ERROR:142.4846666666667
C22:ERROR:157.30826666666667
C23:ERROR:157.38923529411767
C24:ERROR:157.35952941176473
C25:ERROR:142.61410196078435
C26:ERROR:135.35090980392155
C27:ERROR:142.68588627450984
C28:ERROR:135.77389411764707
C29:ERROR:142.68588627450984
C30:ERROR:142.04365098039216
C31:ERROR:141.93987843137253
C32:ERROR:135.87796470588236
C33:ERROR:141.93987843137253
C34:ERROR:141.82033725490197
C35:ERROR:141.95123529411765
C36:ERROR:135.73551764705883
C37:ERROR:141.9422549019608
C38:ERROR:135.90520784313725
C39:ERROR:141.4849843137255
C40:ERROR:135.90342352941178
C41:ERROR:142.09143921568628
C42:ERROR:135.91702352941178
C43:ERROR:136.00622745098042
C44:ERROR:135.89461960784314
C45:ERROR:142.01684705882352
C46:ERROR:135.91702352941178
C47:ERROR:136.18957647058824
C48:ERROR:136.2971137254902
C49:ERROR:136.19823137254903
C50:ERROR:134.7982470588235
C51:ERROR:136.41440000000003
C52:ERROR:136.29789803921568
C53:ERROR:136.4323411764706
C54:ERROR:134.9813450980392
C55:ERROR:136.3194980392157
C56:ERROR:136.40856078431375
C57:ERROR:136.41440000000003
C58:ERROR:142.32514901960786
C59:ERROR:136.48449803921568
C60:ERROR:142.32336470588237
C61:ERROR:142.43412941176473
C62:ERROR:142.40421176470588
C63:ERROR:136.38720784313728
C64:ERROR:142.4454392156863
C65:ERROR:136.4293843137255
C66:ERROR:142.46783137254903
C67:ERROR:136.48109019607844
C68:ERROR:142.5336549019608
C69:ERROR:144.18005490196077
C70:ERROR:142.50667450980393
C71:ERROR:136.4272549019608
C72:ERROR:142.50667450980393
C73:ERROR:142.5022117647059
C74:ERROR:142.5354392156863
C75:ERROR:136.4629137254902
C76:ERROR:136.31138823529412
C77:ERROR:135.0016
C78:ERROR:136.49930196078432
C79:ERROR:135.03171764705883
C80:ERROR:135.01974901960781
C81:ERROR:128.98411372549018
C82:ERROR:135.078631372549
C83:ERROR:129.0248196078431
C84:ERROR:135.078631372549
C85:ERROR:128.98411372549018
C86:ERROR:135.078631372549
C87:ERROR:128.9571333333333
C88:ERROR:128.96369803921567
C89:ERROR:128.6254862745098
C90:ERROR:129.0248196078431
C91:ERROR:128.6512274509804
C92:ERROR:128.39080392156865
C93:ERROR:128.39080392156865
C94:ERROR:128.40443529411766
C95:ERROR:128.68032549019608
C96:ERROR:128.39080392156865
C97:ERROR:128.2014862745098
C98:ERROR:128.39080392156865
C99:ERROR:127.99668627450978
# Plot how the test-set error (err_list) changes across boosting stages
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()
plt.plot(err_list)
[<matplotlib.lines.Line2D at 0x1e4f02ccf40>]

请添加图片描述

# Plot the error of each base estimator in the AdaBoost ensemble
plt.plot(aba.estimator_errors_)
[<matplotlib.lines.Line2D at 0x1e4ee793070>]

请添加图片描述

# Plot the weight of each base estimator in the AdaBoost ensemble
plt.plot(aba.estimator_weights_)
[<matplotlib.lines.Line2D at 0x1e4ee42f940>]

请添加图片描述

GBDT观察表现

from sklearn.ensemble import RandomForestRegressor
# max_depth=None means fully grown decision trees
RandomForestRegressor()
# Now look at GBDT: tree depth is limited to 3 (max_depth=3) because boosting needs weak learners
gbdt = GradientBoostingRegressor(n_estimators=100)
# Fit the GBDT model on the training split
gbdt.fit(X_train,y_train)
GradientBoostingRegressor()
mean_squared_error(y_train,gbdt.predict(X_train))
1.7840841714565248
mean_squared_error(y_test,gbdt.predict(X_test))
159.79621357980093

特征评估结果

# aba.feature_importances_ # knn作为基学习器的时候 没有这个属性 但是可以把基学习器换成决策树等自带feature_importances_的模型
---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in feature_importances_(self)
    253             norm = self.estimator_weights_.sum()
--> 254             return (sum(weight * clf.feature_importances_ for weight, clf
    255                     in zip(self.estimator_weights_, self.estimators_))


D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in <genexpr>(.0)
    253             norm = self.estimator_weights_.sum()
--> 254             return (sum(weight * clf.feature_importances_ for weight, clf
    255                     in zip(self.estimator_weights_, self.estimators_))


AttributeError: 'KNeighborsRegressor' object has no attribute 'feature_importances_'

The above exception was the direct cause of the following exception:


AttributeError                            Traceback (most recent call last)

~\AppData\Local\Temp/ipykernel_12168/2668898732.py in <module>
----> 1 aba.feature_importances_


D:\software\anaconda\lib\site-packages\sklearn\ensemble\_weight_boosting.py in feature_importances_(self)
    257 
    258         except AttributeError as e:
--> 259             raise AttributeError(
    260                 "Unable to compute feature importances "
    261                 "since base_estimator does not have a "


AttributeError: Unable to compute feature importances since base_estimator does not have a feature_importances_ attribute
# Bar chart of the GBDT feature importances, labelled by feature name.
pd.Series(gbdt.feature_importances_, index=feature_names).plot.bar()
<AxesSubplot:>

请添加图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值