import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
Data Loading
data = pd.read_csv("./data/otto/train.csv")
data.head()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
3 | 4 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_1 |
5 rows × 95 columns
- id - the product id
- feat_1, feat_2, …, feat_93 - the 93 product features
- target - the class the product belongs to
data.shape
(61878, 95)
data.describe()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 61878.000000 | 61878.00000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | ... | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 |
mean | 30939.500000 | 0.38668 | 0.263066 | 0.901467 | 0.779081 | 0.071043 | 0.025696 | 0.193704 | 0.662433 | 1.011296 | ... | 0.070752 | 0.532306 | 1.128576 | 0.393549 | 0.874915 | 0.457772 | 0.812421 | 0.264941 | 0.380119 | 0.126135 |
std | 17862.784315 | 1.52533 | 1.252073 | 2.934818 | 2.788005 | 0.438902 | 0.215333 | 1.030102 | 2.255770 | 3.474822 | ... | 1.151460 | 1.900438 | 2.681554 | 1.575455 | 2.115466 | 1.527385 | 4.597804 | 2.045646 | 0.982385 | 1.201720 |
min | 1.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 15470.250000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30939.500000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 46408.750000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 61878.000000 | 61.00000 | 51.000000 | 64.000000 | 70.000000 | 19.000000 | 10.000000 | 38.000000 | 76.000000 | 43.000000 | ... | 76.000000 | 55.000000 | 65.000000 | 67.000000 | 30.000000 | 61.000000 | 130.000000 | 52.000000 | 19.000000 | 87.000000 |
8 rows × 94 columns
# Visualize the class distribution
import seaborn as sns
sns.countplot(x=data["target"])
plt.show()
The plot above shows that the classes are imbalanced, which will need to be handled later.
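The imbalance can also be quantified instead of read off the plot; a minimal check on the same data frame:
# Count samples per class; the largest class has several times the samples of the smallest
print(data["target"].value_counts())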
Basic Data Processing
The data has already been anonymized, so no further special treatment is needed.
- Data anonymization: a technique that transforms or modifies sensitive data such as phone numbers and bank card numbers, so that the sensitive data cannot be used directly in untrusted environments.
Taking a Subset of the Data
new1_data = data[:10000]
new1_data.shape
(10000, 95)
# Visualize the class distribution of the slice
import seaborn as sns
sns.countplot(x=new1_data["target"])
plt.show()
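The numbers behind that plot make the problem concrete: the file appears to be ordered by class (the head above is all Class_1), so a head slice covers only the first few classes. A quick check:
# Class counts inside the 10,000-row slice; later classes are missing or underrepresented
print(new1_data["target"].value_counts())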
Slicing the data this way is therefore not workable, so we use random under-sampling to obtain a representative sample instead.
# Random under-sampling
# First separate the feature values and the label values
y = data["target"]
x = data.drop(["id", "target"], axis=1)
x.head()
feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | feat_10 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | 1 | ... | 22 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 93 columns
y.head()
0    Class_1
1    Class_1
2    Class_1
3    Class_1
4    Class_1
Name: target, dtype: object
# Under-sample the data
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(x, y)
x.shape, y.shape
((61878, 93), (61878,))
X_resampled.shape, y_resampled.shape
((17361, 93), (17361,))
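The resampled size is no accident: 17361 = 9 × 1929, because RandomUnderSampler shrinks every class down to the size of the smallest one. A quick check (y_resampled comes back as a pandas Series here, as its .head() call below confirms):
# Every class should now have exactly 17361 / 9 = 1929 samples
print(y_resampled.value_counts())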
# Visualize the resampled class distribution
import seaborn as sns
sns.countplot(x=y_resampled)
plt.show()
Converting the labels to numbers
y_resampled.head()
0    Class_1
1    Class_1
2    Class_1
3    Class_1
4    Class_1
Name: target, dtype: object
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)
y_resampled
array([0, 0, 0, ..., 8, 8, 8])
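LabelEncoder assigns codes by the sorted order of the class strings, so Class_1 → 0 through Class_9 → 8; the mapping is recoverable from the fitted encoder:
# classes_[k] is the original label that was encoded as k
print(le.classes_)                   # ['Class_1' 'Class_2' ... 'Class_9']
print(le.inverse_transform([0, 8]))  # ['Class_1' 'Class_9']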
Splitting the Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)
x_train.shape, y_train.shape
((13888, 93), (13888,))
x_test.shape, y_test.shape
((3473, 93), (3473,))
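Note that the split above is neither stratified nor seeded, so the class balance achieved by under-sampling can drift slightly between train and test, and reruns give different splits. A hedged variant that fixes both:
x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2,
    stratify=y_resampled,  # keep all nine classes balanced in both splits
    random_state=0)        # make the split reproducible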
Model Training
Baseline Model
from sklearn.ensemble import RandomForestClassifier  # random forest
rf = RandomForestClassifier(oob_score=True)  # keep an out-of-bag accuracy estimate
rf.fit(x_train, y_train)
RandomForestClassifier(oob_score=True)
y_pre = rf.predict(x_test)
y_pre
array([3, 7, 8, ..., 3, 2, 5])
rf.score(x_test, y_test)
0.7854880506766484
rf.oob_score_
0.7633208525345622
# Visualize the distribution of the predicted classes
import seaborn as sns
sns.countplot(x=y_pre)
plt.show()
# log loss evaluation (this version fails, see below)
# from sklearn.metrics import log_loss
# log_loss(y_test, y_pre, eps=1e-15, normalize=True)
# y_test, y_pre
The call above raises an error because log_loss expects a 2-D array with one column per class rather than 1-D hard labels,
so we convert this multi-class problem's outputs to one-hot form with OneHotEncoder:
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder(sparse=False)
y_test1 = one_hot.fit_transform(y_test.reshape(-1, 1))
y_pre1 = one_hot.transform(y_pre.reshape(-1, 1))  # reuse the fitted encoder so the columns stay aligned
y_test1
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
y_pre1
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
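Because the encoded labels are already the integers 0–8 (and, with this balanced sample, all nine classes occur in y_test), the same one-hot matrices can be built directly with NumPy; an equivalent sketch:
# Row k of the 9x9 identity matrix is the one-hot vector for class k
y_test1_alt = np.eye(9)[y_test]
print((y_test1_alt == y_test1).all())  # True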
# log loss of the hard one-hot predictions
log_loss(y_test1, y_pre1, eps=1e-15, normalize=True)
7.40898025171566
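This 7.41 can be reproduced by hand. Under the clipping rule (see the metric section below), a correct hard prediction contributes ≈ 0 and a wrong one contributes -log(1e-15) ≈ 34.54, so the loss is roughly the error rate times 34.54:
# (1 - accuracy) * 34.54 ≈ 0.2145 * 34.54 ≈ 7.41, matching the score above
print((1 - rf.score(x_test, y_test)) * -np.log(1e-15))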
# Switch to probability outputs; soft per-class probabilities lower the log loss
y_pre_proba = rf.predict_proba(x_test)
y_pre_proba
array([[0.01, 0.16, 0.16, ..., 0.06, 0.  , 0.  ],
       [0.14, 0.  , 0.  , ..., 0.02, 0.81, 0.02],
       [0.1 , 0.01, 0.03, ..., 0.1 , 0.03, 0.5 ],
       ...,
       [0.02, 0.23, 0.3 , ..., 0.09, 0.  , 0.05],
       [0.  , 0.27, 0.48, ..., 0.01, 0.01, 0.08],
       [0.02, 0.01, 0.  , ..., 0.02, 0.02, 0.  ]])
# log loss of the probability predictions
log_loss(y_test1, y_pre_proba, eps=1e-15, normalize=True)
0.7800586622785408
Model Tuning
Parameters to tune: n_estimators, max_features, max_depth, min_samples_leaf
Evaluation Metric
The competition metric is the multiclass log loss:

$$\text{logloss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\,\log(p_{ij})$$

In this formula,
- i indexes samples and j indexes classes; $p_{ij}$ is the predicted probability that sample i belongs to class j,
- $y_{ij}$ is 1 if sample i actually belongs to class j, and 0 otherwise.
- If every test sample is classified correctly, every relevant $p_{ij}$ is 1, every $\log(p_{ij})$ is 0, and the final log loss is 0.
- If sample 1 actually belongs to class 1 but is assigned probability $p_{ij} = 0.1$, the loss accumulates the term $-\log(0.1)$. This term is positive, and the smaller $p_{ij}$ gets, the larger it grows; at $p_{ij} = 0$ it is infinite. A single confidently wrong prediction would thus make the whole log loss infinite, which is clearly unreasonable, so very small values are clipped:

$$p_{ij} \leftarrow \max\bigl(\min(p_{ij},\, 1 - 10^{-15}),\, 10^{-15}\bigr)$$

In other words, no probability is taken to be smaller than $10^{-15}$.
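A from-scratch version of the clipped metric makes the definition concrete; a minimal sketch that should agree closely with sklearn's log_loss (sklearn additionally renormalizes the rows after clipping):
def multiclass_logloss(y_onehot, p, eps=1e-15):
    # Clip probabilities into [eps, 1 - eps] so log never sees 0
    p = np.clip(p, eps, 1 - eps)
    # Mean over samples of -sum_j y_ij * log(p_ij)
    return -np.mean(np.sum(y_onehot * np.log(p), axis=1))

print(multiclass_logloss(y_test1, y_pre_proba))  # ≈ 0.78, close to the sklearn value above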
Finding the best n_estimators
# Range of n_estimators values to try
tuned_parameters = range(10, 200, 10)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=one_parameter,
                                 max_depth=10,
                                 max_features=10,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.11969649 1.11497311 1.11608578 1.11172577 1.11057536 1.11115966
 1.11000822 1.11048257 1.11173118 1.10990752 1.10799403 1.10837138
 1.10756294 1.10880934 1.10678094 1.1060839  1.10634137 1.10661559
 1.1074386 ]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, n_estimators=175 performs well, so we fix that value.
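These one-parameter-at-a-time sweeps can also be expressed as a single grid search; a sketch using sklearn's GridSearchCV with a deliberately small, illustrative grid (the manual sweeps here cover wider ranges):
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [100, 175],
              "max_features": [10, 15],
              "max_depth": [10, 30]}
gs = GridSearchCV(RandomForestClassifier(min_samples_leaf=10,
                                         random_state=0, n_jobs=-1),
                  param_grid, scoring="neg_log_loss", cv=3)
gs.fit(x_train, y_train)
print(gs.best_params_, -gs.best_score_)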
Finding the best max_features
# Range of max_features values to try
tuned_parameters = range(5, 40, 5)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=10,
                                 max_features=one_parameter,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.20308122 1.1069064  1.07297852 1.06020463 1.05062457 1.05232421
 1.05392152]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_features=15 performs well, so we fix that value.
Finding the best max_depth
# Range of max_depth values to try
tuned_parameters = range(10, 100, 10)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=one_parameter,
                                 max_features=15,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.07297852 0.84671097 0.82480963 0.8257728  0.8256875  0.8256875
 0.8256875  0.8256875  0.8256875 ]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_depth=30 performs well, so we fix that value.
Finding the best min_samples_leaf
# Range of min_samples_leaf values to try
tuned_parameters = range(1, 10, 2)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=30,
                                 max_features=15,
                                 min_samples_leaf=one_parameter,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[0.70599298 0.74117382 0.77111756 0.79627966 0.81900835]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("min_sample_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_sample_leaf")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, min_samples_leaf=1 performs well, so we fix that value.
Determining the Final Model
The tuned parameters: n_estimators=175, max_depth=30, max_features=15, min_samples_leaf=1.
rf3 = RandomForestClassifier(n_estimators=175, max_depth=30, max_features=15, min_samples_leaf=1,
oob_score=True, random_state=40, n_jobs=-1)
rf3.fit(x_train, y_train)
RandomForestClassifier(max_depth=30, max_features=15, n_estimators=175, n_jobs=-1, oob_score=True, random_state=40)
rf3.score(x_test, y_test)
0.7782896631154621
rf3.oob_score_
0.7710973502304147
y_pre_proba1 = rf3.predict_proba(x_test)
log_loss(y_test, y_pre_proba1)
0.7013205968285336
Generating the Submission File
test_data = pd.read_csv("./data/otto/test.csv")
test_data.head()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 11 | 1 | 20 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 2 | 2 | 14 | 16 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 2 | 0 |
2 | 3 | 0 | 1 | 12 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 |
3 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 5 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 2 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 |
5 rows × 94 columns
test_data_drop_id = test_data.drop(["id"], axis=1)
test_data_drop_id.head()
feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | feat_10 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | ... | 0 | 0 | 11 | 1 | 20 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 2 | 14 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 2 | 0 |
2 | 0 | 1 | 12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 2 | 0 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 |
5 rows × 93 columns
y_pre_test = rf3.predict_proba(test_data_drop_id)
y_pre_test
array([[1.71428571e-02, 5.32533417e-02, 8.00000000e-02, ...,
        5.14285714e-02, 0.00000000e+00, 1.14285714e-02],
       [1.08571429e-01, 7.42857143e-02, 1.14285714e-02, ...,
        2.28571429e-02, 2.45714286e-01, 1.71428571e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.14285714e-02, 5.71428571e-03, 0.00000000e+00],
       ...,
       [1.14285714e-02, 3.23500847e-01, 3.48548842e-01, ...,
        4.57142857e-02, 5.71428571e-03, 0.00000000e+00],
       [1.14560440e-02, 2.08943705e-01, 1.44017533e-01, ...,
        1.17582418e-02, 2.74725275e-05, 1.71703297e-02],
       [9.10536850e-03, 1.95349907e-01, 2.94229924e-01, ...,
        2.11763951e-01, 3.03030303e-04, 8.00014117e-03]])
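The columns of y_pre_test follow rf3.classes_, i.e. the integer codes 0–8 in order, which map back to Class_1 … Class_9 because LabelEncoder sorted the original strings. A quick check with the le fitted earlier, before naming the submission columns:
# Column k of predict_proba corresponds to le code k, i.e. Class_{k+1}
print(le.inverse_transform(rf3.classes_))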
result_data = pd.DataFrame(y_pre_test, columns=["Class_"+str(i) for i in range(1, 10)])
result_data.head()
Class_1 | Class_2 | Class_3 | Class_4 | Class_5 | Class_6 | Class_7 | Class_8 | Class_9 | |
---|---|---|---|---|---|---|---|---|---|
0 | 0.017143 | 0.053253 | 0.080000 | 0.781032 | 0.000000 | 0.005714 | 0.051429 | 0.000000 | 0.011429 |
1 | 0.108571 | 0.074286 | 0.011429 | 0.085714 | 0.017143 | 0.417143 | 0.022857 | 0.245714 | 0.017143 |
2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.982857 | 0.011429 | 0.005714 | 0.000000 |
3 | 0.045714 | 0.339048 | 0.280000 | 0.163810 | 0.000000 | 0.005714 | 0.017143 | 0.017143 | 0.131429 |
4 | 0.223810 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011429 | 0.028571 | 0.253333 | 0.482857 |
result_data.insert(loc=0, column="id", value=test_data.id)
result_data.head()
id | Class_1 | Class_2 | Class_3 | Class_4 | Class_5 | Class_6 | Class_7 | Class_8 | Class_9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.017143 | 0.053253 | 0.080000 | 0.781032 | 0.000000 | 0.005714 | 0.051429 | 0.000000 | 0.011429 |
1 | 2 | 0.108571 | 0.074286 | 0.011429 | 0.085714 | 0.017143 | 0.417143 | 0.022857 | 0.245714 | 0.017143 |
2 | 3 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.982857 | 0.011429 | 0.005714 | 0.000000 |
3 | 4 | 0.045714 | 0.339048 | 0.280000 | 0.163810 | 0.000000 | 0.005714 | 0.017143 | 0.017143 | 0.131429 |
4 | 5 | 0.223810 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011429 | 0.028571 | 0.253333 | 0.482857 |
result_data.to_csv("./data/otto/submission.csv", index=False)
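A final sanity check before uploading: the row count should match the test set and every probability row should sum to 1; a minimal sketch:
# predict_proba rows sum to 1 up to floating-point rounding
assert result_data.shape[0] == test_data.shape[0]
assert np.allclose(result_data.iloc[:, 1:].sum(axis=1), 1.0)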