import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
Data Loading
data = pd.read_csv("./data/otto/train.csv")
data.head()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
3 | 4 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | ... | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | Class_1 |
4 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | Class_1 |
5 rows × 95 columns
- id - the product id
- feat_1, feat_2, …, feat_93 - the 93 product features
- target - the class the product belongs to
data.shape
(61878, 95)
data.describe()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 61878.000000 | 61878.00000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | ... | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 | 61878.000000 |
mean | 30939.500000 | 0.38668 | 0.263066 | 0.901467 | 0.779081 | 0.071043 | 0.025696 | 0.193704 | 0.662433 | 1.011296 | ... | 0.070752 | 0.532306 | 1.128576 | 0.393549 | 0.874915 | 0.457772 | 0.812421 | 0.264941 | 0.380119 | 0.126135 |
std | 17862.784315 | 1.52533 | 1.252073 | 2.934818 | 2.788005 | 0.438902 | 0.215333 | 1.030102 | 2.255770 | 3.474822 | ... | 1.151460 | 1.900438 | 2.681554 | 1.575455 | 2.115466 | 1.527385 | 4.597804 | 2.045646 | 0.982385 | 1.201720 |
min | 1.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 15470.250000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 30939.500000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 46408.750000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
max | 61878.000000 | 61.00000 | 51.000000 | 64.000000 | 70.000000 | 19.000000 | 10.000000 | 38.000000 | 76.000000 | 43.000000 | ... | 76.000000 | 55.000000 | 65.000000 | 67.000000 | 30.000000 | 61.000000 | 130.000000 | 52.000000 | 19.000000 | 87.000000 |
8 rows × 94 columns
# Visualize the class distribution
import seaborn as sns
sns.countplot(x=data["target"])
plt.show()
The plot above shows that the classes are imbalanced, which will need to be handled later.
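The imbalance can also be quantified instead of read off the plot; a minimal check on the same data frame:
# Count samples per class; the largest class has several times the samples of the smallest
print(data["target"].value_counts())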
Basic Data Processing
The data has already been anonymized, so no further special treatment is needed.
- Data anonymization: a technique that transforms or modifies sensitive data such as phone numbers and bank card numbers, so that the sensitive data cannot be used directly in untrusted environments.
Taking a Subset of the Data
new1_data = data[:10000]
new1_data.shape
(10000, 95)
# Visualize the class distribution of the slice
import seaborn as sns
sns.countplot(x=new1_data["target"])
plt.show()
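The numbers behind that plot make the problem concrete: the file appears to be ordered by class (the head above is all Class_1), so a head slice covers only the first few classes. A quick check:
# Class counts inside the 10,000-row slice; later classes are missing or underrepresented
print(new1_data["target"].value_counts())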
Slicing the data this way is therefore not workable, so we use random under-sampling to obtain a representative sample instead.
# Random under-sampling
# First separate the feature values and the label values
y = data["target"]
x = data.drop(["id", "target"], axis=1)
x.head()
feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | feat_10 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 1 | 0 | 0 | 1 | 6 | 1 | 5 | 0 | 0 | 1 | ... | 22 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
5 rows × 93 columns
y.head()
0    Class_1
1    Class_1
2    Class_1
3    Class_1
4    Class_1
Name: target, dtype: object
# Under-sample the data
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(x, y)
x.shape, y.shape
((61878, 93), (61878,))
X_resampled.shape, y_resampled.shape
((17361, 93), (17361,))
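The resampled size is no accident: 17361 = 9 × 1929, because RandomUnderSampler shrinks every class down to the size of the smallest one. A quick check (y_resampled comes back as a pandas Series here, as its .head() call below confirms):
# Every class should now have exactly 17361 / 9 = 1929 samples
print(y_resampled.value_counts())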
# Visualize the resampled class distribution
import seaborn as sns
sns.countplot(x=y_resampled)
plt.show()
Converting the labels to numbers
y_resampled.head()
0    Class_1
1    Class_1
2    Class_1
3    Class_1
4    Class_1
Name: target, dtype: object
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)
y_resampled
array([0, 0, 0, ..., 8, 8, 8])
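LabelEncoder assigns codes by the sorted order of the class strings, so Class_1 → 0 through Class_9 → 8; the mapping is recoverable from the fitted encoder:
# classes_[k] is the original label that was encoded as k
print(le.classes_)                   # ['Class_1' 'Class_2' ... 'Class_9']
print(le.inverse_transform([0, 8]))  # ['Class_1' 'Class_9']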
Splitting the Data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)
x_train.shape, y_train.shape
((13888, 93), (13888,))
x_test.shape, y_test.shape
((3473, 93), (3473,))
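Note that the split above is neither stratified nor seeded, so the class balance achieved by under-sampling can drift slightly between train and test, and reruns give different splits. A hedged variant that fixes both:
x_train, x_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2,
    stratify=y_resampled,  # keep all nine classes balanced in both splits
    random_state=0)        # make the split reproducible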
Model Training
Baseline Model
from sklearn.ensemble import RandomForestClassifier  # random forest
rf = RandomForestClassifier(oob_score=True)  # keep an out-of-bag accuracy estimate
rf.fit(x_train, y_train)
RandomForestClassifier(oob_score=True)
y_pre = rf.predict(x_test)
y_pre
array([3, 7, 8, ..., 3, 2, 5])
rf.score(x_test, y_test)
0.7854880506766484
rf.oob_score_
0.7633208525345622
# Visualize the distribution of the predicted classes
import seaborn as sns
sns.countplot(x=y_pre)
plt.show()
# log loss evaluation (this version fails, see below)
# from sklearn.metrics import log_loss
# log_loss(y_test, y_pre, eps=1e-15, normalize=True)
# y_test, y_pre
The call above raises an error because log_loss expects a 2-D array with one column per class rather than 1-D hard labels,
so we convert this multi-class problem's outputs to one-hot form with OneHotEncoder:
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder(sparse=False)
y_test1 = one_hot.fit_transform(y_test.reshape(-1, 1))
y_pre1 = one_hot.transform(y_pre.reshape(-1, 1))  # reuse the fitted encoder so the columns stay aligned
y_test1
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
y_pre1
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
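Because the encoded labels are already the integers 0–8 (and, with this balanced sample, all nine classes occur in y_test), the same one-hot matrices can be built directly with NumPy; an equivalent sketch:
# Row k of the 9x9 identity matrix is the one-hot vector for class k
y_test1_alt = np.eye(9)[y_test]
print((y_test1_alt == y_test1).all())  # True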
# log loss of the hard one-hot predictions
log_loss(y_test1, y_pre1, eps=1e-15, normalize=True)
7.40898025171566
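This 7.41 can be reproduced by hand. Under the clipping rule (see the metric section below), a correct hard prediction contributes ≈ 0 and a wrong one contributes -log(1e-15) ≈ 34.54, so the loss is roughly the error rate times 34.54:
# (1 - accuracy) * 34.54 ≈ 0.2145 * 34.54 ≈ 7.41, matching the score above
print((1 - rf.score(x_test, y_test)) * -np.log(1e-15))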
# Switch to probability outputs; soft per-class probabilities lower the log loss
y_pre_proba = rf.predict_proba(x_test)
y_pre_proba
array([[0.01, 0.16, 0.16, ..., 0.06, 0.  , 0.  ],
       [0.14, 0.  , 0.  , ..., 0.02, 0.81, 0.02],
       [0.1 , 0.01, 0.03, ..., 0.1 , 0.03, 0.5 ],
       ...,
       [0.02, 0.23, 0.3 , ..., 0.09, 0.  , 0.05],
       [0.  , 0.27, 0.48, ..., 0.01, 0.01, 0.08],
       [0.02, 0.01, 0.  , ..., 0.02, 0.02, 0.  ]])
# log loss of the probability predictions
log_loss(y_test1, y_pre_proba, eps=1e-15, normalize=True)
0.7800586622785408
Model Tuning
Parameters to tune: n_estimators, max_features, max_depth, min_samples_leaf
Evaluation Metric
The competition metric is the multiclass log loss:

$$\text{logloss} = -\frac{1}{N}\sum_{i=1}^{N}\sum_{j=1}^{M} y_{ij}\,\log(p_{ij})$$

In this formula,
- i indexes samples and j indexes classes; $p_{ij}$ is the predicted probability that sample i belongs to class j,
- $y_{ij}$ is 1 if sample i actually belongs to class j, and 0 otherwise.
- If every test sample is classified correctly, every relevant $p_{ij}$ is 1, every $\log(p_{ij})$ is 0, and the final log loss is 0.
- If sample 1 actually belongs to class 1 but is assigned probability $p_{ij} = 0.1$, the loss accumulates the term $-\log(0.1)$. This term is positive, and the smaller $p_{ij}$ gets, the larger it grows; at $p_{ij} = 0$ it is infinite. A single confidently wrong prediction would thus make the whole log loss infinite, which is clearly unreasonable, so very small values are clipped:

$$p_{ij} \leftarrow \max\bigl(\min(p_{ij},\, 1 - 10^{-15}),\, 10^{-15}\bigr)$$

In other words, no probability is taken to be smaller than $10^{-15}$.
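A from-scratch version of the clipped metric makes the definition concrete; a minimal sketch that should agree closely with sklearn's log_loss (sklearn additionally renormalizes the rows after clipping):
def multiclass_logloss(y_onehot, p, eps=1e-15):
    # Clip probabilities into [eps, 1 - eps] so log never sees 0
    p = np.clip(p, eps, 1 - eps)
    # Mean over samples of -sum_j y_ij * log(p_ij)
    return -np.mean(np.sum(y_onehot * np.log(p), axis=1))

print(multiclass_logloss(y_test1, y_pre_proba))  # ≈ 0.78, close to the sklearn value above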
Finding the best n_estimators
# Range of n_estimators values to try
tuned_parameters = range(10, 200, 10)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=one_parameter,
                                 max_depth=10,
                                 max_features=10,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.11969649 1.11497311 1.11608578 1.11172577 1.11057536 1.11115966
 1.11000822 1.11048257 1.11173118 1.10990752 1.10799403 1.10837138
 1.10756294 1.10880934 1.10678094 1.1060839  1.10634137 1.10661559
 1.1074386 ]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, n_estimators=175 performs well, so we fix that value.
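These one-parameter-at-a-time sweeps can also be expressed as a single grid search; a sketch using sklearn's GridSearchCV with a deliberately small, illustrative grid (the manual sweeps here cover wider ranges):
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [100, 175],
              "max_features": [10, 15],
              "max_depth": [10, 30]}
gs = GridSearchCV(RandomForestClassifier(min_samples_leaf=10,
                                         random_state=0, n_jobs=-1),
                  param_grid, scoring="neg_log_loss", cv=3)
gs.fit(x_train, y_train)
print(gs.best_params_, -gs.best_score_)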
Finding the best max_features
# Range of max_features values to try
tuned_parameters = range(5, 40, 5)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=10,
                                 max_features=one_parameter,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.20308122 1.1069064  1.07297852 1.06020463 1.05062457 1.05232421
 1.05392152]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_features=15 performs well, so we fix that value.
Finding the best max_depth
# Range of max_depth values to try
tuned_parameters = range(10, 100, 10)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=one_parameter,
                                 max_features=15,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[1.07297852 0.84671097 0.82480963 0.8257728  0.8256875  0.8256875
 0.8256875  0.8256875  0.8256875 ]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, max_depth=30 performs well, so we fix that value.
Finding the best min_samples_leaf
# Range of min_samples_leaf values to try
tuned_parameters = range(1, 10, 2)
# Array to collect the accuracy for each setting
accuracy_t = np.zeros(len(tuned_parameters))
# Array to collect the log loss for each setting
error_t = np.zeros(len(tuned_parameters))
# Tuning loop
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=30,
                                 max_features=15,
                                 min_samples_leaf=one_parameter,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train, y_train)
    # Record the out-of-bag accuracy
    accuracy_t[j] = rf2.oob_score_
    # Record the log loss of the probability predictions
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, eps=1e-15, normalize=True)
print(error_t)
[0.70599298 0.74117382 0.77111756 0.79627966 0.81900835]
# Visualize the tuning results
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)
axes[0].set_xlabel("min_sample_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_sample_leaf")
axes[1].set_ylabel("accuracy_t")
axes[0].grid(True)
axes[1].grid(True)
plt.show()
From the plots, min_samples_leaf=1 performs well, so we fix that value.
Determining the Final Model
The tuned parameters: n_estimators=175, max_depth=30, max_features=15, min_samples_leaf=1.
rf3 = RandomForestClassifier(n_estimators=175, max_depth=30, max_features=15, min_samples_leaf=1,
oob_score=True, random_state=40, n_jobs=-1)
rf3.fit(x_train, y_train)
RandomForestClassifier(max_depth=30, max_features=15, n_estimators=175, n_jobs=-1, oob_score=True, random_state=40)
rf3.score(x_test, y_test)
0.7782896631154621
rf3.oob_score_
0.7710973502304147
y_pre_proba1 = rf3.predict_proba(x_test)
log_loss(y_test, y_pre_proba1)
0.7013205968285336
Generating the Submission File
test_data = pd.read_csv("./data/otto/test.csv")
test_data.head()
id | feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 11 | 1 | 20 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 2 | 2 | 14 | 16 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 2 | 0 |
2 | 3 | 0 | 1 | 12 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 |
3 | 4 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 5 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 2 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 |
5 rows × 94 columns
test_data_drop_id = test_data.drop(["id"], axis=1)
test_data_drop_id.head()
feat_1 | feat_2 | feat_3 | feat_4 | feat_5 | feat_6 | feat_7 | feat_8 | feat_9 | feat_10 | ... | feat_84 | feat_85 | feat_86 | feat_87 | feat_88 | feat_89 | feat_90 | feat_91 | feat_92 | feat_93 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | ... | 0 | 0 | 11 | 1 | 20 | 0 | 0 | 0 | 0 | 0 |
1 | 2 | 2 | 14 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 0 | 2 | 0 |
2 | 0 | 1 | 12 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 1 |
3 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 2 | 0 | 3 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | 0 | 0 |
5 rows × 93 columns
y_pre_test = rf3.predict_proba(test_data_drop_id)
y_pre_test
array([[1.71428571e-02, 5.32533417e-02, 8.00000000e-02, ...,
        5.14285714e-02, 0.00000000e+00, 1.14285714e-02],
       [1.08571429e-01, 7.42857143e-02, 1.14285714e-02, ...,
        2.28571429e-02, 2.45714286e-01, 1.71428571e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.14285714e-02, 5.71428571e-03, 0.00000000e+00],
       ...,
       [1.14285714e-02, 3.23500847e-01, 3.48548842e-01, ...,
        4.57142857e-02, 5.71428571e-03, 0.00000000e+00],
       [1.14560440e-02, 2.08943705e-01, 1.44017533e-01, ...,
        1.17582418e-02, 2.74725275e-05, 1.71703297e-02],
       [9.10536850e-03, 1.95349907e-01, 2.94229924e-01, ...,
        2.11763951e-01, 3.03030303e-04, 8.00014117e-03]])
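The columns of y_pre_test follow rf3.classes_, i.e. the integer codes 0–8 in order, which map back to Class_1 … Class_9 because LabelEncoder sorted the original strings. A quick check with the le fitted earlier, before naming the submission columns:
# Column k of predict_proba corresponds to le code k, i.e. Class_{k+1}
print(le.inverse_transform(rf3.classes_))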
result_data = pd.DataFrame(y_pre_test, columns=["Class_"+str(i) for i in range(1, 10)])
result_data.head()
Class_1 | Class_2 | Class_3 | Class_4 | Class_5 | Class_6 | Class_7 | Class_8 | Class_9 | |
---|---|---|---|---|---|---|---|---|---|
0 | 0.017143 | 0.053253 | 0.080000 | 0.781032 | 0.000000 | 0.005714 | 0.051429 | 0.000000 | 0.011429 |
1 | 0.108571 | 0.074286 | 0.011429 | 0.085714 | 0.017143 | 0.417143 | 0.022857 | 0.245714 | 0.017143 |
2 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.982857 | 0.011429 | 0.005714 | 0.000000 |
3 | 0.045714 | 0.339048 | 0.280000 | 0.163810 | 0.000000 | 0.005714 | 0.017143 | 0.017143 | 0.131429 |
4 | 0.223810 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011429 | 0.028571 | 0.253333 | 0.482857 |
result_data.insert(loc=0, column="id", value=test_data.id)
result_data.head()
id | Class_1 | Class_2 | Class_3 | Class_4 | Class_5 | Class_6 | Class_7 | Class_8 | Class_9 | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.017143 | 0.053253 | 0.080000 | 0.781032 | 0.000000 | 0.005714 | 0.051429 | 0.000000 | 0.011429 |
1 | 2 | 0.108571 | 0.074286 | 0.011429 | 0.085714 | 0.017143 | 0.417143 | 0.022857 | 0.245714 | 0.017143 |
2 | 3 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.982857 | 0.011429 | 0.005714 | 0.000000 |
3 | 4 | 0.045714 | 0.339048 | 0.280000 | 0.163810 | 0.000000 | 0.005714 | 0.017143 | 0.017143 | 0.131429 |
4 | 5 | 0.223810 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.011429 | 0.028571 | 0.253333 | 0.482857 |
result_data.to_csv("./data/otto/submission.csv", index=False)
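A final sanity check before uploading: the row count should match the test set and every probability row should sum to 1; a minimal sketch:
# predict_proba rows sum to 1 up to floating-point rounding
assert result_data.shape[0] == test_data.shape[0]
assert np.allclose(result_data.iloc[:, 1:].sum(axis=1), 1.0)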