随机森林实现otto案例

最新推荐文章于 2024-08-29 20:32:56 发布

忘三日

最新推荐文章于 2024-08-29 20:32:56 发布

阅读量414

点赞数 15

分类专栏：机器学习简记　文章标签：随机森林、机器学习、人工智能

本文链接：https://blog.csdn.net/weixin_51787442/article/details/140209027

版权

机器学习简记专栏收录该内容

14 篇文章 0 订阅

订阅专栏

otto案例-随机森林

Otto 是一家大型电子商务公司，本案例的目标是对该公司旗下的产品进行分类。

获取数据集的链接：https://www.kaggle.com/c/otto-group-product-classification-challenge/overview

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Baseline random-forest pipeline for the Otto product-classification data:
# load -> inspect class balance -> under-sample -> encode labels -> split ->
# train -> evaluate with accuracy, OOB score and log loss.

# Load the dataset (replace the placeholder with the real CSV path).
data = pd.read_csv("文件所在路径.csv")

# Plot the class distribution of the raw data.
# The column is passed as `x=` because recent seaborn releases interpret the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=data["target"])
plt.show()

# Take a class-balanced subset of the data via random under-sampling.
x = data.drop(["id","target"],axis=1)
y = data["target"]
rus = RandomUnderSampler(random_state=0)
x_resampled,y_resampled = rus.fit_resample(x,y)

# Convert the string labels to integer codes 0..n_classes-1.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# Split into train/test sets. Stratifying keeps every class on both sides of
# the split (the log-loss code below relies on that); the fixed seed makes
# the run reproducible.
x_train,x_test,y_train,y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=0,
)

# Train a baseline model with default hyper-parameters.
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train,y_train)

y_pre = rf.predict(x_test)
print(y_pre)
print(rf.score(x_test,y_test))
print(rf.oob_score_)

# Plot the distribution of the predicted classes.
sns.countplot(x=y_pre)
plt.show()

# Log-loss evaluation on hard (one-hot) predictions.
# The category list is fixed up front and the encoder is only *transformed*
# on the predictions, so both matrices always share the same columns even if
# some class is never predicted. `.toarray()` yields a dense matrix without
# depending on the `sparse`/`sparse_output` keyword, whose name differs
# between scikit-learn versions.
one_hot = OneHotEncoder(categories=[np.arange(len(le.classes_))])
y_test1 = one_hot.fit_transform(y_test.reshape(-1,1)).toarray()
y_pre1 = one_hot.transform(y_pre.reshape(-1,1)).toarray()
# `eps` is no longer accepted by current scikit-learn; the library's default
# clipping is used instead. `normalize=True` is already the default.
print(log_loss(y_test1,y_pre1))

# Use predicted class probabilities instead of hard labels; this gives a
# much lower log loss than the one-hot predictions above.
y_pre_proba = rf.predict_proba(x_test)
print(log_loss(y_test1,y_pre_proba))

模型调优

确定最优的n_estimators

# Continuation of the random-forest script above.
# Model tuning: search for the best n_estimators.
tuned_parameters = range(10,200,10)
# One OOB-accuracy slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate and record both metrics.
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=one_parameter,
                                 max_depth=10,
                                 max_features=10,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record and print the out-of-bag accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and print the log loss on the held-out test set.
    # `labels` pins the column order of predict_proba; the old `eps`
    # argument (mistyped as `le-15` before) is dropped because current
    # scikit-learn no longer accepts it.
    y_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_proba,labels=rf2.classes_)
    print(error_t)
# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()

确定最优的max_features

# Continuation of the random-forest script above.
# Model tuning: search for the best max_features.
tuned_parameters = range(5,40,5)
# One OOB-accuracy slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate and record both metrics.
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=10,
                                 max_features=one_parameter,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record and print the out-of-bag accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and print the log loss on the held-out test set.
    # `labels` pins the column order of predict_proba; the old `eps`
    # argument (mistyped as `le-15` before) is dropped because current
    # scikit-learn no longer accepts it.
    y_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_proba,labels=rf2.classes_)
    print(error_t)
# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()

确定最优的max_depth

# Continuation of the random-forest script above.
# Model tuning: search for the best max_depth.
tuned_parameters = range(10,100,10)
# One OOB-accuracy slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate and record both metrics.
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=one_parameter,
                                 max_features=15,
                                 min_samples_leaf=10,
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record and print the out-of-bag accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and print the log loss on the held-out test set.
    # `labels` pins the column order of predict_proba; the old `eps`
    # argument (mistyped as `le-15` before) is dropped because current
    # scikit-learn no longer accepts it.
    y_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_proba,labels=rf2.classes_)
    print(error_t)
# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()

确定最优的min_samples_leaf

# Continuation of the random-forest script above.
# Model tuning: search for the best min_samples_leaf.
tuned_parameters = range(1,10,2)
# One OOB-accuracy slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# One test-set log-loss slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Fit one forest per candidate and record both metrics.
for j,one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(n_estimators=175,
                                 max_depth=30,
                                 max_features=15,
                                 min_samples_leaf=one_parameter,  # author's note: min_samples_leaf=1 was best
                                 oob_score=True,
                                 random_state=0,
                                 n_jobs=-1)
    rf2.fit(x_train,y_train)
    # Record and print the out-of-bag accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and print the log loss on the held-out test set.
    # `labels` pins the column order of predict_proba; the old `eps`
    # argument (mistyped as `le-15` before) is dropped because current
    # scikit-learn no longer accepts it.
    y_proba = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test,y_proba,labels=rf2.classes_)
    print(error_t)
# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig,axes = plt.subplots(nrows=1,ncols=2,figsize=(20,4),dpi=100)
axes[0].plot(tuned_parameters,error_t)
axes[1].plot(tuned_parameters,accuracy_t)

axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()