随机森林实现otto案例

otto案例-随机森林

Otto 是一家大型电子商务公司,本案例的目标是对该公司的产品进行分类。

获取数据集的链接:https://www.kaggle.com/c/otto-group-product-classification-challenge/overview

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset.
data = pd.read_csv("文件所在路径.csv")

# Visualise how often each target value occurs.
# The column is passed by keyword: newer seaborn releases interpret a bare
# positional argument as `data=` rather than as the variable to count.
sns.countplot(x=data.target)
plt.show()

# Take a subset of the data via random under-sampling (fixed seed).
x = data.drop(["id", "target"], axis=1)
y = data["target"]
rus = RandomUnderSampler(random_state=0)
x_resampled, y_resampled = rus.fit_resample(x, y)

# Convert the label values to integer codes.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# Split the data, holding out 20% for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2
)

# Model training: baseline forest with out-of-bag scoring enabled.
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)

y_pre = rf.predict(x_test)
print(y_pre)
print(rf.score(x_test, y_test))
print(rf.oob_score_)

sns.countplot(x=y_pre)
plt.show()

# Log-loss evaluation on the hard predictions.
# The encoder is fitted ONCE, on the classes the forest learned, and then only
# used to transform. Refitting it on the predictions would drop the column of
# any class that was never predicted, so the two one-hot matrices would no
# longer line up (or even have the same width).
# `.toarray()` replaces the `sparse=False` constructor flag, which newer
# scikit-learn releases no longer accept.
one_hot = OneHotEncoder()
one_hot.fit(rf.classes_.reshape(-1, 1))
y_test1 = one_hot.transform(y_test.reshape(-1, 1)).toarray()
y_pre1 = one_hot.transform(y_pre.reshape(-1, 1)).toarray()
# `eps` / `normalize=True` are left at the library defaults; `eps` is rejected
# by newer scikit-learn releases. The score is printed so that it is visible
# when the code runs as a script and not only in a notebook cell.
print(log_loss(y_test1, y_pre1))

# Switch the predictions to class probabilities, which lowers the log loss
# compared with the 0/1 hard predictions above.
y_pre_proba = rf.predict_proba(x_test)
print(log_loss(y_test1, y_pre_proba))

模型调优

确定最优的n_estimators
# Continuation of the random-forest code above.
# Model tuning: find the best n_estimators.
tuned_parameters = range(10, 200, 10)
# Out-of-bag accuracy, one slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# Log loss on the held-out split, one slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Tuning loop (indented with spaces only; the mixed tabs/spaces of the
# original raise TabError in Python 3).
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=one_parameter,
        max_depth=10,
        max_features=10,
        min_samples_leaf=10,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record and show the accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and show the log loss collected so far.
    # The original passed `eps=le-15`, i.e. the LabelEncoder object `le`
    # minus 15, which raises TypeError; the library's default clipping is used
    # instead. `labels` ties the probability columns to the fitted classes
    # even if the test split happens to miss one of them.
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, labels=rf2.classes_)
    print(error_t)

# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)

# Each subplot gets its own x and y label (the original labelled the x axis of
# the first plot twice and the y axis of the second plot twice).
axes[0].set_xlabel("n_estimators")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("n_estimators")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()
确定最优的max_features
# Continuation of the random-forest code above.
# Find the best max_features.
tuned_parameters = range(5, 40, 5)
# Out-of-bag accuracy, one slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# Log loss on the held-out split, one slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Tuning loop (indented with spaces only; the mixed tabs/spaces of the
# original raise TabError in Python 3).
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=175,
        max_depth=10,
        max_features=one_parameter,
        min_samples_leaf=10,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record and show the accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and show the log loss collected so far.
    # The original passed `eps=le-15`, i.e. the LabelEncoder object `le`
    # minus 15, which raises TypeError; the library's default clipping is used
    # instead. `labels` ties the probability columns to the fitted classes
    # even if the test split happens to miss one of them.
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, labels=rf2.classes_)
    print(error_t)

# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)

# Each subplot gets its own x and y label (the original labelled the x axis of
# the first plot twice and the y axis of the second plot twice).
axes[0].set_xlabel("max_features")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_features")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()
确定最优的max_depth
# Continuation of the random-forest code above.
# Find the best max_depth.
tuned_parameters = range(10, 100, 10)
# Out-of-bag accuracy, one slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# Log loss on the held-out split, one slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Tuning loop (indented with spaces only; the mixed tabs/spaces of the
# original raise TabError in Python 3).
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=175,
        max_depth=one_parameter,
        max_features=15,
        min_samples_leaf=10,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record and show the accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and show the log loss collected so far.
    # The original passed `eps=le-15`, i.e. the LabelEncoder object `le`
    # minus 15, which raises TypeError; the library's default clipping is used
    # instead. `labels` ties the probability columns to the fitted classes
    # even if the test split happens to miss one of them.
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, labels=rf2.classes_)
    print(error_t)

# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)

# Each subplot gets its own x and y label (the original labelled the x axis of
# the first plot twice and the y axis of the second plot twice).
axes[0].set_xlabel("max_depth")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("max_depth")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()
确定最优的min_samples_leaf
# Continuation of the random-forest code above.
# Find the best min_samples_leaf.
tuned_parameters = range(1, 10, 2)
# Out-of-bag accuracy, one slot per candidate value.
accuracy_t = np.zeros(len(tuned_parameters))
# Log loss on the held-out split, one slot per candidate value.
error_t = np.zeros(len(tuned_parameters))
# Tuning loop (indented with spaces only; the mixed tabs/spaces of the
# original raise TabError in Python 3).
for j, one_parameter in enumerate(tuned_parameters):
    rf2 = RandomForestClassifier(
        n_estimators=175,
        max_depth=30,
        max_features=15,
        # Original author's note: min_samples_leaf=1 gave the best result.
        min_samples_leaf=one_parameter,
        oob_score=True,
        random_state=0,
        n_jobs=-1,
    )
    rf2.fit(x_train, y_train)
    # Record and show the accuracy collected so far.
    accuracy_t[j] = rf2.oob_score_
    print(accuracy_t)
    # Record and show the log loss collected so far.
    # The original passed `eps=le-15`, i.e. the LabelEncoder object `le`
    # minus 15, which raises TypeError; the library's default clipping is used
    # instead. `labels` ties the probability columns to the fitted classes
    # even if the test split happens to miss one of them.
    y_pre = rf2.predict_proba(x_test)
    error_t[j] = log_loss(y_test, y_pre, labels=rf2.classes_)
    print(error_t)

# Visualise the tuning results: log loss on the left, OOB accuracy on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 4), dpi=100)
axes[0].plot(tuned_parameters, error_t)
axes[1].plot(tuned_parameters, accuracy_t)

# Each subplot gets its own x and y label (the original labelled the x axis of
# the first plot twice and the y axis of the second plot twice).
axes[0].set_xlabel("min_samples_leaf")
axes[0].set_ylabel("error_t")
axes[1].set_xlabel("min_samples_leaf")
axes[1].set_ylabel("accuracy_t")

axes[0].grid(True)
axes[1].grid(True)

plt.show()
  • 15
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值