【机器学习】6.2.决策树随机森林泰坦尼克数据集

最新推荐文章于 2023-12-20 14:54:29 发布

LouHerGetUp

最新推荐文章于 2023-12-20 14:54:29 发布

阅读量903

点赞数 25

分类专栏：机器学习文章标签：机器学习决策树随机森林

本文链接：https://blog.csdn.net/CSDNLHCC/article/details/134872154

版权

机器学习专栏收录该内容

48 篇文章 0 订阅

订阅专栏

包含全部示例的代码仓库见GitHub

1 导入库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

2 数据准备

data = pd.read_csv('./dataset/tt/train.csv')
data.columns
# output
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Keep only the label column and the feature columns used below.
# The explicit .copy() makes `data` an independent DataFrame, so the later
# column assignments (Age, Sex, p1..p3, e1..e3) do not trigger pandas'
# SettingWithCopyWarning for writing into a slice of the original frame.
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()
data.head()
# output
Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	0	3	male	22.0	1	0	7.2500	S
1	1	1	female	38.0	1	0	71.2833	C
2	1	3	female	26.0	0	0	7.9250	S
3	1	1	female	35.0	1	0	53.1000	S
4	0	3	male	35.0	0	0	8.0500	S

# Replace missing ages with the mean of the Age column.
data['Age'] = data['Age'].fillna(data['Age'].mean())
# Fill every remaining missing value with 0 (this is why Embarked shows a 0
# among its unique values further down).
data.fillna(0, inplace=True)
# Encode Sex as an integer: 1 for 'male', 0 for any other value.
data['Sex'] = [1 if x=='male' else 0 for x in data.Sex]

data['p1'] = np.array(data['Pclass']==1).astype(np.int32)
data['p2'] = np.array(data['Pclass']==2).astype(np.int32)
data['p3'] = np.array(data['Pclass']==3).astype(np.int32)
del data['Pclass']

data.Embarked.unique()
# output
array(['S', 'C', 'Q', 0], dtype=object)

data['e1'] = np.array(data['Embarked']=='S').astype(np.int32)
data['e2'] = np.array(data['Embarked']=='C').astype(np.int32)
data['e3'] = np.array(data['Embarked']=='Q').astype(np.int32)
del data['Embarked']
data.values.dtype
# output
dtype('float64')

data.head()
# output
  Survived	Sex	Age	SibSp	Parch	Fare	p1	p2	p3	e1	e2	e3
0	0	1	22.0	1	0	7.2500	0	0	1	1	0	0
1	1	0	38.0	1	0	71.2833	1	0	0	0	1	0
2	1	0	26.0	0	0	7.9250	0	0	1	1	0	0
3	1	0	35.0	1	0	53.1000	1	0	0	1	0	0
4	0	1	35.0	0	0	8.0500	0	0	1	1	0	0

X与Y划分

data_train = data[[x for x in data.columns if x != 'Survived']].values
data_target = data['Survived'].values.reshape(len(data),1)
np.shape(data_train), np.shape(data_target)
# output
((891, 11), (891, 1))

训练集与测试集划分

x_train, x_test, y_train, y_test = train_test_split(data_train, data_target,
                                                    test_size=0.2)
x_train.shape, x_test.shape
# output
((712, 11), (179, 11))

3 决策树模型

如果特征为连续值，需要根据需求对数据进行离散化处理

model = DecisionTreeClassifier()
model.fit(x_train, y_train)
model.score(x_test, y_test)
# output
0.8044692737430168

model.score(x_train, y_train)
# output
0.9859550561797753

3.1 控制树的深度处理过拟合

def m_score(depth):
    """Fit a decision tree capped at ``depth`` levels and score it.

    Trains on the notebook-level ``x_train``/``y_train`` split and evaluates
    on both the training and the test split.

    Returns:
        tuple: (score on the training split, score on the test split).
    """
    tree = DecisionTreeClassifier(max_depth=depth).fit(x_train, y_train)
    return tree.score(x_train, y_train), tree.score(x_test, y_test)

depths = range(2,15)
scores = [m_score(depth) for depth in depths]
scores
# output
[(0.7907303370786517, 0.770949720670391),
 (0.8188202247191011, 0.8156424581005587),
 (0.8412921348314607, 0.7988826815642458),
 (0.8539325842696629, 0.8100558659217877),
 (0.8609550561797753, 0.7877094972067039),
 (0.8848314606741573, 0.8044692737430168),
 (0.9030898876404494, 0.7653631284916201),
 (0.9171348314606742, 0.7932960893854749),
 (0.9325842696629213, 0.7877094972067039),
 (0.9480337078651685, 0.776536312849162),
 (0.9662921348314607, 0.8100558659217877),
 (0.9719101123595506, 0.8100558659217877),
 (0.976123595505618, 0.8044692737430168)]

train_s = [s[0] for s in scores]
test_s = [s[1] for s in scores]
plt.plot(train_s)
plt.plot(test_s)

在这里插入图片描述

3.2 设置信息增益阈值处理过拟合

def m_score(depth):
    """Fit a decision tree with an impurity-decrease threshold and score it.

    The parameter keeps the name ``depth`` for compatibility with the earlier
    cell, but here it carries the threshold passed to
    ``min_impurity_decrease``: a node is split only if the split reduces the
    weighted impurity by at least this amount.

    ``min_impurity_split`` was deprecated and then removed in scikit-learn
    1.0, where passing it raises ``TypeError``. ``min_impurity_decrease`` is
    the supported replacement and is the parameter the grid search below
    tunes as well.

    Returns:
        tuple: (score on the training split, score on the test split).
    """
    model = DecisionTreeClassifier(min_impurity_decrease=depth)
    model.fit(x_train, y_train)
    train_score = model.score(x_train, y_train)
    test_score = model.score(x_test, y_test)
    return train_score, test_score
values = np.linspace(0,0.5,50)
scores = [m_score(value) for value in values]
train_s = [s[0] for s in scores]
test_s = [s[1] for s in scores]

best_index = np.argmax(test_s)
best_score = test_s[best_index]
best_value = values[best_index]
best_score, best_value
# output
(0.8156424581005587, 0.01020408163265306)

plt.plot(train_s)
plt.plot(test_s)

在这里插入图片描述

4 交叉验证_决策树

为什么需要交叉验证——解决随机划分的差异和参数选择的问题，让所有的数据都参加到训练和评价当中

values = np.linspace(0, 0.5, 50)
depths = range(2,15)
param_grid = {'max_depth':depths, 'min_impurity_decrease': values}

model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)  # 5折交叉验证
model.fit(data_train, data_target)
# output
GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 15),
                         'min_impurity_decrease': array([0.        , 0.01020408, 0.02040816, 0.03061224, 0.04081633,
       0.05102041, 0.06122449, 0.07142857, 0.08163265, 0.09183673,
       0.10204082, 0.1122449 , 0.12244898, 0.13265306, 0.14285714,
       0.15306122, 0.16326531, 0.17346939, 0.18367347, 0.19387755,
       0.20408163, 0.21428571, 0.2244898 , 0.23469388, 0.24489796,
       0.25510204, 0.26530612, 0.2755102 , 0.28571429, 0.29591837,
       0.30612245, 0.31632653, 0.32653061, 0.33673469, 0.34693878,
       0.35714286, 0.36734694, 0.37755102, 0.3877551 , 0.39795918,
       0.40816327, 0.41836735, 0.42857143, 0.43877551, 0.44897959,
       0.45918367, 0.46938776, 0.47959184, 0.48979592, 0.5       ])})

model.best_params_
# output
{'max_depth': 6, 'min_impurity_decrease': 0.0}

model.best_score_
# output
0.8170736300295023

5 随机森林模型

x_train.shape, x_test.shape
# output
((712, 11), (179, 11))

model = RandomForestClassifier(n_estimators=100, n_jobs=4)
# n_estimators=100: the forest is built from 100 decision trees.
# n_jobs=4: run up to 4 jobs in parallel.
# Other parameters worth tuning: max_features (number of features considered
# when looking for the best split) and min_samples_leaf (minimum number of
# samples required at a leaf node).

模型要求y为一维数组（形状为(n_samples,)），而y_train的形状是(n_samples, 1)，因此用y_train.ravel()将其展平

model.fit(x_train, y_train.ravel())  # 模型要求y必须为1列
model.score(x_test, y_test)
# output
0.8268156424581006

model.feature_importances_  # 权重，每个特征的重要性
# output
array([0.2605452 , 0.26374442, 0.05267658, 0.0382214 , 0.24781589,
       0.02800777, 0.01477589, 0.05457004, 0.01605285, 0.01461969,
       0.00897026])

# Pair each feature name with its importance score and print one per line.
feature_names = ['Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                 'p1', 'p2', 'p3', 'e1', 'e2', 'e3']
for name, importance in zip(feature_names, model.feature_importances_):
    print(name, ': ', importance)
# output
Sex :  0.2605451984578339
Age :  0.26374442405468573
SibSp :  0.05267657561222891
Parch :  0.038221403793938465
Fare :  0.24781589170572155
p1 :  0.0280077665796943
p2 :  0.014775893932383533
p3 :  0.054570038964384275
e1 :  0.016052852543197944
e2 :  0.014619692568660756
e3 :  0.008970261787270525

模型预测

model.predict(x_test[0].reshape(1,-1))
# output
array([0], dtype=int64)

6 交叉验证_随机森林

n_estimators = range(80, 130)
param_grid = {'n_estimators': n_estimators}
model = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
model.fit(data_train, data_target.ravel())
model.best_params_
# output
{'n_estimators': 94}

model.best_score_
# output
0.8193333751804657