决策树和随机森林实战

1. 决策树使用与原理

"""Decision tree on the iris dataset: train, evaluate, and inspect node entropy."""
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
import matplotlib.pyplot as plt
# %matplotlib inline  # Jupyter magic; invalid syntax in a plain .py file
from sklearn import tree
from sklearn.model_selection import train_test_split

# Load iris (150 samples, 4 features, 3 classes) and hold out 20% for testing.
# Fixed random_state makes the split — and the entropy numbers below — reproducible.
iris = datasets.load_iris()
X = iris['data']
y = iris['target']
feature_names = iris.feature_names
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1024)

# Typical workflow: data cleaning (the time-consuming part), feature engineering,
# model training, hyper-parameter tuning. sklearn wraps each algorithm behind the
# same simple pattern: construct -> fit -> predict.
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_)

# Root-node entropy of the 120 training samples (class counts 39/42/39):
# H = sum_i p_i * log2(1/p_i) ≈ 1.5840680553754911
39/120*np.log2(120/39) + 42/120*np.log2(120/42) + 39/120*np.log2(120/39)

# Entropy of the right child after the first split (81 samples, counts 42/39):
# ≈ 0.9990102708804813
42/81*np.log2(81/42) + 39/81*np.log2(81/39)

# Visualize only the top of the tree; `value` shows per-class sample counts.
plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, filled=True, feature_names=feature_names, max_depth=1)
# value表示每一类样本数

(插图:max_depth=1 的决策树可视化,value 为各类样本数)

# Continuous attributes are split on a threshold value (see the tree plot above).
X_train

(插图:X_train 训练数据内容)

# Per-feature spread of the training data: a larger standard deviation means the
# values are more dispersed, i.e. easier for a split threshold to separate.
np.std(X_train, axis=0)
# array([0.82300095, 0.42470578, 1.74587112, 0.75016619])
# %%time  # Jupyter cell magic; invalid syntax in a plain .py file
# Cap the depth at 5 (pre-pruning): the tree gets shallower, reducing overfitting.
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5)
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_))

plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, filled=True, feature_names=feature_names)

# %%time
# For a continuous feature the candidate split threshold is the midpoint of two
# adjacent sorted values, e.g. (1.9 + 3.3) / 2 = 2.6 for petal length:
(1.9 + 3.3) / 2
np.sort(X_train[:, 2])

(插图:X_train 第 3 个特征排序后的取值)

from sklearn.metrics import accuracy_score

# Same pre-pruned tree (max_depth=5), but splitting on Gini impurity instead of
# entropy — the tree stays shallow (pruning effect).
clf = DecisionTreeClassifier(max_depth=5, criterion='gini')
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
print(accuracy_score(y_test, y_))

# Plot the fitted tree. Note: tree models need no feature de-scaling,
# normalization, or standardization of the inputs.
fig = plt.figure(figsize=(18, 12))
_ = tree.plot_tree(clf, feature_names=feature_names, filled=True)

(插图:基尼系数决策树可视化)

2.随机森林

"""Random forest vs. a single decision tree on the wine dataset.

A random forest is an ensemble of many decision trees, each built with the
plain decision-tree mechanics shown above; running them together is an
ensemble (bagging) algorithm, and averaging over random splits shows it is
clearly more accurate and more stable than a single tree.
"""
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # Jupyter magic; invalid syntax in a plain .py file
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the wine dataset (178 samples, 13 features, 3 classes).
wine = datasets.load_wine()
X = wine['data']
y = wine['target']

# One random split: fit a forest and a lone tree for a first comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_ = clf.predict(X_test)
accuracy_score(y_test, y_)

dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
dt_clf.score(X_test, y_test)

# Average test accuracy of a single decision tree over 100 random splits;
# dividing each score by 100 inside the loop accumulates the mean directly.
score = 0
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    dt_clf = DecisionTreeClassifier()
    dt_clf.fit(X_train, y_train)
    score += dt_clf.score(X_test, y_test) / 100
print('决策树多次运行准确率:', score)
# 决策树多次运行准确率: ~0.909

# Average test accuracy of a 100-tree random forest over 100 random splits.
score = 0
for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    score += clf.score(X_test, y_test) / 100
print('随机森林多次运行准确率:', score)
# 随机森林多次运行准确率: ~0.981
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值