机器学习-简单了解决策树的逻辑过程

机器学习-决策树

西瓜案例

数据集

编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
16,浅白,蜷缩,浊响,稍糊,平坦,硬滑,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否

代码


import numpy as np
import pandas as pd

# Decision-tree classifier API
from sklearn.tree import DecisionTreeClassifier

# Load the watermelon data set (17 labelled samples, all-categorical features).
data = pd.read_csv('./data/data_watermelon.csv')

# Keep only the feature columns plus the label, dropping the id column (编号).
data = data.loc[:, '色泽':'好瓜']

# Encode the categorical strings as integer codes
def replace_data(data):
    """Encode every categorical string in the watermelon frame as an int.

    A single dict-based replace() is equivalent to the original chain of
    eighteen replace() calls — pandas matches whole cell values, not
    substrings — but makes one pass and keeps the whole code table in
    one place.

    :param data: DataFrame holding the Chinese categorical columns
    :return: a new DataFrame of the same shape with all values numeric
    """
    mapping = {
        # 色泽 (colour)
        '青绿': 1, '乌黑': 2, '浅白': 3,
        # 根蒂 (stem)
        '蜷缩': 4, '稍蜷': 5, '硬挺': 6,
        # 敲声 (knock sound)
        '浊响': 7, '沉闷': 8, '清脆': 9,
        # 纹理 (texture)
        '清晰': 10, '稍糊': 11, '模糊': 12,
        # 脐部 (navel)
        '凹陷': 13, '稍凹': 14, '平坦': 15,
        # 触感 (touch)
        '硬滑': 16, '软粘': 17,
        # 好瓜 label: 是 (good) -> 0, 否 (bad) -> 1
        '是': 0, '否': 1,
    }
    return data.replace(mapping)

# Encode all categorical strings as integers (see replace_data above).
data = replace_data(data)

# Features: columns 色泽 … 触感; label: 好瓜 (encoded 0 = yes, 1 = no).
X = data.loc[:, '色泽':'触感']
Y = data.loc[:, '好瓜':]


# Earlier experiment, kept for reference: sweep max_depth 1..9 and
# compare training accuracy at each depth.
# for i in range(1, 10):
#     # 创建树
#     decision_tree = DecisionTreeClassifier(criterion='entropy', max_depth=i)
#
#     # 拟合
#     decision_tree.fit(X, Y)
#
#     # 预测
#     # decision_tree.predict()
#
#     # 评估-  准确率
#     print('树的准确率', i, decision_tree.score(X, Y))
#     print('预测', decision_tree.predict([[1,2,1,2,3,2]]))

# Fit an unrestricted entropy-criterion tree on the whole data set.
decision_tree = DecisionTreeClassifier(criterion='entropy')
decision_tree.fit(X,Y)

from sklearn import tree
import pydotplus

# Export the fitted tree as DOT source (out_file=None returns the string).
# class_names are listed in ascending class order: 是->0 is '好瓜', 否->1 is '坏瓜'.
doc = tree.export_graphviz(
    decision_tree=decision_tree,
    out_file=None,
    feature_names=['色泽','根蒂','敲声','纹理','脐部','触感'],
    class_names=['好瓜','坏瓜'],
    filled=True,rounded=True,
    special_characters=True
)

# Render the DOT source to SVG and PNG files.
graph = pydotplus.graph_from_dot_data(doc)

graph.write_svg('xx.svg')
graph.write_png('xx.png')

在这里插入图片描述

鸢尾花数据分类

数据 只摘选出一部分

5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
5.4,3.9,1.3,0.4,Iris-setosa
5.1,3.5,1.4,0.3,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa
4.4,3.0,1.3,0.2,Iris-setosa
5.1,3.4,1.5,0.2,Iris-setosa
5.0,3.5,1.3,0.3,Iris-setosa
5.0,3.3,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor
5.9,3.2,4.8,1.8,Iris-versicolor
5.5,2.5,4.0,1.3,Iris-versicolor
5.5,2.6,4.4,1.2,Iris-versicolor
6.1,3.0,4.6,1.4,Iris-versicolor
5.8,2.6,4.0,1.2,Iris-versicolor
5.0,2.3,3.3,1.0,Iris-versicolor
5.6,2.7,4.2,1.3,Iris-versicolor
5.7,3.0,4.2,1.2,Iris-versicolor
5.7,2.9,4.2,1.3,Iris-versicolor
6.2,2.9,4.3,1.3,Iris-versicolor
5.1,2.5,3.0,1.1,Iris-versicolor
5.7,2.8,4.1,1.3,Iris-versicolor
6.3,3.3,6.0,2.5,Iris-virginica
5.8,2.7,5.1,1.9,Iris-virginica
6.0,2.2,5.0,1.5,Iris-virginica
6.9,3.2,5.7,2.3,Iris-virginica
5.6,2.8,4.9,2.0,Iris-virginica
7.7,2.8,6.7,2.0,Iris-virginica
6.3,2.7,4.9,1.8,Iris-virginica
6.7,3.3,5.7,2.1,Iris-virginica
7.2,3.2,6.0,1.8,Iris-virginica
6.2,2.8,4.8,1.8,Iris-virginica
6.1,3.0,4.9,1.8,Iris-virginica
6.2,3.4,5.4,2.3,Iris-virginica
5.9,3.0,5.1,1.8,Iris-virginica

代码

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Column names for the headerless iris data file.
names = ['length', 'width', 'length2', 'width2', 'cla']
data = pd.read_csv('./data/鸢尾花数据分类/iris.data', names=names)

# Encode the three species as classes 0 / 1 / 2.
data = data.replace('Iris-setosa', 0)
data = data.replace('Iris-versicolor', 1)
data = data.replace('Iris-virginica', 2)

# iloc indexes by position, loc by label.
X = data.loc[:, 'length':'width2']
Y = data.iloc[:, -1:]
# Train/test split.
# NOTE(review): test_size=0.8 leaves only 20% of the rows for TRAINING —
# unusually small; confirm this is intentional (0.2 is the common choice).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.8, random_state=28)

for i in range(1, 10):
    # one tree per candidate depth
    decision_tree = DecisionTreeClassifier(criterion='entropy',max_depth=i)
    decision_tree.fit(x_train, y_train)
    print('------------')
    print('训练集合上的分数', decision_tree.score(x_train, y_train))
    print('预测集合上的分数', decision_tree.score(x_test, y_test))
    print('预测', decision_tree.predict(x_test))

# todo visualisation needs the graphviz and pydotplus packages, plus the
#  Graphviz application from http://www.graphviz.org/download/ with its
#  bin directory added to the PATH environment variable

from sklearn import tree
import pydotplus

# Option 1: write a .dot file, then convert it on the command line.
# # 生成dot文件
# with open('./tree.dot', 'w') as write:
#     tree.export_graphviz(decision_tree=decision_tree,out_file=write)

# Option 2: export DOT source directly.
# NOTE(review): this exports the tree from the LAST loop iteration (max_depth=9).
dot = tree.export_graphviz(
    # fitted model
    decision_tree=decision_tree,
    # return the DOT source instead of writing a file
    out_file=None,
    # feature names
    feature_names=['length','width','length2','width2'],
    # class names, in ascending class order 0/1/2
    class_names=['Iris-setosa','Iris-versicolor','Iris-virginica'],
    # fill colours, rounded boxes
    filled=True,rounded=True,
    # escape special characters for DOT
    special_characters=True)

# build the graph object and render it as SVG
graph = pydotplus.graph_from_dot_data(dot)

graph.write_svg('tree.svg')

在这里插入图片描述

根据西瓜数据集划分决策树

数据集

编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
16,浅白,蜷缩,浊响,稍糊,平坦,硬滑,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否

代码


# Hand-written ID3 decision-tree builder for the watermelon data set.

import numpy as np
import pandas as pd

# Load the raw (still categorical) watermelon data.
data = pd.read_csv('./data/data_watermelon.csv')
# Split into features (色泽 … 触感) and the label column (好瓜).
X = data.loc[:, '色泽':'触感']
Y = data.loc[:, '好瓜':]

# Build an ID3 tree
def create_tree(X, Y, Y_index, data, tree):
    '''
    Recursively grow an ID3 decision tree as nested dicts.

    :param X: feature columns of the current data subset
    :param Y: one-column DataFrame holding the labels of the subset
    :param Y_index: label column name (here '好瓜')
    :param data: features + label rows of the subset, used for filtering
    :param tree: dict the subtree is written into; also the return value
    :return: nested dict {best_feature: {feature_value: label or subtree}}
    '''

    # Best split found so far: [column name, information gain]
    max_gain = [-1, -1]
    # Score every feature column by its information gain w.r.t. Y
    for item in X:
        # information gain of this single column
        gain = get_information_gain(X[item], Y, Y_index, data)
        # remember the best-scoring column
        if gain > max_gain[1]:
            max_gain[0] = item
            max_gain[1] = gain
        pass
    # Split on the feature with the largest gain
    tree[max_gain[0]] = {} # e.g. tree['纹理'] = {}
    # Partition the data by each distinct value of the chosen feature.
    # Note: index is the feature value, item its count — item is
    # immediately reused below as the child-subtree dict.
    for index, item in data[max_gain[0]].value_counts().items():
        item = {}
        # Labels of the rows having this feature value.
        # dropna() removes rows filtered out by where(); [Y_index] selects the label column.
        target_data = data.where(data[max_gain[0]] == index).dropna()[Y_index]
        # If only one label value remains, this branch is a leaf
        if len(target_data.value_counts()) == 1:
            # the leaf's class label
            print(target_data.iloc[0])
            tree[max_gain[0]][index] = target_data.iloc[0]
            pass
        # Otherwise recurse on the remaining rows
        else:
            data_next = data.where(data[max_gain[0]] == index).dropna()
            x_next = data_next.loc[:, "色泽":"触感"]
            y_next = data_next.loc[:, "好瓜":]
            # recursive call builds the subtree into `item`
            tree[max_gain[0]][index] = create_tree(x_next, y_next, Y_index='好瓜', data=data_next, tree=item)
            pass
        pass
    return tree

# Information gain of one feature column
def get_information_gain(X, Y, Y_index, data):
    """Return H(Y) - H(Y|X): the entropy drop achieved by splitting on X.

    :param X: a single feature column (Series)
    :param Y: one-column DataFrame of labels
    :param Y_index: name of the label column
    :param data: full rows used to estimate conditional probabilities
    :return: the information gain (float)
    """
    # gain = label entropy minus conditional entropy of the labels given X
    information_gain = get_entropy(Y, Y_index) - get_conditional_entropy(X, Y, Y_index, data)
    # debug trace kept from the original: feature name and its gain
    print(X.name, information_gain)
    return information_gain

def get_entropy(Y, Y_index):
    """Shannon entropy (base 2) of the label column Y[Y_index].

    value_counts() yields the frequency of every distinct class, so this
    is simply a sum over classes of p * log2(1/p).
    """
    total = len(Y)
    class_counts = Y[Y_index].value_counts()
    return sum((count / total) * np.log2((count / total) ** (-1))
               for count in class_counts)

# Conditional entropy H(Y | X)
def get_conditional_entropy(X, Y, Y_index, data):
    """Weighted average label entropy over the groups induced by feature X.

    For every distinct value v of X this accumulates
    P(X=v) * H(Y | X=v), where the inner entropy is estimated with
    get_conditional_prob on the full subset `data`.
    """
    total = len(X)
    # distinct label values (with their overall counts, which are unused:
    # only the label values themselves matter for the inner loop)
    label_classes = list(Y[Y_index].value_counts().items())

    conditional_entropy = 0
    for feature_value, value_count in X.value_counts().items():
        # P(X = feature_value)
        weight = value_count / total
        inner_entropy = 0
        for label_value, _ in label_classes:
            # P(Y = label_value | X = feature_value)
            p = get_conditional_prob(feature_value, X.name, label_value, Y_index, data)
            # a zero probability contributes nothing (and log2(0) is undefined)
            if p != 0:
                inner_entropy += p * np.log2(p ** (-1))
        conditional_entropy += weight * inner_entropy
    return conditional_entropy

# Conditional probability P(Y = target_y | X = condition)
def get_conditional_prob(condition, column_name, target_y, y_index, data):
    '''
    :param condition: feature value to condition on (e.g. 青绿 / 乌黑 / 浅白)
    :param column_name: name of the feature column (e.g. 色泽)
    :param target_y: label value whose probability we want (e.g. 是 / 否)
    :param y_index: name of the label column (e.g. 好瓜)
    :param data: the full data subset
    :return: empirical P(Y = target_y | X = condition)
    '''
    # Boolean mask of the rows satisfying the condition.
    condition_mask = (data[column_name] == condition)
    # Denominator: number of rows with X == condition (True -> 1, False -> 0).
    denominator = np.sum(condition_mask.astype('int'))
    # Numerator: rows that also carry the target label.
    numerator = np.sum((condition_mask & (data[y_index] == target_y)).astype('int'))
    return numerator / denominator

# Grow the tree from the full data set and print the resulting nested dict.
print(create_tree(X, Y, Y_index='好瓜', data=data, tree={}))

运行结果

{'纹理': {
	'清晰': {
		'根蒂': {
			'蜷缩': '是', 
			'稍蜷': {
				'色泽': {
					'乌黑': {
						'触感': {
							'软粘': '否', 
							'硬滑': '是'
								}
							}, 
					'青绿': '是'
						}
					}, 
			'硬挺': '否'
				}
			}, 
	'稍糊': {
		'触感': {
			'硬滑': '否', 
			'软粘': '是'
				}
			}, 
	'模糊': '否'
		}
}

使用sklearn构建Cart回归树

树的划分方式:ID3(信息增益,有几个分类就能分为几叉树) CART(回归树用方差、分类树用基尼指数,二叉树) C4.5(信息增益率)

from sklearn.tree import DecisionTreeRegressor

import numpy as np
import pandas as pd

# Training features: a single numeric column, samples 1..10.
X = [[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]]
# Training targets.
Y = [[5.56],[5.7],[5.91],[6.4],[6.8],[7.05],[8.9],[8.7],[9],[9.05]]

# CART regression tree; max_depth would limit the depth but is left
# unrestricted here, so the tree fits the 10 points exactly.
decision_tree_reg = DecisionTreeRegressor()

# Fit and report R^2 on the training data itself.
decision_tree_reg.fit(X,Y)
print(decision_tree_reg.score(X,Y))

# Predict a new point; prints [8.7]
print(decision_tree_reg.predict([[7.8]]))# outputs [8.7]

from sklearn import tree

import pydotplus

# Export the fitted tree to DOT source and render it.
# NOTE(review): class_names is meant for classifiers; for a regressor it
# presumably has no effect — confirm against the sklearn version in use.
dot = tree.export_graphviz(
    decision_tree=decision_tree_reg,
    out_file=None,
    feature_names=['X'],
    class_names=['Y'],
    filled=True,rounded=True,
    special_characters=True
)

graph = pydotplus.graph_from_dot_data(dot)

graph.write_svg('./data/图.svg')

在这里插入图片描述

Cart分类树

数据集

id,年龄,工作,房子,信贷,类别
1,青年,否,否,一般,否
2,青年,否,否,好,否
3,青年,是,否,好,是
4,青年,是,是,一般,是
5,青年,否,否,一般,否
6,中年,否,否,一般,否
7,中年,否,否,好,否
8,中年,是,是,好,是
9,中年,否,是,非常好,是
10,中年,否,是,非常好,是
11,老年,否,是,非常好,是
12,老年,否,是,好,是
13,老年,是,否,好,是
14,老年,是,否,非常好,是
15,老年,否,否,一般,否
代码

from sklearn.tree import DecisionTreeClassifier

import numpy as np
import pandas as pd

# Load the loan data set (age / job / house / credit -> approved?).
data = pd.read_csv('./data/data.csv')

# Encode 年龄 (age group).
data = data.replace('青年', 1)
data = data.replace('中年', 2)
data = data.replace('老年', 3)

# Encode 信贷 (credit rating).  pandas replace() matches whole cell
# values, so '好' does not clobber '非常好'.
data = data.replace('一般', 1)
data = data.replace('好', 2)
data = data.replace('非常好', 3)

# Encode 是/否 (yes/no) — used by 工作, 房子 and the 类别 label.
# Note the label coding: 是 -> 1, 否 -> 2 (not 0/1).
data = data.replace('是', 1)
data = data.replace('否', 2)

# Feature columns.
X = data.loc[:,'年龄':'信贷']

# Label column.
Y = data.loc[:,'类别':'类别']

# CART classifier (gini criterion by default).
decision_tree_cls = DecisionTreeClassifier()

# Fit on the full data set.
decision_tree_cls.fit(X,Y)

# Accuracy on the training data itself.
print(decision_tree_cls.score(X,Y))

# predict

# Visualise the fitted model.
from sklearn import tree
import pydotplus

dot = tree.export_graphviz(
    decision_tree=decision_tree_cls,
    out_file=None,
    filled=True,rounded=True
)
graph = pydotplus.graph_from_dot_data(dot)

graph.write_svg('./data/cart.svg')

在这里插入图片描述

案例-使用决策树来预测lol胜负

代码


from sklearn.tree import DecisionTreeClassifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Load ranked-game statistics (first 10 minutes of each match).
data = pd.read_csv('./data/high_diamond_ranked_10min.csv')
# Features: every column from blueWardsPlaced through redGoldPerMin.
X = data.loc[:,'blueWardsPlaced':'redGoldPerMin']

# Target: blueWins (1 = blue team won, 0 = blue team lost).
Y = data.loc[:,"blueWins":'blueWins']

# Hold out 20% of the rows for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=28, test_size=0.2)


# Shallow tree (max_depth=3) keeps the model interpretable; entropy criterion.
decision_tree = DecisionTreeClassifier(max_depth=3,criterion="entropy")

# Fit on the training split.
decision_tree.fit(x_train,y_train)

# Accuracy on the held-out split.
print('树最大3层:',decision_tree.score(x_test,y_test))

# Visualise the fitted tree.
from sklearn import tree

import pydotplus

dot = tree.export_graphviz(
    decision_tree=decision_tree,
    out_file=None,
    filled=True,rounded=True,
    # Derive the feature names from X itself instead of a hand-maintained
    # 38-entry list, so they cannot drift out of sync with the columns.
    feature_names=list(X.columns),
    # class_names must be given in ascending class order:
    # class 0 (blueWins=0, red team won) -> 'redwin',
    # class 1 (blueWins=1, blue team won) -> 'bluewin'.
    # The original order ['bluewin','redwin'] labelled the classes backwards.
    class_names=['redwin','bluewin'],
    special_characters=True
)

graph = pydotplus.graph_from_dot_data(dot)

graph.write_svg('./data/lol.svg')

在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值