例子来源:http://wenku.baidu.com/view/57ccb489d0d233d4b14e695c.html
这里我用python实现了一下,两种方法都有
一种是没有经过任何优化的决策树
还有一种是ID3,C4.5也是在ID3上优化过来的,更复杂一些,有时间我也会实现一下
#-*- coding:utf-8 -*-
'''
Created on Aug 22, 2013
@author: blacklaw
@blog: blog.csdn.com/blacklaw0
@ref: http://wenku.baidu.com/view/57ccb489d0d233d4b14e695c.html
'''
'''
性格 家庭背景 性别 类别(+:成绩好 -:成绩坏)
'''
_DATA = [
['内向', '良', '女', '+'],
['外向', '良', '男', '+'],
['外向', '中', '女', '-'],
['内向', '差', '女', '-'],
['外向', '中', '男', '+'],
['内向', '良', '男', '+'],
['外向', '差', '女', '+'],
['外向', '差', '男', '-'],
['外向', '良', '女', '+'],
['内向', '中', '女', '-'],
['内向', '中', '男', '-'],
['内向', '差', '男', '-']
]
def _getElementSet(data, index):
keySet = set()
for i in range(len(data)):
keySet.add(data[i][index])
return keySet
def testNode(node):
result = ""
while(isinstance(node, dict)):
keys = node.keys()
guide = ""
for i in range(len(keys)):
guide += "%s : %s\t" % (str(i), keys[i])
index = int(raw_input(guide + '\n'))
key = keys[index]
result += key + ' '
node = node[key]
print "You Choose: %s \nClassfy Result:%s " % (result, node)
def showNode(node, left = 0):
    """Write a decision tree to standard output as an indented outline.

    node: a nested dict (branch value -> subtree) or a leaf value.
    left: nesting depth of ``node``; each level is indented by one tab.

    Every branch value starts a new line, indented by ``left`` tabs.  A leaf
    is appended to the current line after a tab.  No trailing newline is
    written.
    """
    # sys.stdout.write behaves the same on Python 2 and 3, unlike the print
    # statement with trailing commas (whose soft-space rule adds stray blanks).
    import sys
    write = sys.stdout.write

    if not isinstance(node, dict):
        write('\t%s' % (node,))
        return
    for key in node:
        write('\n%s%s' % ('\t' * left, key))
        showNode(node[key], left + 1)
import math
def calcEntropySum(data, index, aim):
    """Return the weighted entropy of column ``aim`` after splitting on ``index``.

    data:  list of rows.
    index: column used to partition the rows.
    aim:   column whose entropy is measured inside each partition.

    Rows are grouped by their value in column ``index``; the entropy of each
    group (see ``calcEntropy``) is weighted by the group's share of all rows
    and the weighted values are summed.  Empty ``data`` gives 0.0.
    """
    groups = {}
    for row in data:
        groups.setdefault(row[index], []).append(row)

    total = float(len(data))
    weighted = .0
    for rows in groups.values():
        weighted += calcEntropy(rows, aim) * len(rows) / total
    return weighted
def calcEntropy(data, index):
    """Return the base-2 Shannon entropy of column ``index`` of ``data``.

    data:  list of rows, each row indexable by ``index``.
    index: column whose value distribution is measured.

    Returns a float; 0.0 when ``data`` is empty or the column holds a single
    distinct value.
    """
    counts = {}
    for row in data:
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    total = float(len(data))
    entropy = .0
    for count in counts.values():
        probability = count / total
        entropy -= probability * math.log(probability, 2)
    return entropy
def getMaxGainIndex(data, aim):
    """Return the index of the column with the largest information gain.

    data: non-empty list of rows; the width is taken from ``data[0]``.
    aim:  index of the target column, which is never chosen.

    The gain of a column is the entropy of the target column minus the
    weighted entropy left after splitting on that column.  If no other
    column exists, 0 is returned.
    """
    base_entropy = calcEntropy(data, aim)
    best_gain = 0
    best_index = 0
    for column in range(len(data[0])):
        if column == aim:
            continue
        gain = base_entropy - calcEntropySum(data, column, aim)
        # A running best of exactly 0 counts as "nothing chosen yet", so it
        # is replaced by whatever comes next.  Consequence: when every gain
        # is 0 the last non-target column wins, otherwise the first column
        # reaching the maximum wins.
        if best_gain == 0 or gain > best_gain:
            best_gain = gain
            best_index = column
    return best_index
'''
two core algorithms for constructing the decision tree
'''
def DefaultCore(node, aim):
    """Pick the split column without any optimisation.

    node: the data rows (unused; present so both cores share one signature).
    aim:  index of the target column.

    Returns 0, the first column, unless the target itself is column 0, in
    which case 1 is returned so that the target is never used as a split.
    """
    return 1 if aim == 0 else 0
def ID3Core(node, aim):
    """Pick the split column the ID3 way: the one with the largest gain.

    node: the data rows.
    aim:  index of the target column.

    Delegates to ``getMaxGainIndex`` and returns its column index.
    """
    return getMaxGainIndex(node, aim)
import copy
def data2tree(nodeOrigin, aim=-1, _core = DefaultCore):
    """Build a decision tree (nested dicts) from a table of samples.

    nodeOrigin: non-empty list of rows (lists) of equal width; it is not
                modified.
    aim:        index of the target column; -1 means the last column.
    _core:      callable ``(rows, aim) -> column index`` choosing the column
                to split on at each level.

    Returns a dict mapping each value of the chosen column to a subtree, or,
    once only one column is left, the value held by the first row.  Rows
    that agree on every attribute but disagree on the class are therefore
    resolved in favour of the first one.

    Raises ValueError if ``_core`` selects the target column itself.
    """
    # Work on a private copy: the rows are shortened in place further down.
    elemMatrix = copy.deepcopy(nodeOrigin)
    if aim == -1:
        # default aim element is the last element
        aim = len(elemMatrix[0]) - 1
    if len(elemMatrix[0]) == 1:
        # a row length of 1 means only the class value is left, e.g. [['+']]
        return elemMatrix[0][0]

    # it's the key code of calculating
    gainIndex = _core(elemMatrix, aim)
    if gainIndex == aim:
        raise ValueError("split column must differ from the target column")
    # Removing a column in front of the target shifts the target left by one;
    # removing one behind it leaves the target where it is.
    subAim = aim - 1 if gainIndex < aim else aim

    subTree = {element: []
               for element in _getElementSet(elemMatrix, gainIndex)}
    for line in elemMatrix:
        if isinstance(line, list):
            subTree[line.pop(gainIndex)].append(line)

    for key in list(subTree):
        if isinstance(subTree[key], list):
            # recursive calc sub tree
            subTree[key] = data2tree(subTree[key], subAim, _core = _core)
    return subTree
if __name__ == "__main__":
    # Demo: build and display the tree for _DATA with both split strategies.
    # A parenthesised single-argument print works on Python 2 and Python 3.
    print('calc decision tree without optimize')
    tree = data2tree(_DATA)
    showNode(tree)
    print('\ncalc decision tree by ID3 algorithm')
    tree = data2tree(_DATA, _core = ID3Core)
    showNode(tree)
这里是两种方法的结果
calc decision tree without optimize
内向
    中
        女 -
        男 -
    差
        女 -
        男 -
    良
        女 +
        男 +
外向
    中
        女 -
        男 +
    差
        女 +
        男 -
    良
        女 +
        男 +
calc decision tree by ID3 algorithm
中
    内向
        女 -
        男 -
    外向
        女 -
        男 +
差
    内向
        女 -
        男 -
    外向
        女 +
        男 -
良
    女
        内向 +
        外向 +
    男
        内向 +
        外向 +
ID3会根据信息增益调整路径,结果很明显
参考这篇:http://wenku.baidu.com/view/7933c2f6f61fb7360b4c65fd.html
我又整理了一份数据,感兴趣的同学可以试一下
'''
年龄 收入 学生 信用等级 类别(购买电脑)
'''
_DATA2 = [
['<=30', '高', '否', '一般', '不会'],
['<=30', '高', '否', '良好', '不会'],
['31~40', '高', '否', '一般', '会'],
['>40', '中等', '否', '一般', '会'],
['>40', '低', '是', '一般', '会'],
['>40', '低', '是', '良好', '不会'],
['31~40', '低', '是', '良好', '会'],
['<=30', '中等', '否', '一般', '不会'],
['<=30', '低', '是', '一般', '会'],
['>40', '中等', '是', '一般', '会'],
['<=30', '中等', '是', '良好', '会'],
['31~40', '中等', '否', '良好', '会'],
['31~40', '高', '是', '一般', '会'],
['>40', '中等', '否', '良好', '不会'],
]