机器学习-决策树的编写(2):C4.5及CART简单实现决策树并可视化
文章目录
前导文章
改进
1.将手动输入数据改为从excel提取数据;
2.增加了C4.5及CART的决策树实现及可视化;
3.可手动选择构造决策树的方法(ID3,C4.5或CART)。
文件结构
说明
导入xlsx文件
import xlrd
def main():
    """Load the data rows of sheet ``Sheet1`` from ``watermelon.xlsx``.

    The first row of the sheet is treated as the header and skipped; every
    following row is read with ``row_values``. If the workbook has no sheet
    named ``Sheet1`` a message is printed and the function returns early
    instead of continuing with an unbound sheet variable.
    """
    fname = 'watermelon.xlsx'  # relative path
    bk = xlrd.open_workbook(fname)
    try:
        sh = bk.sheet_by_name('Sheet1')
    except xlrd.XLRDError:
        # Without this return the code below would fail on the unbound `sh`.
        print("have not that sheet")
        return
    # Start at row 1: row 0 is the header row.
    row_list = [sh.row_values(i) for i in range(1, sh.nrows)]
关于导入文件失败的报错:raise XLRDError(FILE_FORMAT_DESCRIPTIONS[file_format]+'; not supported')
原因是 xlrd 2.0 及以上版本不再支持 .xlsx 文件;需要执行 pip install pyexcel-xls(实际上是将 xlrd 回退到仍支持 .xlsx 的旧版本)。
代码改动
Comentropy.py 增加代码
def gainRatio(dataSet,placeCon,placeRoot):
    """Return the gain ratio of the attribute stored in column ``placeCon``.

    The result is ``calCondComentropy(dataSet, placeCon, placeRoot)`` (used
    here as the information gain) divided by the attribute's intrinsic value
    ``iv``: the base-2 entropy of the attribute's own value distribution
    over ``dataSet``.

    Args:
        dataSet: sequence of rows, each indexable by column position.
        placeCon: column index of the attribute being evaluated.
        placeRoot: column index passed through to ``calCondComentropy``.

    Returns:
        ``gain / iv`` as a float, or ``0.0`` when ``iv`` is zero (the
        attribute has a single value in ``dataSet``, or ``dataSet`` is empty).
    """
    lenDataSet = len(dataSet)
    gain = calCondComentropy(dataSet, placeCon, placeRoot)

    # Count every attribute value in one pass instead of rescanning the whole
    # data set once per distinct value. Dicts keep insertion order, so the
    # values are still visited in first-occurrence order.
    counts = {}
    for data in dataSet:
        cond = data[placeCon]
        counts[cond] = counts.get(cond, 0) + 1

    iv = 0.0  # penalty term (intrinsic value)
    for number in counts.values():
        p = float(number) / float(lenDataSet)
        iv -= p * log(p, 2)

    # iv == 0 means the attribute cannot split the data; avoid dividing by 0.
    if iv != 0:
        return float(gain) / float(iv)
    return 0.0
def Gini(dataSet,placeCon):
    """Return ``1 - sum(p_v ** 2)`` over the values found in column ``placeCon``.

    ``p_v`` is the fraction of rows in ``dataSet`` whose ``placeCon`` column
    equals value ``v``.

    NOTE(review): only column ``placeCon`` is read; the class label column is
    never consulted. Textbook CART weights the label impurity of each value's
    subset -- confirm that this simplification is intended.

    Args:
        dataSet: sequence of rows, each indexable by column position.
        placeCon: column index of the attribute being evaluated.

    Returns:
        The computed value as a float, or the sentinel ``2.0`` when the
        computed value is exactly 0 (every row has the same value in the
        column). ``cmpGini`` starts its running minimum at 2.0 and compares
        with ``<``, so a column returning the sentinel is never selected.
    """
    lenDataSet = len(dataSet)

    # Count every attribute value in one pass instead of rescanning the whole
    # data set once per distinct value. Dicts keep insertion order, so the
    # values are still visited in first-occurrence order.
    counts = {}
    for data in dataSet:
        cond = data[placeCon]
        counts[cond] = counts.get(cond, 0) + 1

    ans = 1.0
    for number in counts.values():
        p = float(number) / float(lenDataSet)
        ans -= p * p

    # A single-valued column cannot split the data; per the original author,
    # returning 0 here would keep the tree builder iterating at a leaf.
    if ans == 0:
        ans = 2.0
    return ans
def cmpPlus(dataSet,sum,placeRoot):
    """Return the column index with the largest positive gain ratio.

    Columns ``0 .. sum - 1`` are examined, except ``placeRoot`` which is
    skipped. Ties keep the earliest column because the comparison is strict.

    Args:
        dataSet: rows passed through to ``gainRatio``.
        sum: number of candidate columns to examine.
        placeRoot: column index to skip; also forwarded to ``gainRatio``.

    Returns:
        The winning column index, or ``-1`` when no examined column has a
        gain ratio greater than 0.
    """
    cmpNum = 0.0  # best gain ratio seen so far
    ans = -1  # column index of the best gain ratio
    condNum = 0
    while condNum < sum:
        # The counter is advanced on every iteration, including the skipped
        # column; skipping via `continue` before the increment would spin
        # forever once condNum == placeRoot.
        if condNum != placeRoot:
            ratio = gainRatio(dataSet, condNum, placeRoot)  # evaluate once
            if ratio > cmpNum:
                cmpNum = ratio
                ans = condNum
        condNum += 1
    return ans
def cmpGini(dataSet,sum,placeRoot):
    """Return the column index with the smallest ``Gini`` value.

    Columns ``0 .. sum - 1`` are examined, except ``placeRoot`` which is
    skipped. Ties keep the earliest column because the comparison is strict.

    Args:
        dataSet: rows passed through to ``Gini``.
        sum: number of candidate columns to examine.
        placeRoot: column index to skip.

    Returns:
        The winning column index, or ``-1`` when no examined column has a
        ``Gini`` value below the initial bound of 2.0.
    """
    cmpNum = 2.0  # smallest value seen so far
    ans = -1  # column index of the smallest value
    condNum = 0
    while condNum < sum:
        # The counter is advanced on every iteration, including the skipped
        # column; skipping via `continue` before the increment would spin
        # forever once condNum == placeRoot.
        if condNum != placeRoot:
            value = Gini(dataSet, condNum)  # evaluate once
            if value < cmpNum:
                cmpNum = value
                ans = condNum
        condNum += 1
    return ans
CreateTree.py 改动代码
def createWholeTree(self,node,placeRoot,createRole):
    """Recursively grow the subtree rooted at ``node``.

    Args:
        node: node whose ``index`` holds the rows to split and whose
            ``label`` holds the column names; ``name``, ``child`` and
            ``childNum`` are updated in place.
        placeRoot: column index handed to ``judgeSame`` and to the
            attribute-selection helpers; also the column a leaf is named from.
        createRole: 1 selects ``cmp`` (ID3), 2 selects ``cmpPlus`` (C4.5),
            any other value selects ``cmpGini`` (CART).
    """
    # Leaf case: judgeSame() returning 1 presumably means every row agrees on
    # column placeRoot -- verify against its definition.
    if judgeSame(node.index, placeRoot) == 1:
        node.name = node.index[0][placeRoot]
        return

    # Pick the splitting column with the requested algorithm.
    if createRole == 1:
        needNum = cmp(node.index, node.condNum, placeRoot)
    elif createRole == 2:
        needNum = cmpPlus(node.index, node.condNum, placeRoot)
    else:
        needNum = cmpGini(node.index, node.condNum, placeRoot)
    node.name = node.label[needNum]

    # One child per distinct value of the chosen column, created in
    # first-occurrence order and expanded before the next one is created.
    seen = set()
    for row in node.index:
        value = row[needNum]
        if value in seen:
            continue
        seen.add(value)
        subset = [other for other in node.index if other[needNum] == value]
        child = Node(value, node.condNum)
        child.index = subset
        child.label = node.label
        node.child.append(child)
        node.childNum += 1
        self.createWholeTree(child, placeRoot, createRole)
    return
init.py 改动代码
import xlrd
def main():
    """Build a decision tree from ``watermelon.xlsx`` and plot it.

    Reads sheet ``Sheet1``: row 0 supplies the column labels and the
    remaining rows supply the samples. Prints the entropy, the information
    gain of column 0, the tree height and the leaf count, then draws the
    tree with matplotlib. If the sheet is missing, a message is printed and
    the function returns early instead of continuing with an unbound sheet
    variable.
    """
    fname = 'watermelon.xlsx'  # relative path
    bk = xlrd.open_workbook(fname)
    try:
        sh = bk.sheet_by_name('Sheet1')
    except xlrd.XLRDError:
        # Without this return the code below would fail on the unbound `sh`.
        print("have not that sheet")
        return

    # Expected layout, taken from the sample data that used to be inlined
    # here: header row 色泽, 根蒂, 敲声, 纹理, 脐部, 触感, 好瓜, then one
    # sample per row with the class label (是 / 否) in column index 6.
    dataLabel = sh.row_values(0)
    dataSet = [sh.row_values(i) for i in range(1, sh.nrows)]

    print("信息熵为{}".format(calComentropy(dataSet,6)))
    print("色泽的信息增益为{}".format(calCondComentropy(dataSet,0,6)))

    root = Node('根', 6)
    root.index = dataSet
    root.label = dataLabel

    tree = BinaryTree(root)
    # Third argument picks the algorithm: 1 = ID3, 2 = C4.5, otherwise CART.
    tree.createWholeTree(root, 6, 1)
    tree.lengthTree(root, 1)
    print("树的高度是",tree.high)
    tree.leafTree(root)
    print("树的叶子结点个数", tree.leafNum)

    # frameon controls whether the axes rectangle is drawn.
    createPlot.ax1 = plt.subplot(111, frameon=False)
    tree.drawTree(root, (0,0), (0,0))
    plt.axis('off')  # hide the axes and tick marks
    plt.show()