import numpy as np
class DecisionTree:
"""决策树使用方法:
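- Create an instance: clf = DecisionTree(). The optional parameter mode is 'ID3' or 'C4.5' (default 'C4.5').
- Train by calling fit: clf.fit(X, y). X and y are both np.ndarray.
- Predict by calling predict: clf.predict(X). X is an np.ndarray.
- Visualize the decision tree by calling showTree.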
"""
def __init__(self,mode='C4.5'):
self._tree = None
if mode == 'C4.5' or mode == 'ID3':
self._mode = mode
else:
raise Exception('mode should be C4.5 or ID3')
def _calcEntropy(self,y):
"""
函数功能:计算熵
参数y:数据集的标签
"""
num = y.shape[0]
#统计y中不同label值的个数,并用字典labelCounts存储
labelCounts = {}
for label in y:
if label not in labelCounts.keys(): labelCounts[label] = 0
labelCounts[label] += 1
#计算熵
entropy = 0.0
for key in labelCounts:
prob = float(labelCounts[key])/num
entropy -= prob * np.log2(prob)
return entropy
def _splitDataSet(self,X,y,index,value):
"""
函数功能:返回数据集中特征下标为index,特征值等于value的子数据集
"""
ret = []
featVec = X[:,index]
X = X[:,[i for i in range(X.shape[1]) if i!=index]]
for i in range(len(featVec)):
if featVec[i]&#