# Build the decision tree
def createTree(dataSet, labels, featLabels):
    """Recursively build a decision tree represented as nested dicts.

    Args:
        dataSet: List of samples. The last element of every sample is its
            class label; the elements before it are feature values.
        labels: Feature names, indexed the same way as the feature columns
            of ``dataSet``. This list is NOT modified.
        featLabels: Output list. The name of each feature chosen for a split
            is appended to it, in the order the splits are made.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{featureName: {featureValue: subtree_or_label}}``.
    """
    classList = [example[-1] for example in dataSet]  # class label of every sample
    # Stop splitting when every sample has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: no feature remains to split on, so let
    # majorityCnt pick the label (presumably the most frequent class --
    # helper is defined elsewhere, confirm there).
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # column index of the chosen feature
    bestFeatLabel = labels[bestFeat]
    featLabels.append(bestFeatLabel)
    myTree = {bestFeatLabel: {}}
    # Remove the used label from a per-call COPY. Deleting from the shared
    # list (as before) let one subtree's deletions leak into its siblings,
    # leaving their labels out of step with their data columns, and also
    # altered the caller's list.
    subLabels = list(labels)
    del subLabels[bestFeat]
    # One branch per distinct value of the chosen feature.
    # NOTE(review): assumes splitDataSet returns the matching samples with the
    # chosen column removed, so subLabels stays aligned -- confirm in helper.
    for value in {example[bestFeat] for example in dataSet}:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels, featLabels)
    return myTree
if __name__ == '__main__':
    # Demo: build the tree from the sample data set and show it.
    featLabels = list()  # filled by createTree with the features it splits on
    dataSet, labels = createDataSet()
    myTree = createTree(dataSet, labels, featLabels)
    print(myTree)
运行结果:
实现分类:
# Classify a sample with a decision tree
def classify(inputTree, featLabels, testVec):
    """Walk a nested-dict decision tree and return the label for ``testVec``.

    Args:
        inputTree: Tree of the form
            ``{featureName: {featureValue: subtree_or_label}}``.
        featLabels: Feature names; the position of a name in this list is the
            position of that feature's value in ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The leaf value reached by following the sample's feature values.

    Raises:
        ValueError: If a node's feature name is not in ``featLabels``, or if
            the sample's value for that feature has no branch in the tree.
    """
    firstStr = next(iter(inputTree))  # feature tested at this node
    secondDict = inputTree[firstStr]  # branches keyed by feature value
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    for key, subtree in secondDict.items():
        if featValue == key:
            # A dict (or dict subclass) is an inner node: keep descending.
            if isinstance(subtree, dict):
                return classify(subtree, featLabels, testVec)
            return subtree  # leaf: the class label
    # Previously this fell through to an unbound local variable.
    raise ValueError(
        'no branch for value %r of feature %r' % (featValue, firstStr))
if __name__ == '__main__':
    # Demo: build the tree, classify one sample and report the verdict.
    featLabels = list()  # filled by createTree with the features it splits on
    dataSet, labels = createDataSet()
    myTree = createTree(dataSet, labels, featLabels)
    # Test sample; classify reads each value at the position its feature
    # name has in featLabels.
    testVec = [0, 0]
    result = classify(myTree, featLabels, testVec)
    if result == 'yes':
        print('推优')
    elif result == 'no':
        print('不推优')
运行结果: