import numpy as np
import pandas as pd
def calEnt(dataSet):
    """Return the Shannon entropy of the last (label) column of dataSet.

    Args:
        dataSet: DataFrame whose final column holds the class labels.

    Returns:
        float: entropy = -sum(p * log2(p)) over the label frequencies.
    """
    total = dataSet.shape[0]  # number of samples
    # Relative frequency of every distinct label value.
    probs = dataSet.iloc[:, -1].value_counts() / total
    return (-probs * np.log2(probs)).sum()
def createDataSet():
    """Build the 5-row toy fish-classification dataset.

    Returns:
        DataFrame with two binary features ('no surfacing', 'flippers')
        and the class label column 'fish' ('yes'/'no').
    """
    return pd.DataFrame({
        'no surfacing': [1, 1, 1, 0, 0],
        'flippers': [1, 1, 0, 1, 1],
        'fish': ['yes', 'yes', 'no', 'no', 'no'],
    })
# Build the toy dataset and print the entropy of its label column (~0.971).
dataSet = createDataSet()
print(calEnt(dataSet))
def bestSplit(dataSet):
    """Return the index of the feature column with the highest information gain.

    Every column except the last is treated as a candidate feature; the
    last column is the class label. Returns -1 when no split yields a
    strictly positive gain.
    """
    baseEnt = calEnt(dataSet)  # entropy before any split
    bestGain, axis = 0, -1
    n_rows = dataSet.shape[0]
    for col in range(dataSet.shape[1] - 1):
        # Weighted entropy of the children produced by splitting on `col`.
        weighted = 0
        for value in dataSet.iloc[:, col].value_counts().index:
            subset = dataSet[dataSet.iloc[:, col] == value]
            weighted += (subset.shape[0] / n_rows) * calEnt(subset)
        gain = baseEnt - weighted
        if gain > bestGain:  # keep the split with the largest gain
            bestGain, axis = gain, col
    return axis
# Print the index of the best first split for the toy dataset (expected: 0).
print(bestSplit(dataSet))
def mySplit(dataSet, axis, value):
    """Select the rows where column `axis` equals `value`, then drop that column.

    Args:
        dataSet: source DataFrame.
        axis: positional index of the feature column to split on.
        value: feature value selecting the child node's rows.

    Returns:
        DataFrame containing the matching rows without the split column.
    """
    feature = dataSet.columns[axis]
    mask = dataSet[feature] == value
    return dataSet.loc[mask, :].drop(feature, axis=1)
# Show the child dataset obtained by splitting on 'no surfacing' == 1.
print(mySplit(dataSet,0,1))
def createTree(dataSet):
    """Recursively build an ID3 decision tree as nested dicts.

    Internal nodes have the shape {feature_name: {feature_value: subtree}};
    leaves are class labels.

    Args:
        dataSet: DataFrame whose last column is the class label.

    Returns:
        Nested dict tree, or a single label when the node is a leaf.
    """
    featlist = list(dataSet.columns)        # all column names, label last
    classlist = dataSet.iloc[:, -1].value_counts()
    # Stop when the node is pure or only the label column remains.
    # Use .iloc[0]: integer-key lookup on a string-indexed Series
    # (classlist[0]) is deprecated/removed in modern pandas.
    if classlist.iloc[0] == dataSet.shape[0] or dataSet.shape[1] == 1:
        return classlist.index[0]
    axis = bestSplit(dataSet)
    if axis == -1:
        # No feature yields positive information gain: return majority class
        # instead of mis-splitting on featlist[-1] (the label column).
        return classlist.index[0]
    bestfeat = featlist[axis]
    myTree = {bestfeat: {}}
    del featlist[axis]                      # feature is consumed at this node
    for value in set(dataSet.iloc[:, axis]):
        myTree[bestfeat][value] = createTree(mySplit(dataSet, axis, value))
    return myTree
np.save('myTree.npy' , createTree(dataSet)) # persist the tree dict (np.save pickles it)
read_myTree = np.load('myTree.npy',allow_pickle=True).item()  # reload; .item() unwraps the 0-d object array
print(read_myTree)
def classify(inputTree, labels, testVec):
    """Classify one instance by walking the decision tree.

    Args:
        inputTree: nested-dict tree as produced by createTree.
        labels: list of feature column names, in dataset order.
        testVec: sequence of feature values for one instance.

    Returns:
        The predicted class label, or None when the instance's feature
        value has no matching branch in the tree (previously this case
        raised UnboundLocalError).
    """
    firstStr = next(iter(inputTree))        # feature tested at this node
    secondDict = inputTree[firstStr]        # value -> subtree/label map
    featIndex = labels.index(firstStr)      # position of that feature in testVec
    classLabel = None                       # fallback: unseen feature value
    for key in secondDict:
        if testVec[featIndex] == key:
            subtree = secondDict[key]
            if isinstance(subtree, dict):   # internal node: recurse
                classLabel = classify(subtree, labels, testVec)
            else:                           # leaf: done
                classLabel = subtree
            break
    return classLabel
def acc_classify(train, test):
    """Train a tree on `train`, predict every row of `test`, and print accuracy.

    Args:
        train: training DataFrame (last column is the label).
        test: evaluation DataFrame with the same columns.

    Returns:
        A copy of `test` with a 'predict' column appended. The caller's
        DataFrame is left untouched — assigning into a sliced frame
        (e.g. df.iloc[:3, :]) triggers SettingWithCopyWarning / fails
        under pandas copy-on-write semantics.
    """
    inputTree = createTree(train)           # fit the tree on the training set
    labels = list(train.columns)            # feature/label column names
    result = []
    for i in range(test.shape[0]):
        testVec = test.iloc[i, :-1]         # one instance's feature values
        result.append(classify(inputTree, labels, testVec))
    test = test.copy()                      # never mutate the caller's frame
    test['predict'] = result                # append predictions as last column
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f' 模型预测准确率为 {acc}')
    return test
# Sanity check: train and evaluate on (a slice of) the same toy dataset.
train = dataSet
test = dataSet.iloc[:3, : ]
acc_classify(train,test)
# Decision tree algorithm implemented in Python.
# (Blog footer: latest recommended article published 2024-04-30 15:43:43.)