#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Minimal ID3 decision-tree builder driven by information gain.

A data set is a list of rows.  Each row holds its feature values followed by
the class label in the last position.  Trace output is emitted through the
``logging`` module at DEBUG level, so it stays silent unless the caller
enables it (e.g. ``logging.basicConfig(level=logging.DEBUG)``).
"""
import logging
from collections import Counter
from math import log

logger = logging.getLogger(__name__)


def createDataSET():
    """Return the toy data set and the names of its two feature columns."""
    dataSet = [
        [1, 1, "yes"],
        [1, 1, "yes"],
        [1, 0, "no"],
        [0, 1, "no"],
        [0, 1, "no"],
    ]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels


def getBeastFeature(dataSet):
    """Return the index of the feature with the largest information gain.

    The gain of feature ``A`` is ``H(D) - H(D|A)``: the entropy of the class
    labels (last column) minus the size-weighted entropy of the class labels
    inside each subset produced by splitting on ``A``.

    :param dataSet: list of rows, class label in the last column
    :return: column index of the best feature, or -1 when the data set is
             empty or no feature yields a strictly positive gain
    """
    if not dataSet:
        return -1

    numFeature = len(dataSet[0]) - 1
    # float() keeps the weight below a true division under Python 2 as well.
    total = float(len(dataSet))
    # Entropy of the class labels; it is the same for every candidate feature.
    empirical = getEntropy(dataSet, -1)

    baseFeather = -1
    basegain = 0.0
    logger.debug(" --->feathreList%s", list(range(numFeature)))
    for i in range(numFeature):
        uniqueValus = set(data[i] for data in dataSet)
        logger.debug("uniqueValues---->%s", uniqueValus)
        conEntropy = 0.0
        for value in uniqueValus:
            logger.debug("i--->%s---->%s", i, value)
            splitData = splitDataSet(dataSet, i, value)
            # The split rows no longer contain column i, so the class label
            # must be addressed from the end of the row.
            conEntropy += (len(splitData) / total) * getEntropy(splitData, -1)
        nowgain = empirical - conEntropy
        if nowgain > basegain:
            basegain = nowgain
            baseFeather = i
    return baseFeather


def getEntropy(dataSet, i=-1):
    """Return the Shannon entropy (base 2) of the values found in column ``i``.

    :param dataSet: list of rows
    :param i: column whose value distribution is measured; the default -1
              selects the class-label column
    :return: entropy in bits; 0.0 for an empty data set
    """
    dataLen = len(dataSet)
    if dataLen == 0:
        return 0.0

    entropy = 0.0
    # One term per *distinct* value, weighted by its relative frequency.
    for num_value in Counter(data[i] for data in dataSet).values():
        propotibity = float(num_value) / dataLen
        entropy -= propotibity * log(propotibity, 2)
    return entropy


def splitDataSet(dataSet, i, values):
    """Select the rows whose column ``i`` equals ``values`` and drop that column.

    :param dataSet: list of rows
    :param i: index of the feature column to split on
    :param values: feature value a row must have to be kept
    :return: new list of new row lists without column ``i``; the input rows
             are not modified
    """
    splitData = [
        list(data[:i]) + list(data[i + 1:])
        for data in dataSet
        if data[i] == values
    ]
    logger.debug("splitData---->%s", splitData)
    return splitData


def getMark(dataSet):
    """Return the class label that occurs most often in the last column.

    Ties are resolved in favour of the label that appears first in
    ``dataSet`` so the result does not depend on set or dict ordering.

    :param dataSet: non-empty list of rows, class label in the last column
    :raises ValueError: if ``dataSet`` is empty
    """
    if not dataSet:
        raise ValueError("getMark() requires a non-empty data set")

    counts = Counter(data[-1] for data in dataSet)
    MainClass = dataSet[0][-1]
    for data in dataSet:
        if counts[data[-1]] > counts[MainClass]:
            MainClass = data[-1]
    return MainClass


# Tree representation: a leaf is a class label, an inner node is a dict, e.g.
# {"no surfacing": {0: "no", 1: {"flippers": {0: "no", 1: "yes"}}}}
Tree = {}


def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree.

    :param dataSet: non-empty list of rows, class label in the last column
    :param labels: names of the feature columns, in column order; the list
                   is left untouched
    :return: a class label (leaf) or ``{feature_name: {value: subtree}}``
    :raises ValueError: if ``dataSet`` is empty
    """
    if not dataSet:
        raise ValueError("createTree() requires a non-empty data set")

    classList = [examle[-1] for examle in dataSet]
    # Every row carries the same class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column remains: all features are used up, so label the
    # node with the majority class.
    if len(dataSet[0]) == 1:
        return getMark(dataSet)

    baseFeature = getBeastFeature(dataSet)
    # No feature reduces the entropy: a further split is pointless, and -1
    # must not be used as a column index.
    if baseFeature < 0:
        return getMark(dataSet)

    baseLabel = labels[baseFeature]
    logger.debug("baselabel%s", baseLabel)
    logger.debug("baseFeature--->%s", baseFeature)
    mytree = {baseLabel: {}}
    # Build the remaining label list as a copy so the caller's list survives.
    sublables = list(labels[:baseFeature]) + list(labels[baseFeature + 1:])
    for values in set(data[baseFeature] for data in dataSet):
        splitData = splitDataSet(dataSet, baseFeature, values)
        mytree[baseLabel][values] = createTree(splitData, sublables)
        logger.debug("sublabels---->>%s", sublables)
    return mytree


# Demo: kept at module level so the names dataSet, labels and mytree stay
# available to anything that imports this file.
dataSet, labels = createDataSET()
mytree = createTree(dataSet, labels)
print(mytree)