Approach
Shannon entropy
dataSet:
----------
1 1 A
1 1 A
1 0 B
0 1 B
0 1 B
----------
N rows in total
----------
Variables needed:
P(A) = n(A)/N
P(B) = n(B)/N
p(xi) = n(class_i)/N
H = sum[ -p(xi) * log2(p(xi)) ]
Programming plan:
We need N, n(class), and shannonEnt.
N is obtained with len().
n(class) is tallied in a dict: a label seen for the first time is initialized to 0 and then incremented; a known label is simply incremented.
shannonEnt is then computed from each class's n(class)/N.
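
A worked check on the toy table above (hand-computed, rounded to three decimals):
n(A) = 2, n(B) = 3, N = 5, so

H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.529 + 0.442 ≈ 0.971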
trees.py
'''
Data
'''
def createDataSet():
# dataSet = [[1, 1, 'yes'],
# [1, 1, 'yes'],
# [1, 0, 'no'],
# [0, 1, 'no'],
# [0, 1, 'no']]
# labels = ['no surfacing','flippers']
dataSet = [[1, 1, 1, 'yes'],
[1, 1, 0, 'yes'],
[1, 0, 0, 'no'],
[0, 1, 1, 'no'],
[0, 1, 0, 'yes']]
labels = ['no surfacing','flippers','head']
#change to discrete values
return dataSet, labels
'''
Shannon entropy: H = sum[ -p(xi) * log2(p(xi)) ]
(see the notes at the top of this document for the full derivation)
'''
from math import log
def calcShannonEnt(dataSet):
    # total number of instances; used several times, so bound to a variable
    numEntries = len(dataSet)
    # dict keyed by the last column (the class label), counting occurrences
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        # a label seen for the first time is initialized to 0, then incremented;
        # a known label is simply incremented
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    # accumulate the Shannon entropy: H = sum[ -p(xi) * log2(p(xi)) ]
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    # return the computed Shannon entropy
    return shannonEnt
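
# Quick sanity check (a sketch, assuming the active three-feature dataset
# with 3 'yes' and 2 'no' labels):
#   >>> dataSet, labels = createDataSet()
#   >>> calcShannonEnt(dataSet)
#   0.9709505944546686   # = -(3/5)*log2(3/5) - (2/5)*log2(2/5)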
'''
Dataset splitting
Criterion:
"Information gain selects the best attribute for a decision: the larger the
gain, the stronger that attribute's discriminating power."
This is how the best splitting attribute is chosen.
'''
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        # keep only the rows whose feature at `axis` equals `value`,
        # and drop that feature column from each kept row
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
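
# Example (a sketch, assuming the active three-feature dataset):
#   >>> dataSet, labels = createDataSet()
#   >>> splitDataSet(dataSet, 0, 1)   # rows with 'no surfacing' == 1, column 0 removed
#   [[1, 1, 'yes'], [1, 0, 'yes'], [0, 0, 'no']]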
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1      # number of features (last column is the class label)
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):           # try splitting on feature 0, then feature 1, ...
        print("feature [", i, "]")
        # collect column i only: the i-th value of every row
        featList = [example[i] for example in dataSet]
        print("values of feature", i, ":", featList)
        uniqueVals = set(featList)         # distinct values of this feature
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            print("subset for feature", i, "==", value, ":", subDataSet)  # key step: partition into more regular subsets
            prob = len(subDataSet) / float(len(dataSet))
            print("entropy of that subset:", calcShannonEnt(subDataSet))
            # weighted entropy after the split:
            # sum over subsets of (subset size / total size) * subset entropy
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain of feature i
        print("weighted entropy after splitting on feature", i, ":", newEntropy)
        print("base entropy of the full dataset:", baseEntropy)
        print("information gain of feature", i, ":", infoGain)
        if (infoGain > bestInfoGain):       # compare this to the best gain so far
            bestInfoGain = infoGain         # if better than the current best, record it
            bestFeature = i
            print("bestInfoGain", bestInfoGain, "for feature index", bestFeature)
    print("feature [", bestFeature, "] is the best split")
    return bestFeature                      # returns an integer index
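
# Hand-worked numbers (my own arithmetic, assuming the active dataset;
# rounded to three decimals): base entropy ≈ 0.971, and
#   gain('no surfacing') ≈ 0.971 - (3/5*0.918 + 2/5*1.000) ≈ 0.020
#   gain('flippers')     ≈ 0.971 - (4/5*0.811 + 1/5*0.000) ≈ 0.322
#   gain('head')         ≈ 0.971 - (2/5*1.000 + 3/5*0.918) ≈ 0.020
# so chooseBestFeatureToSplit(dataSet) should return 1 ('flippers').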
'''
Majority vote, added to handle the corner case where
the features are exhausted but the class labels are still mixed.
'''
import operator
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount: classCount[vote] = 0
        classCount[vote] += 1
    # Python 3: classCount.items()
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # Python 2 used classCount.iteritems():
    # sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
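
# Example (a sketch): majorityCnt(['yes', 'no', 'no']) returns 'no',
# the most frequent label in the list.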
'''
Build the tree (ID3, recursive)
'''
def createTree(dataSet, labels):
    print("____________________________________________")
    classList = [example[-1] for example in dataSet]  # class labels from the last column
    print("classList[0]:", classList[0], "count:", classList.count(classList[0]))
    print("labels to classify:", classList, "count:", len(classList))
    if classList.count(classList[0]) == len(classList):
        print("all labels identical, nothing left to split, leaf reached")
        return classList[0]  # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1:  # stop splitting when there are no more features in dataSet
        print("no features left, fall back to the most frequent label")
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best splitting feature
    bestFeatLabel = labels[bestFeat]              # name of the best splitting feature
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the remaining labels so the recursion cannot disturb them
        print("labels[:]", labels[:])
        print("data for the next round:", splitDataSet(dataSet, bestFeat, value))
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    print("____________________done________________________")
    return myTree
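
# Example (a sketch; createTree mutates `labels`, so pass a copy):
#   >>> dataSet, labels = createDataSet()
#   >>> createTree(dataSet, labels[:])
# With the active dataset this should yield a tree rooted at 'flippers',
# since 'flippers' has the largest information gain (see the numbers above).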
## Application ########
'''
Classify new samples with the trained tree
'''
def classify(inputTree, featLabels, testVec):
    # Python 2:
    # firstStr = inputTree.keys()[0]
    # Python 3: dict keys are a view, so convert to a list first
    firstStr = list(inputTree.keys())[0]
    print("firstStr:", firstStr)
    secondDict = inputTree[firstStr]
    print("subtree looked up by key, secondDict:", secondDict)
    featIndex = featLabels.index(firstStr)
    print("featLabels", featLabels, "index", featIndex)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    print("valueOfFeat is still a dict (internal node)?", isinstance(valueOfFeat, dict))
    if isinstance(valueOfFeat, dict):
        # internal node: keep descending
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else:
        # leaf node: this is the predicted class
        classLabel = valueOfFeat
    return classLabel
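
# Trace for the hard-coded tree in __main__ and testVec [1, 0, 1] (a sketch):
# 'no surfacing' == 1 -> descend into the 'flippers' subtree,
# 'flippers' == 0 -> descend into the 'head' subtree,
# 'head' == 1 -> leaf 'yes'.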
'''
Persist the tree to disk
'''
def storeTree(inputTree, filename):
    import pickle
    # the context manager closes the file even if dump() raises
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
'''
Load the tree from disk
'''
def grabTree(filename):
    import pickle
    # use a context manager so the file handle is closed after loading
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
if __name__ == "__main__":
    # dataSet, labels = createDataSet()
    # print(dataSet, labels)
    # print(calcShannonEnt(dataSet))
    # print(chooseBestFeatureToSplit(dataSet))
    # print(createTree(dataSet, labels))
    dataSet, labels = createDataSet()
    tree = {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
    storeTree(tree, 'ID3_Tree.txt')
    print(grabTree("ID3_Tree.txt"))
    print("classification result:", classify(tree, labels, [1, 0, 1]))