根据processed.cleveland.data数据进行数据挖掘操作
UCI公开数据集-heartdisease,属性信息如下:
1.#3(age)
2.#4(sex)
3.#9(cp)
4.#10(trestbps)
5.#12(chol)
6.#16(fbs)
7.#19(restecg)
8.#32(thalach)
9.#38(exang)
10.#40(oldpeak)
11.#41(slope)
12.#44(ca)
13.#51(thal)
14.#58(num)(the predicted attribute)
数据集参考网址:https://archive.ics.uci.edu/ml/datasets/Heart+Disease
import math
import operator
def calcShannonEnt(dataset):
    """Return the Shannon entropy of the class labels (last column) of dataset.

    Entropy = -sum(p * log2(p)) over the distribution of the final element
    of each row.
    """
    total = len(dataset)
    # Frequency of each class label.
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for count in counts.values():
        p = count / float(total)
        entropy -= p * math.log(p, 2)
    return entropy
def CreateDataSet(filename='processed.cleveland.data'):
    """Load the processed Cleveland heart-disease CSV file.

    Parameters:
        filename: path to the data file (defaults to the original hard-coded
            'processed.cleveland.data', so existing callers are unaffected).

    Returns:
        (dataset, labels) where dataset is a list of rows of floats (missing
        values, written as '?' in this dataset, become None) and labels names
        the 14 columns: 13 features plus the 'num' class attribute.
    """
    def safe_float(token):
        # '?' marks a missing value in the UCI file; anything unparsable
        # becomes None rather than crashing the load.
        try:
            return float(token)
        except ValueError:
            return None

    dataset = []
    with open(filename) as read_file:
        for line in read_file:
            line = line.strip()  # handles '\n', '\r\n', and stray whitespace
            if not line:
                continue  # skip blank lines (e.g. a trailing newline)
            dataset.append([safe_float(tok) for tok in line.split(',')])
    # Bug fix: the original list contained 'thalach' twice, giving 15 labels
    # for 14 columns and shifting every label after index 7 by one.
    labels = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
              'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
    return dataset, labels
def splitDataSet(dataSet, axis, value):
    """Return the rows of dataSet whose feature at index `axis` equals `value`,
    with that feature column removed from each returned row."""
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote).

    Bug fixes relative to the original:
    - counts were overwritten with 1 (`classCount[vote] = 1`) instead of
      incremented, so every label tied at 1;
    - `dict.iteritems()` is Python-2-only and raises AttributeError on
      Python 3; replaced with `items()`.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort by count, descending; the winner is the first entry's label.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain
    (ID3 criterion), or -1 if no split improves on the base entropy.

    The last column of each row is the class label and is never a candidate.
    """
    featureCount = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestGain, bestIndex = 0.0, -1
    for idx in range(featureCount):
        # Weighted entropy of the partition induced by this feature.
        distinctValues = {row[idx] for row in dataSet}
        splitEntropy = 0.0
        for v in distinctValues:
            subset = splitDataSet(dataSet, idx, v)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestIndex = gain, idx
    return bestIndex
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {feature_label: {feature_value: subtree_or_leaf, ...}}.
    NOTE: `labels` is mutated in place (the chosen label is deleted at each
    level), matching the original behaviour — pass a copy if you need it.
    """
    classList = [row[-1] for row in dataSet]
    # Leaf: every row has the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Leaf: only the class column remains — fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestLabel = labels[bestFeat]
    del labels[bestFeat]
    tree = {bestLabel: {}}
    for value in {row[bestFeat] for row in dataSet}:
        branch = splitDataSet(dataSet, bestFeat, value)
        # Each branch gets its own copy of the remaining labels.
        tree[bestLabel][value] = createTree(branch, labels[:])
    return tree
if __name__ == '__main__':
    # Build the ID3 decision tree from the Cleveland heart-disease data and
    # show it. Guarded so importing this module does not trigger file I/O.
    MyData, label = CreateDataSet()
    print(createTree(MyData, label))
运行结果: