# coding=utf-8
from numpy import *
from math import log
class ID3(object):
    """Minimal ID3 decision-tree classifier for categorical features.

    Samples and labels are handled as 2-D ``numpy.matrix`` objects: one row
    per sample, one column per feature, and a single label column.

    A trained tree is stored in ``self.keys`` as nested dictionaries of the
    form ``{column_index: {feature_value: subtree_or_label}}``.  Every
    ``column_index`` is relative to the data *after* the columns used by the
    ancestor nodes have been removed (see ``split_DataSet``).
    """

    def createDataSet(self):
        """Load the built-in sample set into ``dataMat`` / ``labelMat``.

        The literal mixes strings and integers, so numpy stores every cell
        (including the label column) as a string.  Also resets ``self.keys``.
        """
        dataSet = matrix([['s', 's', 'no', 0],
                          ['s', 'l', 'yes', 1],
                          ['l', 'm', 'yes', 1],
                          ['m', 'm', 'yes', 1],
                          ['l', 'm', 'yes', 1],
                          ['m', 'l', 'no', 1],
                          ['m', 's', 'no', 0],
                          ['l', 'm', 'no', 1],
                          ['m', 's', 'no', 1],
                          ['s', 's', 'yes', 0]])
        self.dataMat = dataSet[:, 0:-1]
        self.labelMat = dataSet[:, -1]
        self.keys = {}

    # Entropy
    def entropy(self, data):
        """Return the base-2 Shannon entropy of the first column of ``data``.

        ``data`` must be 2-D; only column 0 is inspected.  An input with no
        rows yields ``0.0``.
        """
        dataNum = shape(data)[0]
        counts = {}
        for i in range(dataNum):
            value = data[i, 0]
            counts[value] = counts.get(value, 0) + 1
        result = 0.0
        for count in counts.values():
            prob = float(count) / float(dataNum)
            # ``log`` is math.log (imported at file level).  The old spelling
            # ``math.log`` only worked while numpy's star-import happened to
            # re-export the stdlib ``math`` module.
            result -= prob * log(prob, 2)
        return result

    # Information gain
    def gain(self, data, label):
        """Return the information gain of splitting ``label`` by ``data``.

        ``data`` is a single feature column (n x 1) and ``label`` the matching
        label column (n x 1).  The gain is the entropy of ``label`` minus the
        size-weighted entropy of the labels within each feature value.
        """
        dataNum = shape(data)[0]
        groups = {}
        for i in range(dataNum):
            groups.setdefault(data[i, 0], []).append(label[i, 0])
        result = self.entropy(label)
        for group_labels in groups.values():
            subset = asmatrix(group_labels).T  # column matrix, as entropy expects
            result -= self.entropy(subset) * subset.shape[0] / float(dataNum)
        return result

    def split_DataSet(self, data, label, axis, value):
        """Return the samples whose column ``axis`` equals ``value``.

        The matching rows are returned as ``(data, label)`` with column
        ``axis`` removed from the data part, so the result has one feature
        column fewer than the input.  The input dtype is kept as is.
        """
        dataset = asarray(hstack((data, label)))
        row_count, col_count = dataset.shape
        rows = array([i for i in range(row_count) if dataset[i, axis] == value],
                     dtype=int)
        cols = array([j for j in range(col_count) if j != axis], dtype=int)
        # Select by index instead of growing a float matrix row by row: this
        # avoids float-to-string dtype promotion and repeated copying.
        subset = asmatrix(dataset[rows][:, cols])
        return subset[:, 0:-1], subset[:, -1]

    def Best_Feature(self, data, label):
        """Return ``(axis, values)`` for the column with the highest gain.

        ``values`` is the set of distinct values found in that column.  When
        several columns tie on gain, the one with the largest index wins.
        Raises ``IndexError`` when ``data`` has no columns.
        """
        gains = [[self.gain(data[:, i], label), i] for i in range(data.shape[1])]
        gains.sort(reverse=True)
        axis = gains[0][1]
        values = set(data[i, axis] for i in range(data.shape[0]))
        return axis, values

    def stop_Condtion(self, data, label):
        """Return ``(stop, majority_label)`` for the current node.

        Growth stops when all labels agree or when at most one feature column
        is left.  NOTE: stopping with exactly one column left means that last
        column is never used for a split; this matches the trees this class
        has always produced and is kept for compatibility.
        """
        featureNum = shape(data)[1]
        distinct = set(label[i, 0] for i in range(shape(label)[0]))
        majority = self.more_label(label)
        if featureNum <= 1:
            return True, majority
        if len(distinct) == 1:
            return True, majority
        return False, majority

    def more_label(self, labels):
        """Return the most frequent value in column 0 of ``labels``.

        On a tie, the first value to reach the top count in iteration order
        is returned (the earliest-seen value on insertion-ordered dicts).
        Returns ``''`` when ``labels`` has no rows.
        """
        counts = {}
        for i in range(shape(labels)[0]):
            value = labels[i, 0]
            counts[value] = counts.get(value, 0) + 1
        more_key = ''
        more_value = 0
        for key, value in counts.items():
            if more_value < value:
                more_value = value
                more_key = key
        return more_key

    def build_Tree(self, data=None, label=None):
        """Recursively build the tree and return it (or a leaf label).

        ``data`` and ``label`` default to ``self.dataMat`` / ``self.labelMat``.
        A leaf is a bare label; an inner node is ``{axis: {value: subtree}}``
        where ``axis`` indexes the columns of the ``data`` passed to this call.
        """
        if data is None:
            data = self.dataMat
        if label is None:
            label = self.labelMat
        stop, fina_label = self.stop_Condtion(data, label)
        if stop:
            return fina_label
        axis, values = self.Best_Feature(data, label)
        keys = {}
        for value in values:
            split_data, split_label = self.split_DataSet(data, label, axis, value)
            keys[value] = self.build_Tree(split_data, split_label)
        return {axis: keys}

    def train(self):
        """Load the built-in data set and fit the tree into ``self.keys``."""
        self.createDataSet()
        self.keys = self.build_Tree()

    def predict_One(self, data, keys=None):
        """Classify a single sample.

        ``data`` is one row of features (1 x n); ``keys`` is the (sub)tree to
        walk and defaults to ``self.keys``.  Returns the leaf label reached,
        or ``None`` when the sample holds a value the tree has no branch for.
        """
        if keys is None:
            keys = self.keys
        if not isinstance(keys, dict):
            return keys
        data = asmatrix(data)
        for axis, branches in keys.items():
            for value, subtree in branches.items():
                if data[0, axis] == value:
                    # The subtree was built on data with this column removed,
                    # so drop it here too; otherwise nested indices would
                    # address the wrong feature.
                    remaining = hstack((data[:, :axis], data[:, axis + 1:]))
                    return self.predict_One(remaining, subtree)
        return None

    def predict(self, data):
        """Classify every row of ``data``.

        Returns an (n x 1) float matrix of predicted labels, so labels must
        be numeric or numeric strings.  Rows that cannot be classified
        (``predict_One`` returned ``None``) are reported as NaN.
        """
        data = asmatrix(data)
        data_num = shape(data)[0]
        label = asmatrix(zeros((data_num, 1)))
        for i in range(data_num):
            result = self.predict_One(data[i, :])
            label[i, 0] = float('nan') if result is None else float(result)
        return label
if __name__ == '__main__':
    # Train on the built-in sample set, then classify the same samples again.
    tree = ID3()
    tree.train()
    dataSet = matrix([['s', 's', 'no', 0],
                      ['s', 'l', 'yes', 1],
                      ['l', 'm', 'yes', 1],
                      ['m', 'm', 'yes', 1],
                      ['l', 'm', 'yes', 1],
                      ['m', 'l', 'no', 1],
                      ['m', 's', 'no', 0],
                      ['l', 'm', 'no', 1],
                      ['m', 's', 'no', 1],
                      ['s', 's', 'yes', 0]])
    data = dataSet[:, 0:-1]
    # Last column of the sample set; not used below, kept for manual
    # comparison with the printed predictions.
    label = dataSet[:, -1]
    target = tree.predict(data)
    print(target)
# Test set labels:
# [['0' '1' '1' '1' '1' '1' '0' '1' '1' '0']]
# Test result:
# [[ 0. 1. 1. 1. 1. 1. 1. 1. 1. 0.]]