# 输入: iris.data
# 输出: 二叉决策树, 以及测试集在这棵树上的准确率 (此处用 gini 指数)
# 代码如下
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn import tree
iris=load_iris()
# 取测试集,其余为训练集
removed = range(70,100)
new_target = np.delete(iris.target,removed)
new_data = np.delete(iris.data,removed, axis=0)
# entropy计算
def calc_entropy(p):
if p!=0:
return -p * np.log2(p)
else:
return 0
# 增益计算
def calc_info_gain(data,classes,feature):
# Calculates the information gain based on both entropy and the Gini impurity
gain = 0
ggain = 0
nData = len(data)
# List the values that feature can take
values = []
for datapoint in data:
if datapoint[feature] not in values:
values.append(datapoint[feature])
featureCounts = np.zeros(len(values))
entropy = np.zeros(len(values))
gini = np.zeros(len(values))
valueIndex = 0
# Find where those values appear in data[feature] and the corresponding class
for value in values:
dataIndex = 0
newClasses = []
for datapoint in data:
if datapoint[feature]==value:
featureCounts[valueIndex]+=1
newClasses.append(classes[dataIndex])
dataIndex += 1
# Get the values in newClasses
classValues = []
for aclass in newClasses:
if classValues.count(aclass)==0:
classValues.append(aclass)
classCounts = np.zeros(len(classValues))
classIndex = 0
for classValue in classValues:
for aclass in newClasses:
if aclass == classValue:
classCounts[classIndex]+=1
classIndex += 1
for classIndex in range(len(classValues)):
entropy[valueIndex] += calc_entropy(float(classCounts[classIndex])/np.sum(classCounts))
gini[valueIndex] += (float(classCounts[classIndex])/np.sum(classCounts))**2
# Computes both the Gini gain and the entropy
gain = gain + float(featureCounts[valueIndex])/nData * entropy[valueIndex]
ggain = ggain + float(featureCounts[valueIndex])/nData * gini[valueIndex]
valueIndex += 1
return gain, 1-ggain
# 生成树
def make_tree(data,classes,featureNames,maxlevel=-1,level=0,forest=0):
""" The main function, which recursively constructs the tree"""
nData = len(data)
nFeatures = len(data[0])
newClasses = []
for aclass in classes:
if newClasses.count(aclass)==0:
newClasses.append(aclass)
frequency = np.zeros(len(newClasses))
totalEntropy = 0
totalGini = 0
index = 0
for aclass in newClasses:
frequency[index] = np.sum(classes == aclass)
totalEntropy += calc_entropy(float(frequency[index])/nData)
totalGini += (float(frequency[index])/nData)**2
index += 1
totalGini = 1 - totalGini
default = classes[np.argmax(frequency)]
if nData==0 or nFeatures == 0 or (maxlevel>=0 and level>maxlevel):
# Have reached an empty branch
return default
elif np.sum(classes == classes[0]) == nData:
# Only 1 class remains
return classes[0]
else:
# Choose which feature is best
gain = np.zeros(nFeatures)
ggain = np.zeros(nFeatures)
featureSet = range(nFeatures)
if forest != 0:
np.random.shuffle(featureSet)
featureSet = featureSet[0:forest]
for feature in featureSet:
g,gg = calc_info_gain(data,classes,feature)
gain[feature] = totalEntropy - g
ggain[feature] = totalGini - gg
bestFeature = np.argmax(gain)
tree = {featureNames[bestFeature]:{}}
# List the values that bestFeature can take
values = []
for datapoint in data:
if datapoint[feature] not in values:
values.append(datapoint[bestFeature])
for value in values:
# Find the datapoints with each feature value
newData = []
newClasses = []
index = 0
for datapoint in data:
if datapoint[bestFeature]==value:
if bestFeature==0:
newdatapoint = datapoint[1:]
newNames = featureNames[1:]
elif bestFeature==nFeatures:
newdatapoint = datapoint[:-1]
newNames = featureNames[:-1]
else:
newdatapoint = datapoint[:bestFeature]
newdatapoint = np.hstack((newdatapoint,datapoint[bestFeature+1:]))
newNames = featureNames[:bestFeature]
newNames.extend(featureNames[bestFeature+1:])
newData.append(newdatapoint)
newClasses.append(classes[index])
index += 1
# Now recurse to the next level
subtree = make_tree(newData,newClasses,newNames,maxlevel,level+1,forest)
# And on returning, add the subtree on to the tree
tree[featureNames[bestFeature]][value] = subtree
return tree
def classify(tree,datapoint,featureNames):
if type(tree) == type("string"):
# Have reached a leaf
return tree
else:
if isinstance(tree,dict):
a = list(tree.keys())[0]
for i in range(3):
if featureNames[i]==a:
break
try:
t = tree[a][datapoint[i]]
return classify(t,datapoint,featureNames)
except:
for y in range(len(tree[a])):
tree_feature = [0]*len(tree[a])
differ = list(tree[a].keys())[y] - datapoint[i]
if differ < 0:
tree_feature[y] = (-1)*differ
else:
tree_feature[y] = differ
index = tree_feature.index(min(tree_feature))
t = tree[a][list(tree[a].keys())[y]]
return classify(t, datapoint, featureNames)
else:
return tree
# 对测试集进行分类
def classifyAll(tree,data,featureNames):
results = []
for i in range(len(data)):
results.append(classify(tree,data[i],featureNames)-1)
return results
# 计算准确率
def accuracy(a,b):
s = 0
for i in range(len(a)):
if a[i] == b[i]:
s += 1
rate = s/len(a)
return rate
featureNames = iris.feature_names
classes = iris.target
tree = make_tree(new_data,new_target,featureNames)
a = classifyAll(tree, iris.data[removed], featureNames)
# 打印树与准确率
print(tree)
print(accuracy(a,list(iris.target[removed])))