# python实现西瓜书《机器学习》习题4.4基尼指数决策树，预剪枝及后剪枝

import pandas as pd

#data_file_encode="gb18030"   #gb18030支持汉字和少数民族字符，是一二四字节变长编码。这么用的时候with open需要增加encoding参数，但会报错gb18030不能解码
# with open相当于打开文件，保存成str对象，如果出错则关闭文件。参数r表示只读
# Load the data set into the DataFrame `df` used by the rest of the script.
# The file is opened read-only in text mode; the context manager closes it
# even if parsing fails.
with open("/Users/huatong/PycharmProjects/Data/watermelon_33.csv", mode="r") as data_file:
    df = pd.read_csv(data_file)

import decision_tree

# Build the training set: `iloc` picks rows by integer position, while `drop`
# returns the table without the listed rows, which then serve as the test set.
index_train = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]  # same training samples as page 80 of the book

df_train = df.iloc[index_train]
df_test = df.drop(index_train)

# Unpruned (full) tree.
root = decision_tree.TreeGenerate(df_train)
# decision_tree.DrawPNG(root, "decision_tree_full.png")  # drawing did not work, left disabled
full_tree_accuracy = decision_tree.PredictAccuracy(root, df_test)
print("accuracy of full tree: %.3f" % full_tree_accuracy)

# Pre-pruning.
root = decision_tree.PrePurn(df_train, df_test)
# decision_tree.DrawPNG(root, "decision_tree_pre.png")
pre_pruned_accuracy = decision_tree.PredictAccuracy(root, df_test)
print("accuracy of pre-purning tree: %.3f" % pre_pruned_accuracy)

# Post-pruning: grow the whole tree first, then prune it bottom-up in place.
root = decision_tree.TreeGenerate(df_train)
decision_tree.PostPurn(root, df_test)
# decision_tree.DrawPNG(root, "decision_tree_post.png")
post_pruned_accuracy = decision_tree.PredictAccuracy(root, df_test)
print("accuracy of post-purning tree: %.3f" % post_pruned_accuracy)

# 5-fold cross validation: each fold is a contiguous run of m rows used as the
# test set, and the tree is trained on the remaining rows and post-pruned.
accuracy_scores = []
n = len(df.index)
k = 5
# Fold size. NOTE(review): when n is not a multiple of k, the trailing n % k
# rows are never used as test data (they stay in every training set).
m = n // k
for fold in range(k):
    test = list(range(fold * m, fold * m + m))

    # Select by position on both sides so the split is also correct when the
    # index of `df` is not the default 0..n-1 range.
    df_train = df.drop(df.index[test])
    df_test = df.iloc[test]
    root = decision_tree.TreeGenerate(df_train)  # generate the tree
    decision_tree.PostPurn(root, df_test)  # post-pruning

    # Reuse the module's accuracy helper; it returns 0 for an empty fold
    # instead of dividing by zero.
    accuracy_scores.append(decision_tree.PredictAccuracy(root, df_test))

# Print the per-fold accuracies followed by their mean.
print("accuracy: ", end="")
for score in accuracy_scores:
    print("%.3f  " % score, end="")
print("\naverage accuracy: %.3f" % (sum(accuracy_scores) / k))



# ----- File: decision_tree.py (the module imported by the script above) -----

#被主程序执行treeGenerate时候调用，def用于定义函数
#节点类，包含①当前节点的属性，例如纹理清晰？ ②节点所属分类，只对叶子节点有效 ③向下划分的属性取值例如色泽乌黑青绿浅白

class Node(object):
    """A node of the decision tree.

    Attributes:
        attr: name of the attribute this node splits on; ``None`` marks a leaf.
        label: class label stored on the node (the tree builders set it to the
            majority class of the samples that reach the node).
        attr_down: dict mapping a branch key to the child ``Node``. Keys are
            the attribute values for a discrete split, or the strings
            ``"<=%.3f"`` / ``">%.3f"`` for a continuous split.
    """

    def __init__(self, attr_init=None, label_init=None, attr_down_init=None):
        self.attr = attr_init
        self.label = label_init
        # Create a fresh dict per instance: a `{}` default would be evaluated
        # once and shared by every Node constructed without this argument.
        self.attr_down = {} if attr_down_init is None else attr_down_init

#主函数，输入参数为数据集，输出参数为决策树根节点Node
def TreeGenerate(df):
    """Recursively grow a decision tree using the Gini index.

    Args:
        df: pandas DataFrame whose last column is the class label. The first
            column is never considered as a split attribute (presumably a
            sample id); the columns in between are the candidate attributes.

    Returns:
        Node: root of the generated (sub)tree. Every node carries the majority
        label of the samples that reached it; the label stays ``None`` when
        ``df`` has no rows.
    """
    new_node = Node(None, None, {})
    label_arr = df[df.columns[-1]]  # the class-label column

    label_count = NodeLabel(label_arr)
    if label_count:  # at least one sample reached this node
        new_node.label = max(label_count, key=label_count.get)

    # Stop with a leaf when there are no samples, when every sample belongs to
    # the same class, or when no candidate attribute is left to split on.
    if len(label_count) <= 1 or len(df.columns[1:-1]) == 0:
        return new_node

    # Pick the attribute with the smallest Gini index. div_value == 0 is the
    # sentinel for a discrete attribute, otherwise it is the split threshold.
    new_node.attr, div_value = OptAttr_Gini(df)

    if div_value == 0:
        # Discrete attribute: one branch per observed value. The attribute is
        # removed from the subset so it is not selected again further down,
        # which also guarantees that the recursion terminates.
        for value in ValueCount(df[new_node.attr]):
            df_v = df[df[new_node.attr].isin([value])]
            df_v = df_v.drop(new_node.attr, axis=1)
            new_node.attr_down[value] = TreeGenerate(df_v)
    else:
        # Continuous attribute: binary split at the threshold.
        value_l = "<=%.3f" % div_value
        value_r = ">%.3f" % div_value
        df_v_l = df[df[new_node.attr] <= div_value]  # left child
        df_v_r = df[df[new_node.attr] > div_value]   # right child
        if len(df_v_l.index) == 0 or len(df_v_r.index) == 0:
            # The threshold does not separate the samples; recursing on an
            # unchanged subset would never end, so keep this node as a leaf.
            new_node.attr = None
            return new_node
        new_node.attr_down[value_l] = TreeGenerate(df_v_l)
        new_node.attr_down[value_r] = TreeGenerate(df_v_r)

    return new_node

#统计样本包含的类别和每个分类的个数，输入参数是分类标签序列，输出序列中包含的类别和各类别总数
def NodeLabel(label_arr):
    """Count how often each class label occurs.

    Args:
        label_arr: iterable of class labels.

    Returns:
        dict mapping each distinct label to its number of occurrences, in
        order of first appearance.
    """
    tally = {}
    for item in label_arr:
        tally[item] = tally.get(item, 0) + 1
    return tally

#寻找最优划分属性，输入参数为数据集，输出参数为属性opt_attr和划分取值div_value，div_value对离散变量取值为0，对连续变量取实际值
def OptAttr_Gini(df):
    """Find the split attribute with the smallest Gini index.

    Candidate attributes are ``df.columns[1:-1]``: the first column and the
    last (class-label) column are never considered.

    Args:
        df: pandas DataFrame of samples.

    Returns:
        tuple ``(opt_attr, div_value)``: the chosen attribute name and its
        split value as reported by ``GiniIndex`` (0 for a discrete attribute,
        the threshold for a continuous one). On ties the first attribute wins.

    Raises:
        ValueError: if no candidate attribute yields a finite Gini index, e.g.
            because ``df`` has no attribute columns.
    """
    best_gini = float('Inf')
    opt_attr = None
    div_value = 0
    for attr_id in df.columns[1:-1]:
        gini_index_tmp, div_value_tmp = GiniIndex(df, attr_id)
        if gini_index_tmp < best_gini:
            # Update the running minimum; without this every attribute would
            # compare against infinity and the last one would always win.
            best_gini = gini_index_tmp
            opt_attr = attr_id
            div_value = div_value_tmp

    if opt_attr is None:
        raise ValueError("no candidate attribute available to split on")
    return opt_attr, div_value

#计算基尼指数，输入参数为数据集、属性值，输出参数为基尼指数和划分取值div_value，离散变量取0连续变量取实际值
def GiniIndex(df, attr_id):
    """Compute the Gini index of splitting ``df`` on attribute ``attr_id``.

    Args:
        df: pandas DataFrame whose last column is the class label.
        attr_id: name of the attribute column to evaluate.

    Returns:
        tuple ``(gini_index, div_value)``. For a discrete attribute
        ``div_value`` is 0; for a continuous (float) attribute it is the
        threshold, taken from the midpoints of adjacent sorted values, that
        gives the smallest Gini index.

    NOTE(review): 0 doubles as the "discrete" sentinel, so a continuous split
    whose best threshold is exactly 0 cannot be told apart by callers.
    """
    # The original `dtype == (float, int)` test depended on how NumPy coerces
    # a tuple into a dtype; an explicit float check is used instead.
    from pandas.api.types import is_float_dtype

    label_col = df.columns[-1]
    n = len(df[attr_id])  # number of samples

    if is_float_dtype(df[attr_id]):
        # Continuous attribute: sort by value and renumber the rows so that
        # positions 0..n-1 follow ascending attribute order.
        df = df.sort_values([attr_id], ascending=True).reset_index(drop=True)
        data_arr = df[attr_id].values
        label_arr = df[label_col]

        sub_gini = {}  # candidate threshold -> Gini index of that split
        for i in range(n - 1):
            div = (data_arr[i] + data_arr[i + 1]) / 2
            left = label_arr.iloc[:i + 1]
            # The right side runs to the end of the column; stopping at -1
            # would silently drop the last sample.
            right = label_arr.iloc[i + 1:]
            sub_gini[div] = ((i + 1) * Gini(left) / n) \
                + ((n - i - 1) * Gini(right) / n)
        div_value, gini_index = min(sub_gini.items(), key=lambda item: item[1])
        return gini_index, div_value

    # Discrete attribute: Gini of each value's subset, weighted by its share.
    gini_index = 0
    data_arr = df[attr_id]
    label_arr = df[label_col]
    for key, count in ValueCount(data_arr).items():
        gini_index += count * Gini(label_arr[data_arr == key]) / n
    return gini_index, 0

#计算基尼值，注意区别于基尼指数
def Gini(label_arr):
    """Return the Gini value (impurity) of a label sequence.

    Computed as 1 minus the sum of squared class proportions. This is the
    impurity of a single sample set, not the Gini index of a split. An empty
    sequence yields 1.

    Args:
        label_arr: sized iterable of class labels.
    """
    total = len(label_arr)
    impurity = 1
    for count in NodeLabel(label_arr).values():
        ratio = count / total
        impurity -= ratio * ratio  # subtract p_k ** 2
    return impurity

#根据输入参数属性值区分后，各分类的样本个数
def ValueCount(data_arr):
    """Count how many samples take each value of an attribute.

    Args:
        data_arr: iterable of attribute values.

    Returns:
        dict mapping each distinct value to its number of occurrences, in
        order of first appearance.
    """
    counts = {}
    for value in data_arr:
        counts.setdefault(value, 0)
        counts[value] += 1
    return counts

#根据根节点预测
def Predict(root, df_sample):
    """Predict the class label of one sample by walking the tree.

    Args:
        root: Node, root of the decision tree.
        df_sample: one-row pandas DataFrame holding the sample.

    Returns:
        The label of the node where the walk ends: a leaf, or the internal
        node at which the sample's discrete value has no matching branch.
    """
    import re  # used to recover the threshold from a branch key
    from pandas.api.types import is_float_dtype

    while root.attr is not None:
        if is_float_dtype(df_sample[root.attr]):
            # Continuous attribute: the branch keys are "<=%.3f" and ">%.3f"
            # of the same threshold, so parse it from the first key. The
            # optional sign keeps negative thresholds intact.
            first_key = next(iter(root.attr_down))
            div_value = float(re.findall(r"-?\d+\.?\d*", first_key)[0])
            if df_sample[root.attr].values[0] <= div_value:
                key = "<=%.3f" % div_value
            else:
                key = ">%.3f" % div_value
            root = root.attr_down[key]
        else:
            # Discrete attribute: follow the branch named after the value.
            key = df_sample[root.attr].values[0]
            if key not in root.attr_down:
                # Value has no branch under this node: stop here and answer
                # with this node's label.
                break
            root = root.attr_down[key]

    return root.label

#计算验证集精度
def PredictAccuracy(root, df_test):
    """Return the fraction of rows in ``df_test`` that the tree classifies correctly.

    Args:
        root: Node, root of the decision tree.
        df_test: pandas DataFrame of samples; the last column holds the true
            class label.

    Returns:
        The accuracy as hits / number of rows, or 0 when ``df_test`` has no rows.
    """
    sample_total = len(df_test.index)
    if sample_total == 0:
        return 0

    truth = df_test[df_test.columns[-1]]
    # Each row is passed to Predict as a one-row DataFrame selected by index label.
    hits = sum(
        1
        for idx in df_test.index
        if Predict(root, df_test[df_test.index == idx]) == truth[idx]
    )
    return hits / sample_total

#预剪枝，输入训练集和验证集，输出剪枝后根节点
def PrePurn(df_train, df_test):
    """Build a decision tree with pre-pruning applied at the root split.

    The root is first evaluated as a leaf on ``df_test``. The best split is
    then tried with one majority-labelled leaf per branch, and it is kept only
    if it strictly improves the accuracy on ``df_test``.

    NOTE(review): once the split is accepted, the subtrees are grown with
    TreeGenerate, i.e. without any further pre-pruning below the root.

    Args:
        df_train: pandas DataFrame of training samples; last column is the
            class label.
        df_test: pandas DataFrame used to measure accuracy for the pruning
            decision.

    Returns:
        Node: root of the resulting tree.
    """
    new_node = Node(None, None, {})
    label_arr = df_train[df_train.columns[-1]]

    label_count = NodeLabel(label_arr)
    if label_count:  # at least one training sample
        new_node.label = max(label_count, key=label_count.get)

    # Leaf when there are no samples, a single class, or no attribute left.
    if len(label_count) <= 1 or len(df_train.columns[1:-1]) == 0:
        return new_node

    # Accuracy when this node is kept as a leaf.
    a0 = PredictAccuracy(new_node, df_test)

    # Best split attribute by Gini index; div_value == 0 means discrete.
    new_node.attr, div_value = OptAttr_Gini(df_train)

    # Training subset for every branch of the candidate split.
    subsets = {}
    if div_value == 0:
        for value in ValueCount(df_train[new_node.attr]):
            df_v = df_train[df_train[new_node.attr].isin([value])]
            subsets[value] = df_v.drop(new_node.attr, axis=1)
    else:
        subsets["<=%.3f" % div_value] = df_train[df_train[new_node.attr] <= div_value]
        subsets[">%.3f" % div_value] = df_train[df_train[new_node.attr] > div_value]

    # Tentative one-level split: each child is a leaf labelled with the
    # majority class of ITS OWN subset (falling back to the parent's label
    # when the subset is empty).
    for key, df_v in subsets.items():
        child_count = NodeLabel(df_v[df_v.columns[-1]])
        if child_count:
            child_label = max(child_count, key=child_count.get)
        else:
            child_label = new_node.label
        new_node.attr_down[key] = Node(None, child_label, {})

    # Keep the split only if it improves the accuracy on df_test.
    a1 = PredictAccuracy(new_node, df_test)
    if a1 > a0:
        for key, df_v in subsets.items():
            new_node.attr_down[key] = TreeGenerate(df_v)
    else:
        new_node.attr = None
        new_node.attr_down = {}

    return new_node

#后剪枝
def PostPurn(root, df_test):
    """Post-prune the tree rooted at ``root`` in place, bottom-up.

    A subtree is collapsed into a leaf (``attr`` reset to ``None`` and
    ``attr_down`` emptied) when the leaf is at least as accurate on
    ``df_test`` as the already-pruned subtree.

    NOTE(review): test samples are routed by their raw attribute value, but
    continuous splits use the keys "<=%.3f" / ">%.3f", so children of a
    continuous split are not descended into; such a node is only evaluated as
    a whole.

    Args:
        root: Node, root of the (sub)tree to prune.
        df_test: pandas DataFrame with the samples that reach ``root``; used
            for the pruning decision.

    Returns:
        The accuracy of ``root`` on ``df_test`` when it is a leaf after this
        call, or -1 when the subtree was kept.
    """
    # Leaf node: nothing to prune, report its accuracy.
    if root.attr is None:
        return PredictAccuracy(root, df_test)

    # Weighted accuracy of the children on their share of the test samples.
    a1 = 0
    subtree_kept = False
    for value in ValueCount(df_test[root.attr]):
        df_test_v = df_test[df_test[root.attr].isin([value])]
        if value in root.attr_down:  # root has a branch for this value
            a1_v = PostPurn(root.attr_down[value], df_test_v)
        else:  # no branch for this value
            a1_v = PredictAccuracy(root, df_test_v)
        if a1_v == -1:
            # This child kept its subtree, so this node cannot be pruned
            # either. Keep iterating so the remaining siblings still get
            # their own chance to be pruned.
            subtree_kept = True
        else:
            a1 += a1_v * len(df_test_v.index) / len(df_test.index)

    if subtree_kept:
        return -1

    # Accuracy if this node were replaced by a leaf carrying its label.
    node = Node(None, root.label, {})
    a0 = PredictAccuracy(node, df_test)

    # Prune when the leaf is at least as good as the subtree.
    if a0 >= a1:
        root.attr = None
        root.attr_down = {}
        return a0
    return -1

def DrawPNG(root, out_file):
    """Render the decision tree rooted at ``root`` to a PNG file.

    Args:
        root: Node, the root node of the tree.
        out_file: str, name and path of the output file.

    Raises:
        ImportError: if the ``pydotplus`` package cannot be imported.
    """
    try:
        from pydotplus import graphviz
    except ImportError as err:
        # Fail with a clear message instead of continuing with a missing
        # (or unrelated) `graphviz` name.
        raise ImportError("DrawPNG requires the 'pydotplus' package") from err

    g = graphviz.Dot()  # new, empty dot graph

    TreeToGraph(0, g, root)
    g2 = graphviz.graph_from_dot_data(g.to_string())

    g2.write_png(out_file)

def TreeToGraph(i, g, root):
'''
build a graph from root on
@param i: node number in this tree
@param g: pydotplus.graphviz.Dot() object
@param root: the root node

@return i: node number after modified
#     @return g: pydotplus.graphviz.Dot() object after modified
@return g_node: the current root node in graphviz
'''
try:
from pydotplus import graphviz    #pydotplus和graphviz都要安装
except ImportError:

if root.attr == None:
g_node_label = "Node:%d\n好瓜:%s" % (i, root.label)
else:
g_node_label = "Node:%d\n好瓜:%s\n属性:%s" % (i, root.label, root.attr)
g_node = i