This code can be run directly: the dataset is breast_cancer.csv under the GitHub link; just put the code and the dataset in the same directory.
Or: Baidu Cloud link, extraction code: eu8q
A detailed analysis and further polish will have to wait until next Sunday; the complexity feels really high...
import numpy as np
import copy
class Node(object):
def __init__(self):
self.left = None
self.right = None
self.parent = None
        self.items = []  # filled as [sample indices, corresponding labels]
self.feature = None
self.feature_value = None
    @property
    def predict(self):
        # predict by majority vote over the labels stored at this node
        maxCount = 0
for i in np.unique(self.items[1]):
if self.items[1].count(i) > maxCount:
maxCount = self.items[1].count(i)
maxPredict = i
return maxPredict
def __str__(self):
        if self.left is None and self.right is None:
            return "size:%d predict:%s"%(len(self.items[0]),str(self.predict))
else:
return "feature:%s feature_value:%s"%(self.feature,self.feature_value)
    def get_leafEntropy(self):
        # despite the name, this computes the Gini index of the labels at this
        # node: Gini = 1 - sum_k p_k^2, where p_k is the frequency of class k
        g = 1
n = len(self.items[1])
p = {}
for item in self.items[1]:
p.setdefault(item,0)
p[item] += 1
for v in p.values():
g -= (v / n) ** 2
# print(self.items)
# print(p)
return g
def get_leaf_num(self):
if self.left is not None and self.right is not None:
return self.right.get_leaf_num() + self.left.get_leaf_num()
else:
return 1
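# A quick sanity check for the Gini computation above (a minimal sketch with
# made-up labels, commented out like the test further below):
# leaf = Node()
# leaf.items = [[0, 1, 2, 3], ['a', 'a', 'b', 'b']]
# print(leaf.get_leafEntropy())   # 1 - (2/4)**2 - (2/4)**2 = 0.5, maximally impure
# leaf.items = [[0, 1, 2, 3], ['a', 'a', 'a', 'a']]
# print(leaf.get_leafEntropy())   # 1 - (4/4)**2 = 0.0, a pure leaf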
class Dtree(object):
def __init__(self):
self.root = Node()
    def __str__(self):
        # breadth-first dump of the tree; each line shows "parent level -> node level"
        queue = [(self.root, -1)]
        res = []
        while queue:
            node,prelevel = queue.pop(0)
            res.append("%d -> %d: %s"%(prelevel, prelevel + 1, str(node)))
            if node.left:
                queue.append((node.left, prelevel + 1))
            if node.right:
                queue.append((node.right, prelevel + 1))
        return "\n".join(res)
    def get_nodeEntropy(self,node):
        # C(T_t): Gini index of the subtree rooted at node, i.e. the weighted
        # Gini index over all of its leaves
        if node.left is None and node.right is None:
            return node.get_leafEntropy()
        ll = len(node.left.items[0])
        lr = len(node.right.items[0])
        return (ll * self.get_nodeEntropy(node.left) + lr * self.get_nodeEntropy(node.right)) / (ll + lr)
    def split(self,feature,feature_value,idx,X):
        # use a different split rule for each feature type: discrete features are
        # split on equality, continuous features on <= vs. >
        div = [[],[]]
# for i in idx:
# if X[i][feature] == feature_value:
# div[0].append(i)
# else:
# div[1].append(i)
# return div
for i in idx:
if X[i][feature] <= feature_value:
div[0].append(i)
else:
div[1].append(i)
return div
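    # Illustration of the continuous split rule above (hypothetical values): with
    # X = [[1.0], [2.0], [3.0]], feature 0 and threshold 2.0, split() returns
    # div = [[0, 1], [2]] -- indices with value <= 2.0 on the left, > 2.0 on the right.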
    def get_G(self,idx,X,y):
        # Gini index of the label distribution over the samples indexed by idx
        g = 1
n = len(idx)
p = {}
for i in idx:
p.setdefault(y[i], 0)
p[y[i]] += 1
for v in p.values():
g -= (v / n) ** 2
return g
def get_bestFeatureValue_forAFeature(self,X,y,idx,feature,best_feature,best_feature_value,minG):
feature_vs = np.unique([X[i][feature] for i in idx])
for feature_v in feature_vs:
div = self.split(feature,feature_v,idx,X)
ll = len(div[0])
lr = len(div[1])
curG = (ll * self.get_G(div[0],X,y) + lr * self.get_G(div[1],X,y)) / (ll + lr)
# print(feature,feature_v,curG)
if curG < minG:
minG = curG
best_feature = feature
best_feature_value = feature_v
return best_feature,best_feature_value,minG
    def get_bestFeatureAndValue(self,X,y,idx):
        best_feature = 0
        best_feature_value = X[idx[0]][best_feature]
        minG = 1
for feature in range(len(X[0])):
best_feature,best_feature_value, minG = self.get_bestFeatureValue_forAFeature(X,y,idx,feature,best_feature,best_feature_value,minG)
return best_feature,best_feature_value
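    # The criterion minimized above is the weighted Gini index of a candidate split:
    #   Gini(D, A) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2)
    # e.g. splitting labels [a, a, b, b] into [a, a] and [b, b] gives
    # (2/4)*0 + (2/4)*0 = 0, so a perfectly separating threshold always wins.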
    def create_Dtree(self,X,y):
        # build the tree breadth-first: each queue entry is (node, sample indices)
        queue = [(self.root,range(len(X)))]
while queue:
node,idx = queue.pop(0)
if len(np.unique([y[i] for i in idx])) == 1:
node.items = [copy.deepcopy(idx),[y[i] for i in idx]]
# node.predict = y[idx[0]]
continue
best_feature,best_feature_value = self.get_bestFeatureAndValue(X,y,idx)
print("bestFeature: %s, bestFeatureValue: %s"%(str(best_feature),str(best_feature_value)))
node.feature = best_feature
node.feature_value = best_feature_value
            node.items = [copy.deepcopy(idx),[y[i] for i in idx]]  # internal nodes keep their labels too, so pruning can compare the single-node tree against the subtree by Gini index
            div = self.split(best_feature,best_feature_value,idx,X)
            # if the best split cannot separate the samples (e.g. all remaining
            # rows are identical), keep this node as a leaf to avoid looping forever
            if not div[0] or not div[1]:
                continue
            node.left = Node()
            node.left.parent = node
            queue.append((node.left,div[0]))
            node.right = Node()
            node.right.parent = node
            queue.append((node.right,div[1]))
def predict(self,xi):
node = self.root
while node.left or node.right:
if xi[node.feature] <= node.feature_value:
node = node.left
else:
node = node.right
return node.predict
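    # Usage sketch (hypothetical call, assuming a tree built on some X, y):
    # t = Dtree(); t.create_Dtree(X, y)
    # t.predict(X[0])  # walks from the root to a leaf and returns its majority label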
    def get_min_gt(self):
        # find the internal node with the smallest g(t) = (C(t) - C(T_t)) / (|T_t| - 1),
        # see "统计学习方法" p.86; C(t) treats the node as a single leaf, C(T_t) is
        # the cost of the whole subtree below it
        minGt = None
        targetNode = None
        queue = [self.root]
        while queue:
            node = queue.pop(0)
            Ct = node.get_leafEntropy()
            CTt = self.get_nodeEntropy(node)
            leafnum = node.get_leaf_num()
            curGt = (Ct - CTt) / (leafnum - 1)
            if minGt is None or curGt < minGt:
                minGt = curGt
                targetNode = node
            if node.left.left and node.left.right:
                queue.append(node.left)
            if node.right.left and node.right.right:
                queue.append(node.right)
        return targetNode,minGt
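    # Worked example for g(t) = (C(t) - C(T_t)) / (|T_t| - 1) with made-up numbers:
    # if collapsing a subtree with 3 leaves would raise the weighted Gini index
    # from C(T_t) = 0.1 to C(t) = 0.3, then g(t) = (0.3 - 0.1) / (3 - 1) = 0.1;
    # the node with the smallest g(t) is the cheapest one to prune.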
    def merge_subTree(self,node):
        # collapse the subtree rooted at node into a single leaf; node.items is
        # kept, so the new leaf still predicts by majority vote
node.left = None
node.right = None
# Quick test with the loan-application sample from Chapter 5 of "统计学习方法".
# Note that these features are all discrete, so the split criterion in split()
# has to be changed to the equality test.
# def test_create():
# dt = Dtree()
# X = [
# ['young', 'no', 'no', 'normal'],
# ['young', 'no', 'no', 'good'],
# ['young', 'yes', 'no', 'good'],
# ['young', 'yes', 'yes', 'normal'],
# ['young', 'no', 'no', 'normal'],
#
# ['midage', 'no', 'no', 'normal'],
# ['midage', 'no', 'no', 'good'],
# ['midage', 'yes', 'yes', 'good'],
# ['midage', 'no', 'yes', 'verygood'],
# ['midage', 'no', 'yes', 'verygood'],
#
# ['old', 'no', 'yes', 'verygood'],
# ['old', 'no', 'yes', 'good'],
# ['old', 'yes', 'no', 'good'],
# ['old', 'yes', 'no', 'verygood'],
# ['old', 'no', 'no', 'normal'],
# ]
# y = ['no', 'no', 'yes', 'yes', 'no',
# 'no', 'no', 'yes', 'yes', 'yes',
# 'yes', 'yes', 'yes', 'yes', 'no']
# dt.create_Dtree(X,y)
# dt2 = copy.deepcopy(dt)
# print("dt:\n",dt)
# print("dt2:\n",dt2)
# # print(dt.get_C_multiNode(dt.root)) # when the classification is perfect, every node's Gini index is zero
# dt2.root.left.feature = 100
# print("dt:\n", dt)
# print("dt2:\n", dt2)
#
# print(dt.root.items)
#
# # print(dt.get_min_gt())
# test_create()
data = np.loadtxt('breast_cancer.csv',delimiter=',',dtype=float)
n = data.shape[0]
# Splitting the data sequentially like this gives an accuracy a few percentage
# points lower than the random split below.
# X_train = data[:(int) (0.6 * n), : -1]
# y_train = [int(x) for x in data[:(int) (0.6 * n), -1]]
#
# X_cv = data[(int) (0.6 * n) : (int)(0.8 * n), : -1]
# y_cv = data[(int) (0.6 * n) : (int)(0.8 * n), -1]
#
# X_test = data[(int) (0.8 * n) :, : -1]
# y_test = data[(int) (0.8 * n) :, -1]
from numpy.random import choice,seed
X = data[:,:-1]
y = data[:,-1]
train_size = int(0.6 * n)  # training set size
cv_size = int(0.2 * n)  # cross-validation set size
test_size = int(0.2 * n)  # test set size
train_rows = choice(range(n),size = train_size,replace=False)
X_train = [X[i] for i in train_rows]
y_train = [y[i] for i in train_rows]
train_set = set(train_rows)  # set lookups keep the filtering below linear-time
remains = [i for i in range(n) if i not in train_set]
cv_rows = choice(remains,size=cv_size,replace=False)
cv_set = set(cv_rows)
X_cv = [X[i] for i in cv_rows]
y_cv = [y[i] for i in cv_rows]
X_test = [X[i] for i in remains if i not in cv_set]
y_test = [y[i] for i in remains if i not in cv_set]
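# Sanity check on the 60/20/20 split; the test set absorbs any rounding
# remainder left over by int() truncation.
print("split sizes: ", len(X_train), len(X_cv), len(X_test))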
t1 = Dtree()
t1.create_Dtree(X_train,y_train)
queue = [t1]
bestTree = t1
maxAcc = 0
alpha = []  # records the g(t) value generated at each pruning step
while queue: # prune step by step and cross-validate each subtree
curTree = queue.pop(0)
predictTrue = 0
for i in range(len(X_cv)):
curPredict = curTree.predict(X_cv[i])
if curPredict == y_cv[i]:
predictTrue += 1
curAcc = predictTrue / len(y_cv)
print("交叉验证准确率: ",curAcc)
if curAcc > maxAcc:
bestTree = curTree
maxAcc = curAcc
if curTree.root.left and curTree.root.right:
nextTree = copy.deepcopy(curTree)
        bestNode,ai = nextTree.get_min_gt() # the tree with the highest cross-validation accuracy is kept as the final model
        print("alpha generated at this step: ",ai)
alpha.append(ai)
nextTree.merge_subTree(bestNode)
queue.append(nextTree)
predictTrue = 0
for i in range(len(X_test)):
curPredict = bestTree.predict(X_test[i])
if curPredict == y_test[i]:
predictTrue += 1
acc = predictTrue / len(y_test)
print("test accuracy: ",acc)
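# Optional inspection, using only names defined above: the sequence of alpha
# values produced by the pruning loop, and the structure of the chosen tree.
print("alpha sequence from pruning: ", alpha)
print("best tree:\n", bestTree)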