1、CART算法中Gini系数离散值与连续值的计算方法:
# DecisionTree-main
import itertools
import copy
import re
def createDataSet():
    """Load the Immunotherapy data file and decode its numeric codes.

    Returns:
        (DataSet, FeatureName): DataSet is a list of rows; columns 0 (sex),
        4 (wart type) and 7 (treatment outcome, the class label) are mapped
        from numeric codes to Chinese labels; the remaining columns stay as
        raw strings.
    """
    FeatureName = ['性别','年龄','滋生时间','个数','种类','面积','硬化直径']
    path = 'data/Immuotherapy.txt'
    DataSet = []
    # with-statement guarantees the file handle is closed
    # (the original opened the file and never closed it).
    with open(path, 'r') as f:
        for cells in f.readlines():
            cells = cells.split('\t')
            if cells[0] == '1':
                cells[0] = '男'
            else:
                cells[0] = '女'
            if cells[4] == '1':
                cells[4] = '常规'
            elif cells[4] == '2':
                cells[4] = '脚底'
            else:
                cells[4] = '二者'
            # last field still carries the trailing newline, hence '1\n'
            if cells[7] == '1\n':
                cells[7] = '成'
            else:
                cells[7] = '败'
            DataSet.append(cells)
    return DataSet, FeatureName
#计算集合的非空子集,便于将属性值二分类(待优化:会有顺序不同的重复集合,不知道怎么去重)
def subSet(items):
    """Return the non-empty proper subsets of *items*, one per binary split.

    Each subset/complement pair represents the same two-way partition, so only
    one representative of each pair is kept.  The original implementation
    compared ``list(set(...))`` values, whose element order is nondeterministic,
    so duplicates could survive (the author's comment acknowledged this), and
    its ``range(len(a) - 1)`` loop never checked the last candidate.  Using
    frozensets makes the de-duplication order-independent and reliable.

    Args:
        items: sequence of distinct attribute values.
    Returns:
        list of lists, one representative subset per distinct binary split.
    """
    universe = frozenset(items)
    kept = []
    seen = set()  # frozensets already chosen as a representative
    for size in range(1, len(items)):
        for combo in itertools.combinations(items, size):
            fs = frozenset(combo)
            if fs in seen or (universe - fs) in seen:
                continue  # this partition is already represented
            seen.add(fs)
            kept.append(list(combo))
    return kept
#当属性被用作划分属性后删除该属性及其列
def splitDataSet(dataSet, value, axis):
    """Select the rows whose column *axis* equals *value*, dropping that column.

    Args:
        dataSet: list of rows.
        value: the attribute value a row must carry to be selected.
        axis: index of the attribute column being split on.
    Returns:
        A new list of rows; each selected row is copied with column *axis*
        removed (a spent split attribute is never reused further down the tree).
    """
    result = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # concatenate the columns before and after *axis*
        result.append(row[:axis] + row[axis + 1:])
    return result
def calGini(dataSet, axis):
    """Weighted Gini index of a binary split on column *axis*.

    The column must already be binarized to 0/1 (callers relabel values
    before calling).  The class label is the last column: '成' (success)
    or '败' (failure); rows with any other side value or label are skipped.

    Returns:
        The size-weighted average of the two sides' Gini impurities.
    """
    success = {0: 0, 1: 0}  # '成' count per side
    failure = {0: 0, 1: 0}  # '败' count per side
    for row in dataSet:
        side = row[axis]
        if side not in (0, 1):
            continue
        if row[-1] == '成':
            success[side] += 1
        elif row[-1] == '败':
            failure[side] += 1
    left = success[0] + failure[0]
    right = success[1] + failure[1]
    gini_left = 1 - (success[0] / left) ** 2 - (failure[0] / left) ** 2
    gini_right = 1 - (success[1] / right) ** 2 - (failure[1] / right) ** 2
    total = left + right
    return left / total * gini_left + right / total * gini_right
def chooseBest(dataSet, featureName):
    """Pick the attribute and binary split with the lowest Gini index.

    Continuous attributes (all values parse as float) are split at midpoints
    between adjacent sorted values; discrete attributes are split into every
    distinct two-way value partition from subSet().

    Fixes over the original:
      * continuous candidate values are sorted numerically — the original
        sorted them as strings (e.g. '100' < '22'), so the "adjacent pair"
        midpoints were computed on the wrong neighbors;
      * calGini is computed once per candidate instead of twice;
      * the continuous/discrete decision is an explicit float-parse check
        instead of a bare except wrapped around the whole loop, which also
        swallowed unrelated errors;
      * the redundant `elif gini >= best: continue` branches are removed.

    Returns:
        (bestFeature, l, r, ldata, rdata): best attribute index, the left and
        right branch labels, and the two child data sets (with the spent
        attribute column removed).
    """
    featureNum = len(featureName)
    bestFeature, l, r, ldata, rdata = 0, "", "", [], []
    bestGini = 1
    for axis in range(featureNum):
        values = sorted(set(line[axis] for line in dataSet))
        try:
            numeric = sorted(float(v) for v in values)
            continuous = True
        except ValueError:
            continuous = False
        if continuous:
            for i in range(len(numeric) - 1):
                # deep copy: the column is relabeled 0/1 in place per candidate
                temp = copy.deepcopy(dataSet)
                mid = (numeric[i] + numeric[i + 1]) / 2
                for line in temp:
                    line[axis] = 0 if float(line[axis]) < mid else 1
                gini = calGini(temp, axis)
                if gini < bestGini:
                    bestGini = gini
                    bestFeature = axis
                    l = '<' + str(mid)
                    r = '>' + str(mid)
                    ldata = splitDataSet(temp, 0, axis)
                    rdata = splitDataSet(temp, 1, axis)
        else:
            for sub1 in subSet(values):
                temp = copy.deepcopy(dataSet)
                for line in temp:
                    # 0 = value in the first subset, 1 = in its complement
                    line[axis] = 0 if line[axis] in sub1 else 1
                gini = calGini(temp, axis)
                if gini < bestGini:
                    bestGini = gini
                    bestFeature = axis
                    l = ",".join(sub1)
                    r = ",".join(list(set(values) - set(sub1)))
                    ldata = splitDataSet(temp, 0, axis)
                    rdata = splitDataSet(temp, 1, axis)
    return bestFeature, l, r, ldata, rdata
def majorityCnt(classList):
    """Return the most frequent label in *classList* ({} when the list is empty).

    BUG FIX: the original returned ``max(label)`` — the alphabetically
    largest KEY of the count dict — instead of the label with the highest
    count.  ``max(counts, key=counts.get)`` picks by frequency.
    """
    counts = {}
    for label in classList:
        counts[label] = counts.get(label, 0) + 1
    if not counts:
        return {}
    return max(counts, key=counts.get)
def createTree(dataSet, featureName):
    """Recursively grow a CART decision tree as nested dicts.

    Tree shape: {attribute_name: {left_branch_label: subtree_or_label,
    right_branch_label: subtree_or_label}}.  Leaves are class labels.
    NOTE: *featureName* is mutated (the chosen attribute is deleted), so
    callers must pass a list they can afford to lose.
    """
    classList = [line[-1] for line in dataSet]
    # Stop 1: the node is pure — return the single remaining class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop 2: no attribute left to split on — return the majority class.
    if len(featureName) == 1:
        return majorityCnt(classList)
    # PERF FIX: call chooseBest ONCE and unpack; the original invoked it
    # five times — once per tuple element — quintupling the search cost.
    bestFeature, l, r, ldata, rdata = chooseBest(dataSet, featureName)
    # Stop 3: the best split leaves one side empty — return the majority class.
    if ldata == [] or rdata == []:
        return majorityCnt(classList)
    bestFeatureName = featureName[bestFeature]
    myTree = {bestFeatureName: {}}
    # the chosen attribute's column was removed from ldata/rdata, so drop
    # its name before recursing
    del featureName[bestFeature]
    subfeaName = featureName[:]
    myTree[bestFeatureName][l] = createTree(dataSet=ldata, featureName=subfeaName)
    myTree[bestFeatureName][r] = createTree(dataSet=rdata, featureName=subfeaName)
    return myTree
# 对测试集数据用决策树划分类别
def classify(inputTree, featName, testVec):
    """Route *testVec* down the decision tree and return its predicted label.

    Numeric nodes have branch labels like '<7.5' / '>7.5' (the '<' branch is
    inserted first by createTree); discrete nodes have comma-joined value
    lists like '常规,脚底'.

    Fixes over the original:
      * the threshold regex is ``\\d+\\.\\d+`` — the original ``\\d+.\\d+``
        left the dot unescaped, so it was a wildcard;
      * discrete matching uses exact membership in the comma-split label list
        instead of substring ``find``, which could match one value inside
        another;
      * the numeric/discrete decision tests the regex result directly instead
        of routing through a bare ``except``.
    """
    firstStr = list(inputTree.keys())[0]     # attribute tested at this node
    secondDict = inputTree[firstStr]         # its branch dict
    featIndex = featName.index(firstStr)
    key = testVec[featIndex]                 # the sample's value for it
    branches = list(secondDict.keys())
    nums = re.findall(r'\d+\.\d+', branches[0])
    if nums:
        # numeric node: branches[0] is '<mid', branches[1] is '>mid'
        threshold = float(nums[0])
        chosen = branches[0] if float(key) < threshold else branches[1]
        valueOfFeat = secondDict[chosen]
    else:
        valueOfFeat = ''
        for branch in branches:
            if key in branch.split(','):
                valueOfFeat = secondDict[branch]
                break
    if isinstance(valueOfFeat, dict):        # internal node: recurse
        return classify(valueOfFeat, featName, testVec)
    return valueOfFeat                       # leaf: predicted class label
DataSet, FeatureName = createDataSet()
# 80% train / 20% test: every 5th row (stride 5 from index 0) is held out.
test_Data = DataSet[0:-1:5]
del DataSet[0:-1:5]
myTree = createTree(dataSet=DataSet, featureName=FeatureName)
print(myTree)
# createTree consumed entries of FeatureName, so rebuild it for scoring.
FeatureName = ['性别','年龄','滋生时间','个数','种类','面积','硬化直径']
correct = 0       # true positives (counted on the prediction side)
prd_correct = 0   # all positive predictions -> precision denominator
recall = 0        # true positives (counted on the actual side)
act_recall = 0    # all actual positives -> recall denominator
for line in test_Data:
    # classify once per row (the original called it twice per row)
    prediction = classify(myTree, FeatureName, line)
    if line[-1] == '成':
        act_recall += 1
        if prediction == '成':
            recall += 1
    if prediction == '成':
        prd_correct += 1
        if line[-1] == '成':
            correct += 1
p = correct / prd_correct   # precision
q = recall / act_recall     # recall
print(correct, prd_correct, recall, act_recall)
print("准确率为:" + str(p))
print("召回率为:" + str(q))
print("调和平均:" + str(p*q*2/(p+q)))
def getNumLeafs(myTree):
    """Count the leaf nodes (class labels) of a nested-dict decision tree.

    Args:
        myTree: tree of the form {attribute: {branch_label: subtree_or_leaf}}.
    Returns:
        The number of non-dict values reachable from the root.
    """
    root = list(myTree.keys())[0]
    total = 0
    for child in myTree[root].values():
        # a dict child is an internal node; anything else is a leaf
        total += getNumLeafs(child) if type(child) is dict else 1
    return total
# Report the learned tree's size as its number of leaf nodes.
print(getNumLeafs(myTree))
2、数据集:Immuotherapy.txt
1 22 2.25 14 3 51 50 1
1 15 3 2 3 900 70 1
1 16 10.5 2 1 100 25 1
1 27 4.5 9 3 80 30 1
1 20 8 6 1 45 8 1
1 15 5 3 3 84 7 1
1 35 9.75 2 2 8 6 1
2 28 7.5 4 1 9 2 1
2 19 6 2 1 225 8 1
2 32 12 6 3 35 5 0
2 33 6.25 2 1 30 3 1
2 17 5.75 12 3 25 7 1
2 15 1.75 1 2 49 7 0
2 15 5.5 12 1 48 7 1
2 16 10 7 1 143 6 1
2 33 9.25 2 2 150 8 1
2 26 7.75 6 2 6 5 1
2 23 7.5 10 2 43 3 1
2 15 6.5 19 1 56 7 1
2 26 6.75 2 1 6 6 1
1 22 1.25 3 3 47 3 1
2 19 2.25 2 1 60 7 1
2 26 10.5 6 1 50 9 0
1 25 5.75 2 1 300 7 1
2 17 11.25 4 3 70 7 1
1 27 5 2 1 20 5 1
2 24 4.75 10 3 30 45 1
1 15 11 6 1 30 25 0
2 34 11.5 12 1 25 50 0
2 20 7.75 18 3 45 2 1
2 38 2.5 1 3 43 50 1
1 23 3 2 3 87 70 1
2 48 10.25 7 1 50 25 1
2 24 4.25 1 1 174 30 1
2 33 8 3 1 502 8 1
1 34 5 7 3 64 7 0
2 41 11 11 2 21 6 0
1 29 8.75 3 1 504 2 1
2 22 8.5 5 1 99 8 1
1 45 11.25 4 1 72 5 0
2 22 8.25 9 1 352 3 1
1 35 8.75 10 2 69 7 1
2 34 8.5 1 2 163 7 0
1 49 4.5 2 1 33 7 0
2 19 11 5 2 51 6 1
1 21 8 3 1 17 8 1
1 26 7.75 13 2 13 5 1
1 51 8.75 2 2 57 3 1
1 19 7.75 6 1 32 7 1
1 38 12 14 1 87 6 0
2 36 1.75 10 3 45 3 1
2 52 2.25 5 1 63 7 1
2 49 9 4 2 14 9 1
1 23 5.75 2 1 43 7 1
1 45 10 8 1 58 7 1
1 54 7.5 13 3 43 5 1
2 47 5.25 3 3 23 45 1
2 53 10 1 2 30 25 1
2 56 11.75 7 1 31 50 0
1 27 11.25 3 2 37 2 1
2 47 3.75 14 2 67 50 1
2 19 2.25 8 2 42 70 1
2 33 8 5 1 63 25 1
2 15 4 12 1 72 30 1
1 17 8.5 2 1 44 8 1
1 29 5 12 3 75 7 1
1 27 11.75 8 1 208 6 0
2 51 6 6 1 80 2 1
1 35 6.75 4 3 41 8 1
2 47 10.75 8 1 57 5 0
1 43 8 1 1 59 3 1
1 15 4 4 3 25 7 1
1 33 1.75 7 2 379 7 0
2 51 4 1 1 65 7 1
1 45 6.5 9 2 49 6 1
2 47 9.25 13 2 367 8 1
1 18 11.75 5 2 13 5 1
2 46 7.75 8 1 40 3 1
1 43 11 7 1 507 7 1
2 28 11 3 3 91 6 0
1 30 1 2 1 88 3 1
2 16 2 11 1 47 7 1
2 42 8.75 8 2 73 9 0
2 15 8 1 1 55 7 1
2 53 7.25 6 1 81 7 1
1 40 5.5 8 3 69 5 1
1 38 7.5 8 2 56 45 1
1 46 11.5 4 1 91 25 0
1 32 12 9 1 43 50 0
2 23 6.75 6 1 19 2 1