Decision Tree Learning Notes

1. Computing the Gini index in CART for discrete and continuous attribute values:

Gini index computation workflow
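For a node D with class proportions p_k, the impurity is Gini(D) = 1 - Σ_k p_k²; a binary split of D into D1 and D2 is scored by the weighted sum |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2), and CART keeps the split with the smallest score. A minimal standalone check of the impurity formula, with toy counts not taken from the dataset below:

# Toy check of the Gini impurity formula: a node with 6 positive and 4 negative samples.
pos, neg = 6, 4
total = pos + neg
gini = 1 - (pos / total) ** 2 - (neg / total) ** 2
print(gini)  # ≈ 0.48, i.e. 1 - 0.6^2 - 0.4^2, up to float rounding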

# DecisionTree-main
import itertools
import copy
import re
def createDataSet():
    # Feature names: sex, age, elapsed time, number of warts, type, area, induration diameter
    FeatureName = ['性别','年龄','滋生时间','个数','种类','面积','硬化直径']
    path = 'data/Immuotherapy.txt'
    DataSet = []
    with open(path, 'r') as f:  # close the file automatically
        for line in f.readlines():
            cells = line.strip().split('\t')  # strip the trailing newline before splitting
            # Recode sex: 1 -> male, 2 -> female
            if cells[0] == '1':
                cells[0] = '男'
            else:
                cells[0] = '女'
            # Recode wart type: 1 -> common, 2 -> plantar, 3 -> both
            if cells[4] == '1':
                cells[4] = '常规'
            elif cells[4] == '2':
                cells[4] = '脚底'
            else:
                cells[4] = '二者'
            # Recode treatment result: 1 -> success, 0 -> failure
            if cells[7] == '1':
                cells[7] = '成'
            else:
                cells[7] = '败'
            DataSet.append(cells)
    return DataSet, FeatureName
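As a concrete check of the recoding, the first record of the data file (1 22 2.25 14 3 51 50 1, listed in the dataset section below) comes back like this:

# Standalone check: sex 1 -> '男', type 3 -> '二者', result 1 -> '成'
DataSet, FeatureName = createDataSet()
print(DataSet[0])  # ['男', '22', '2.25', '14', '二者', '51', '50', '成']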

# Enumerate the non-empty proper subsets of a value set so attribute values can be
# split two ways; frozenset bookkeeping keeps exactly one subset from each
# complementary pair, so no binary split is generated twice.
def subSet(items):
    kept = []
    seen = set()
    full = frozenset(items)
    for i in range(1, len(items)):
        for combo in itertools.combinations(items, i):
            if full - frozenset(combo) not in seen:  # complement not kept yet
                seen.add(frozenset(combo))
                kept.append(list(combo))
    return kept
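A quick standalone check on the three wart-type values; each returned subset stands for one binary split, with its complement forming the other branch:

print(subSet(['常规', '脚底', '二者']))
# [['常规'], ['脚底'], ['二者']], i.e. 2^(3-1) - 1 = 3 distinct binary splits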

# After an attribute is used for a split, drop that attribute's column
def splitDataSet(dataSet, value, axis):  # (dataset, attribute value, index of the attribute)
    subDataSet = []
    for line in dataSet:
        if line[axis] == value:
            # Remove the value in this column
            reduceFeatVec = line[:axis]  # keep all columns before it
            reduceFeatVec.extend(line[axis+1:])  # keep all columns after it
            subDataSet.append(reduceFeatVec)
    return subDataSet  # rows matching the value, with the split attribute's column removed
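A standalone example with a made-up two-row set, splitting on column 0 after it has been recoded to 0/1:

demo = [[0, '15', '成'], [1, '33', '败']]
print(splitDataSet(demo, 0, 0))  # [['15', '成']]: the matching row, minus the split column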

def calGini(dataSet, axis):
    # By the time this is called, chooseBest has recoded column `axis` to 0/1:
    # 0 marks the left branch of the candidate binary split and 1 the right branch.
    a, b, a1, a2, b1, b2 = 0, 0, 0, 0, 0, 0
    for line in dataSet:
        if line[axis] == 0:
            if line[-1] == '成':
                a1 += 1
            elif line[-1] == '败':
                a2 += 1
        elif line[axis] == 1:
            if line[-1] == '成':
                b1 += 1
            elif line[-1] == '败':
                b2 += 1
    a = a1 + a2  # size of the left branch
    b = b1 + b2  # size of the right branch
    # Both branches are non-empty because split candidates come from observed values.
    Gini_a = 1 - pow(a1/a, 2) - pow(a2/a, 2)  # impurity of the left branch
    Gini_b = 1 - pow(b1/b, 2) - pow(b2/b, 2)  # impurity of the right branch
    Gini = a/(a+b) * Gini_a + b/(a+b) * Gini_b  # size-weighted impurity of the split
    return Gini
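A standalone check of calGini against a hand computation, on a made-up recoded set where branch 0 holds two '成' and one '败' and branch 1 holds one of each:

demo = [[0, '成'], [0, '成'], [0, '败'], [1, '成'], [1, '败']]
print(calGini(demo, 0))  # 3/5 * (1 - (2/3)^2 - (1/3)^2) + 2/5 * (1 - 0.5^2 - 0.5^2) ≈ 0.4667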

def chooseBest(dataSet, featureName):
    featureNum = len(featureName)  # number of attributes
    bestFeature, l, r, ldata, rdata = 0, "", "", [], []
    Gini = 1
    for axis in range(featureNum):  # try every attribute
        values = sorted(set([line[axis] for line in dataSet]))  # distinct attribute values
        try:  # continuous attribute: numeric sort succeeds
            values = sorted(values, key=float)  # sort numerically; a plain string sort would put '8' after '15'
            for i in range(len(values)-1):  # each midpoint of adjacent values is a candidate threshold
                tempDataSet = copy.deepcopy(dataSet)  # work on a copy, since the column gets recoded to 0/1
                mid = (float(values[i]) + float(values[i+1])) / 2
                for line in tempDataSet:
                    if float(line[axis]) < mid:  # below the threshold -> left branch, recoded as 0
                        line[axis] = 0
                    else:  # at or above the threshold -> right branch, recoded as 1
                        line[axis] = 1
                g = calGini(tempDataSet, axis)  # score the candidate split once
                if g < Gini:
                    Gini = g  # smallest Gini seen so far
                    bestFeature = axis  # index of the best splitting attribute so far
                    l = '<' + str(mid)
                    r = '>' + str(mid)
                    ldata = splitDataSet(tempDataSet, 0, axis)
                    rdata = splitDataSet(tempDataSet, 1, axis)
        except ValueError:  # discrete attribute: float() fails on its string values
            for sub1 in subSet(values):  # each subset defines one binary split of the value set
                tempDataSet = copy.deepcopy(dataSet)
                for line in tempDataSet:  # recode the column to 0/1 so calGini can score it
                    if line[axis] in sub1:  # value in the first subset -> 0
                        line[axis] = 0
                    else:  # value in the complement subset -> 1
                        line[axis] = 1
                g = calGini(tempDataSet, axis)
                if g < Gini:
                    Gini = g
                    bestFeature = axis
                    l = ",".join(sub1)  # attribute values routed to the left subtree
                    r = ",".join(list(set(values) - set(sub1)))  # values routed to the right subtree
                    ldata = splitDataSet(tempDataSet, 0, axis)
                    rdata = splitDataSet(tempDataSet, 1, axis)
    return bestFeature, l, r, ldata, rdata
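For a continuous column, the candidate thresholds enumerated above are just the midpoints of adjacent sorted values; a standalone illustration with made-up ages:

vals = sorted([15.0, 20.0, 22.0])
print([(vals[i] + vals[i+1]) / 2 for i in range(len(vals) - 1)])  # [17.5, 21.0]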

def majorityCnt(classList):
    label = {}
    for x in classList:
        label[x] = label.get(x, 0) + 1  # count each class
    if label == {}:
        return {}
    return max(label, key=label.get)  # the class with the highest count, not the largest key
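For reference, the same majority vote can be written with the standard library; a minimal alternative sketch (majorityCnt_alt is a hypothetical name, not used elsewhere in this script):

from collections import Counter

def majorityCnt_alt(classList):
    # most_common(1) returns [(class, count)] for the most frequent class
    return Counter(classList).most_common(1)[0][0] if classList else {}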

def createTree(dataSet, featureName):
    classList = [line[-1] for line in dataSet]  # the class column
    # Stopping condition 1: only one class left -> return that class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stopping condition 2: no attribute left to split on -> return the majority class
    if len(featureName) == 1:
        return majorityCnt(classList)
    # Best split: attribute index, branch labels, and the two child datasets
    bestFeature, l, r, ldata, rdata = chooseBest(dataSet, featureName)
    # Stopping condition 3: one child is empty -> return the majority class
    if ldata == [] or rdata == []:
        return majorityCnt(classList)
    bestFeatureName = featureName[bestFeature]  # name of the splitting attribute
    myTree = {bestFeatureName: {}}  # grow a subtree
    del (featureName[bestFeature])
    subfeaName = featureName[:]
    # Each child gets its own copy: the recursive calls delete from the list they
    # receive, so sharing one list would corrupt the sibling's feature names.
    myTree[bestFeatureName][l] = createTree(dataSet=ldata, featureName=subfeaName[:])
    myTree[bestFeatureName][r] = createTree(dataSet=rdata, featureName=subfeaName[:])
    return myTree
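The returned tree is a nested dict: an attribute name at the top, branch labels one level down ('<mid'/'>mid' for continuous splits, comma-joined value lists for discrete ones), and either a class string or another subtree as the value. A made-up illustration of the shape; the actual keys depend on the training run:

# Hypothetical shape, not actual output:
# {'面积': {'<47.5': '成',
#           '>47.5': {'种类': {'常规,脚底': '成', '二者': '败'}}}}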

# Classify test samples with the decision tree
def classify(inputTree, featName, testVec):
    firstStr = list(inputTree.keys())[0]  # attribute name at the root of the current subtree
    secondDict = inputTree[firstStr]  # children of the root
    featIndex = featName.index(firstStr)  # column index of that attribute
    key = testVec[featIndex]  # the test sample's value for that attribute
    valueOfFeat = ''
    try:
        flag = 0
        for i in secondDict.keys():
            aa = float(re.findall(r'\d+\.\d+', i)[0])  # threshold in a '<mid'/'>mid' label; raises IndexError on discrete labels
            if (flag == 0 and float(key) < aa) or flag == 1:  # below the threshold -> first branch; otherwise the second pass takes the '>' branch
                valueOfFeat = secondDict[i]
                break
            else:  # above the threshold: take the next branch on the following pass
                flag += 1
    except (IndexError, ValueError):  # discrete attribute: match the value against the branch label
        for i in secondDict.keys():
            if i.find(key) != -1:  # the label lists the values it covers, e.g. '常规,脚底'
                valueOfFeat = secondDict[i]
                break
    if isinstance(valueOfFeat, dict):  # internal node: recurse into the subtree
        classLabel = classify(valueOfFeat, featName, testVec)
    else:
        classLabel = valueOfFeat  # leaf node: the predicted class
    return classLabel


DataSet, FeatureName = createDataSet()

# Train on ~80% of the data
test_Data = DataSet[0:-1:5]  # every 5th record (up to, not including, the last) is held out for testing
del (DataSet[0:-1:5])  # the remaining ~80% is the training set
myTree = createTree(dataSet=DataSet, featureName=FeatureName)
print(myTree)

# Test on the remaining ~20%
FeatureName = ['性别','年龄','滋生时间','个数','种类','面积','硬化直径']  # createTree mutated FeatureName, so restore the full list
correct = 0      # true positives among samples predicted '成'
prd_correct = 0  # number of samples predicted '成'
recall = 0       # true positives among samples actually '成'
act_recall = 0   # number of samples actually '成'
for line in test_Data:
    pred = classify(myTree, FeatureName, line)  # predict once per sample
    if line[-1] == '成':
        if pred == '成':
            recall += 1
        act_recall += 1
    if pred == '成':
        if line[-1] == '成':
            correct += 1
        prd_correct += 1
p = correct/prd_correct  # precision
q = recall/act_recall    # recall
print(correct, prd_correct, recall, act_recall)
print("Precision: " + str(p))
print("Recall: " + str(q))
print("F1 (harmonic mean): " + str(p*q*2/(p+q)))


# Count the leaf nodes of the tree
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):  # internal node: recurse
            numLeafs += getNumLeafs(secondDict[key])
        else:  # leaf node
            numLeafs += 1
    return numLeafs

print(getNumLeafs(myTree))
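Alongside the leaf count, a depth counter in the same recursive style is handy for eyeballing the tree; a minimal sketch (getTreeDepth is an addition, not in the original code):

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if isinstance(secondDict[key], dict):  # internal node: count this level and recurse
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:  # leaf node
            thisDepth = 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth

print(getTreeDepth(myTree))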

2. Dataset: Immuotherapy.txt
(Tab-separated columns, matching the recoding in createDataSet: sex 1=male/2=female, age, elapsed time, number of warts, type 1/2/3, area, induration diameter, treatment result 1=success/0=failure.)

1	22	2.25	14	3	51	50	1
1	15	3	2	3	900	70	1
1	16	10.5	2	1	100	25	1
1	27	4.5	9	3	80	30	1
1	20	8	6	1	45	8	1
1	15	5	3	3	84	7	1
1	35	9.75	2	2	8	6	1
2	28	7.5	4	1	9	2	1
2	19	6	2	1	225	8	1
2	32	12	6	3	35	5	0
2	33	6.25	2	1	30	3	1
2	17	5.75	12	3	25	7	1
2	15	1.75	1	2	49	7	0
2	15	5.5	12	1	48	7	1
2	16	10	7	1	143	6	1
2	33	9.25	2	2	150	8	1
2	26	7.75	6	2	6	5	1
2	23	7.5	10	2	43	3	1
2	15	6.5	19	1	56	7	1
2	26	6.75	2	1	6	6	1
1	22	1.25	3	3	47	3	1
2	19	2.25	2	1	60	7	1
2	26	10.5	6	1	50	9	0
1	25	5.75	2	1	300	7	1
2	17	11.25	4	3	70	7	1
1	27	5	2	1	20	5	1
2	24	4.75	10	3	30	45	1
1	15	11	6	1	30	25	0
2	34	11.5	12	1	25	50	0
2	20	7.75	18	3	45	2	1
2	38	2.5	1	3	43	50	1
1	23	3	2	3	87	70	1
2	48	10.25	7	1	50	25	1
2	24	4.25	1	1	174	30	1
2	33	8	3	1	502	8	1
1	34	5	7	3	64	7	0
2	41	11	11	2	21	6	0
1	29	8.75	3	1	504	2	1
2	22	8.5	5	1	99	8	1
1	45	11.25	4	1	72	5	0
2	22	8.25	9	1	352	3	1
1	35	8.75	10	2	69	7	1
2	34	8.5	1	2	163	7	0
1	49	4.5	2	1	33	7	0
2	19	11	5	2	51	6	1
1	21	8	3	1	17	8	1
1	26	7.75	13	2	13	5	1
1	51	8.75	2	2	57	3	1
1	19	7.75	6	1	32	7	1
1	38	12	14	1	87	6	0
2	36	1.75	10	3	45	3	1
2	52	2.25	5	1	63	7	1
2	49	9	4	2	14	9	1
1	23	5.75	2	1	43	7	1
1	45	10	8	1	58	7	1
1	54	7.5	13	3	43	5	1
2	47	5.25	3	3	23	45	1
2	53	10	1	2	30	25	1
2	56	11.75	7	1	31	50	0
1	27	11.25	3	2	37	2	1
2	47	3.75	14	2	67	50	1
2	19	2.25	8	2	42	70	1
2	33	8	5	1	63	25	1
2	15	4	12	1	72	30	1
1	17	8.5	2	1	44	8	1
1	29	5	12	3	75	7	1
1	27	11.75	8	1	208	6	0
2	51	6	6	1	80	2	1
1	35	6.75	4	3	41	8	1
2	47	10.75	8	1	57	5	0
1	43	8	1	1	59	3	1
1	15	4	4	3	25	7	1
1	33	1.75	7	2	379	7	0
2	51	4	1	1	65	7	1
1	45	6.5	9	2	49	6	1
2	47	9.25	13	2	367	8	1
1	18	11.75	5	2	13	5	1
2	46	7.75	8	1	40	3	1
1	43	11	7	1	507	7	1
2	28	11	3	3	91	6	0
1	30	1	2	1	88	3	1
2	16	2	11	1	47	7	1
2	42	8.75	8	2	73	9	0
2	15	8	1	1	55	7	1
2	53	7.25	6	1	81	7	1
1	40	5.5	8	3	69	5	1
1	38	7.5	8	2	56	45	1
1	46	11.5	4	1	91	25	0
1	32	12	9	1	43	50	0
2	23	6.75	6	1	19	2	1

