决策树 ID3算法 python实现 不用现成库

决策树 id3 学校作业,数据集不多,没划分测试训练,结果看最后

数据集及描述

import numpy as np
import pandas as pd
import numpy.random
import time
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from scipy.io import arff
filepath='caesarian.csv.arff'   
data = arff.loadarff(filepath)
df = pd.DataFrame(data[0],dtype='int')
df
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\numpy\core\numeric.py:2378: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  return bool(asarray(a1 == a2).all())
AgeDelivery numberDelivery timeBlood of PressureHeart ProblemCaesarian
02210200
12620101
22621100
32810200
42220101
.....................
752721100
763340101
772921201
782512001
792422100

80 rows × 6 columns

grid = sns.FacetGrid(df, size=2.2, aspect=1.6)
grid.map(sns.lineplot, 'Age', 'Caesarian', palette='deep')
grid.add_legend()
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)





<seaborn.axisgrid.FacetGrid at 0x21bf4553c08>

在这里插入图片描述

sns.countplot(x = 'Age', hue = "Caesarian", data = df)
plt.show()

在这里插入图片描述

df['Age'].max()-df['Age'].min()
23
df['Age'].max()
40
df['Age'].min()
17
df['Age'].loc[df['Age']>=32]=3
df['Age'].loc[df['Age']>=23]=2
df['Age'].loc[df['Age']>16]=1
grid = sns.FacetGrid(df, size=2.2, aspect=1.6)
grid.map(sns.lineplot, 'Age', 'Caesarian', palette='deep')
grid.add_legend()
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\seaborn\axisgrid.py:316: UserWarning: The `size` parameter has been renamed to `height`; please update your code.
  warnings.warn(msg, UserWarning)





<seaborn.axisgrid.FacetGrid at 0x21bf482ce88>

在这里插入图片描述

年龄改成分层,数据量太少,年龄种类太多,分层能保证每个末端能多一点

Mdata=df.values
x=Mdata[:,0:5]
Y=Mdata[:,5:6]
len_of_feature_count = []
for i in df.columns.tolist():
    print(i, ':', len(df[i].astype(str).value_counts()))
    len_of_feature_count.append(len(df[i].astype(str).value_counts()))
print("over")
Age : 3
Delivery number : 4
Delivery time : 3
Blood of Pressure : 3
Heart Problem : 2
Caesarian : 2
over
df
AgeDelivery numberDelivery timeBlood of PressureHeart ProblemCaesarian
0110200
1220101
2221100
3210200
4120101
.....................
75221100
76340101
77221201
78212001
79222100

80 rows × 6 columns

def hd(y):
    ty=(y==1).sum()
    fy=(y==0).sum()
    ay=len(y)
    a1=ty/ay
    a2=fy/ay
    if a1 != 0 and a2 !=0 and ay != 0:
        res=-a1*math.log(a1,2)-a2*math.log(a2,2)
    elif a2 !=0 and ay != 0:
        res=-a2*math.log(a2,2)
    elif a1 !=0 and ay != 0:
        res=-a1*math.log(a1,2)
    else:
        res=0
    return res
Uncertainty=hd(Y)
Uncertainty
0.9837082626231857
def gda(i,mdf):
    nl=len_of_feature_count[i]
    name=mdf.columns.tolist()[i]
    UTS=[]
    for t in list(set(mdf[name].values)):
        temp=mdf[mdf[name]==t]
        temp2=temp.values
        temp_num=len(temp)/len(Y)
        uncertainty=hd(temp2[:,-1])#小写  
        UTS.append(uncertainty*temp_num)
    
    return Uncertainty-sum(UTS)
        
RES=[]
for nn in range(5):
    res=gda(nn,df)
    RES.append(res)
    print(df.columns.tolist()[nn]+':'+str(res))
Age:0.4696775060793045
Delivery number:0.44683156475569896
Delivery time:0.4346659154931095
Blood of Pressure:0.46832970111430006
Heart Problem:0.49499326582091163
df.columns.tolist()[RES.index(max(RES))]
'Heart Problem'

从这里开始,上面都是些预处理和函数测试

in_put=df
Mytree={'name':'MDS','name':{}}

Count=0
def tree(temp_data,last_tree,last_name):
    global Count
    
    #if len(temp_data.columns.tolist())==1:  原版
    if len(temp_data.columns.tolist())==1 or hd(temp_data.values[:,-1])<0.3:
        Count+=1
        if temp_data['Caesarian'].mean()>=0.5:
            last_tree[last_name] ='true'
        else:
            last_tree[last_name] ='false'
        return
    RES=[]
    for nn in range(len(temp_data.columns.tolist())-1):
        res=gda(nn,df)
        #print(temp_data.columns.tolist()[nn]+':'+str(res))
        RES.append(res)
    Best_name=temp_data.columns.tolist()[RES.index(max(RES))]
    temp=list(set(temp_data[Best_name].values))
    temp.append('name')
    temp.append('last')
    mytree=dict.fromkeys(temp,0)
    mytree['name'] =Best_name
    mytree['last'] =last_name
    last_tree[last_name] =mytree
    for nnn in list(set(df[Best_name].values)):
        kid_data1=temp_data.loc[temp_data[Best_name]==nnn]
        kid_data=kid_data1.drop([Best_name],axis=1)
        tree(kid_data,mytree,nnn)
tree(df,Mytree,'name')
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:5: RuntimeWarning: invalid value encountered in long_scalars
  """
C:\Users\20535\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in long_scalars
Count#叶子节点数量
72
Mytree
{'name': {0: {1: {1: {0: {0: 'true',
      1: 'true',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 0},
     2: {2: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      0: 'false',
      1: 'false'},
     'name': 'Delivery time',
     'last': 1,
     1: 'false'},
    2: 'true',
    'name': 'Delivery number',
    'last': 1,
    3: 'false',
    4: 'false'},
   2: {1: {0: {0: 'true',
      1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 0},
     1: {0: 'false',
      1: 'false',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 1},
     2: {0: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      1: 'false',
      2: 'false'},
     'name': 'Delivery time',
     'last': 1},
    2: {0: {1: 'false',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false',
      2: 'false'},
     1: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1,
      0: 'false'},
     2: {0: 'false',
      1: 'false',
      'name': 'Blood of Pressure',
      'last': 2,
      2: 'false'},
     'name': 'Delivery time',
     'last': 2},
    3: {0: 'true', 2: 'false', 'name': 'Delivery time', 'last': 3, 1: 'false'},
    'name': 'Delivery number',
    'last': 2,
    4: 'false'},
   3: {1: {0: 'false',
     1: {0: 'true',
      1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1},
     'name': 'Delivery time',
     'last': 1,
     2: 'false'},
    2: 'true',
    3: 'true',
    4: 'true',
    'name': 'Delivery number',
    'last': 3},
   'name': 'Age',
   'last': 0},
  1: {1: {1: {0: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false'},
     1: 'true',
     'name': 'Delivery time',
     'last': 1,
     2: 'false'},
    2: 'true',
    'name': 'Delivery number',
    'last': 1,
    3: 'false',
    4: 'false'},
   2: {1: {0: 'true',
     2: {0: 'true',
      2: 'false',
      'name': 'Blood of Pressure',
      'last': 2,
      1: 'false'},
     'name': 'Delivery time',
     'last': 1,
     1: 'false'},
    2: {0: {0: 'true',
      1: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      2: 'false'},
     1: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 1,
      0: 'false'},
     'name': 'Delivery time',
     'last': 2,
     2: 'false'},
    3: 'true',
    'name': 'Delivery number',
    'last': 2,
    4: 'false'},
   3: {1: 'true',
    2: 'true',
    3: {0: {1: 'true',
      'name': 'Blood of Pressure',
      'last': 0,
      0: 'false',
      2: 'false'},
     1: 'true',
     2: {1: 'false',
      2: 'true',
      'name': 'Blood of Pressure',
      'last': 2,
      0: 'false'},
     'name': 'Delivery time',
     'last': 3},
    4: 'true',
    'name': 'Delivery number',
    'last': 3},
   'name': 'Age',
   'last': 1},
  'name': 'Heart Problem',
  'last': 'name'}}
label={'Age':0,'Delivery number':1,'Delivery time':2,'Blood of Pressure':3,'Heart Problem':4}
Input=[2,2,0,0,1]
user=Mytree['name']

def Result(input,user):
    while True:
        cx=user['name']
        user=user[input[label[cx]]]
        if user==('true'):
            return 1
        elif user==('false'):
            return 0
        else:
            cx=user['name']
Result(Input,Mytree['name'])
1
test=df.values
xtest=test[:,0:5]
ytest=test[:,5:6]

score=0
for i in range(len(test)):
    xin=xtest[i]
    yin=ytest[i]
    a=(Result(xin,Mytree['name']))
    b=(yin[0])
    if a==b:
        score+=1
print(score/len(test))
0.8625
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值