Missing values are about the most troublesome thing to handle in a decision tree, so that is the part I'm posting here; I won't publish the simpler pieces.
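Before the code: the quantity it computes is the missing-value-aware information gain used by C4.5 (the same formulation as in Zhou Zhihua's watermelon book). The notation below is mine, added for this writeup, not the original code's:

$$
\mathrm{Gain}(D,a)=\rho\cdot\Big(\mathrm{Ent}(\tilde D)-\sum_{v=1}^{V}\tilde r_v\,\mathrm{Ent}(\tilde D^v)\Big),
\qquad
\mathrm{Ent}(\tilde D)=-\sum_{k}\tilde p_k\log_2\tilde p_k
$$

where $\tilde D$ is the subset on which attribute $a$ is not missing, $\rho$ is its weighted share of $D$, $\tilde r_v$ is the weighted share of $\tilde D$ taking value $v$, and $\tilde p_k$ is the weighted share of class $k$. At the first split every sample weight is 1, so all three reduce to plain count ratios.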
```python
# encoding: utf-8
from __future__ import division
__author__ = 'HP'
import copy
import math
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
################################
# ID3
# discrete attributes
# multi-class
# nested dicts to record the learned rules
# non-recursive
# depth-first
# pre-pruning
### missing-value handling
# two questions to answer:
#   how to choose the split attribute when some values are missing
#   how to partition the samples whose value of that attribute is missing
################################
''' Missing-value handling
1. Choosing the split attribute
   a. At the first split every sample has weight 1. Take the subset where the
      attribute is not missing, compute that subset's information gain and its
      share of the node; the product of the two is the actual gain.
      - the share here is a plain count ratio, because all weights are 1
      - the probabilities P inside the gain are plain counts as well
   b. At later splits the weights are no longer all 1. Again multiply the
      non-missing subset's gain by its share of the node, but now
      - the share is a weight sum, not a count
      - the probabilities P inside the gain are weight sums too
2. Partitioning the samples
   a. Non-missing samples are partitioned as usual, with weight 1
   b. Missing samples go into every child, with weight equal to that child's
      share of the non-missing samples rather than 1
'''
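# A quick numeric example of rule 2b (mine, not from the original post):
# with 17 samples, 2 of them missing the split attribute, a child that
# holds 6 of the 15 present samples also receives both missing samples,
# each with weight 6/15 = 0.4, giving the child a total weight of 6.8.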
def mydata():
    data = pd.read_csv('xg3.txt', index_col=[0], encoding='gbk')
    # strip stray whitespace from the label column
    data.iloc[:, -1] = data.iloc[:, -1].str.strip()
    # print(data)
    # print(pd.get_dummies(data[[0]]))
    data.columns = range(9)
    # print(data)
    encode_str = LabelEncoder()
    str_cols = [0, 1, 2, 3, 4, 5, 8]
    for i in str_cols:
        # LabelEncoder expects a 1-d array, so index a single column
        data[i] = encode_str.fit_transform(data[i])
    return data.values
def get_label(labels):
    # majority label of a node
    count_label = Counter(labels)
    key = None
    max_count = 0
    for label, count in count_label.items():
        if count > max_count:
            max_count = count
            key = label
    return key
def entropy(attr):
    # information entropy (plain counts, i.e. all weights equal to 1)
    attr_values_count = Counter(attr)
    attr_len = len(attr)
    ent = 0
    for i in attr_values_count.values():
        ent += -1 * i / attr_len * math.log(i / attr_len, 2)
    return ent
def gain_queshi_equal_weight(attr, label):
    # gain for an attribute with missing values; used for the first split,
    # where every sample still has weight 1
    index_nonan = np.where(attr >= 0)  # NaN compares False, so this keeps the non-missing rows
    # attribute values and labels of the non-missing samples
    attr_new = attr[index_nonan]
    label_new = label[index_nonan]
    # number of non-missing samples
    count_nonan = label_new.shape[0]
    # share of samples that are non-missing (rho)
    zhanbi = attr_new.shape[0] / attr.shape[0]
    # entropy of the non-missing subset before the split
    ori_entropy = entropy(label_new)
    # entropy of the non-missing subset after the split
    new_entropy = 0
    for key, count in Counter(attr_new).items():
        # share of value `key` among the non-missing samples * entropy of its subset
        new_entropy += count / count_nonan * entropy(label_new[np.where(attr_new == key)])
    # gain, discounted by the non-missing share
    gain = zhanbi * (ori_entropy - new_entropy)
    return gain
def split_node_queshi(node, attr_split):
    # split a node on an attribute that has missing values
    index_nan = np.isnan(node[:, attr_split])
    index_nonan = np.where(node[:, attr_split] >= 0)
    # samples whose value of the split attribute is present
    node_new = node[index_nonan]
    # samples whose value is missing
    sample_queshi = node[index_nan]
    # number of non-missing samples
    count_nonan = node_new.shape[0]
    ### partition the node: each entry is [attribute value, sample subset, weights]
    split = []
    for key, node_child in pd.DataFrame(node_new).groupby(attr_split):
        # share of value `key` among the non-missing samples
        zhanbi_key = round(len(node_child) / count_nonan, 3)
        # non-missing samples keep weight 1
        weight = [1] * len(node_child)
        # every child also receives the missing-value samples ...
        node_child = np.vstack((node_child.values, sample_queshi))
        # ... carrying the child's share as their weight
        weight.extend([zhanbi_key] * len(sample_queshi))
        split.append([key, node_child, np.array(weight)])
    return split
def entropy_no_equal_weight(attr, weight):
    # entropy when samples carry unequal weights: probabilities are weight sums
    ent = 0
    sum_weight = np.sum(weight)
    for key in Counter(attr).keys():
        index = np.where(attr == key)
        zhanbi = np.sum(weight[index]) / sum_weight
        ent += -1 * zhanbi * math.log(zhanbi, 2)
    return ent
def gain_queshi_no_equal_weight(attr, weight, label):
    # gain for an attribute with missing values when sample weights are not
    # all 1; used for every split after the first
    index_nonan = np.where(attr >= 0)
    # attribute values / labels / weights of the non-missing samples
    attr_new = attr[index_nonan]
    label_new = label[index_nonan]
    weight_new = weight[index_nonan]
    # weighted share of the non-missing samples (rho)
    zhanbi = np.sum(weight_new) / np.sum(weight)
    ### gain on the non-missing subset
    # entropy before the split
    ori_entropy = entropy_no_equal_weight(label_new, weight_new)
    # entropy after the split
    new_entropy = 0
    for key in Counter(attr_new).keys():
        index_key = np.where(attr_new == key)
        label_key = label_new[index_key]
        weight_key = weight_new[index_key]
        # the branch share is a weight sum too, not a count
        new_entropy += np.sum(weight_key) / np.sum(weight_new) * entropy_no_equal_weight(label_key, weight_key)
    # gain, discounted by the weighted non-missing share
    gain = zhanbi * (ori_entropy - new_entropy)
    return gain
if __name__ == '__main__':
    data = mydata()
    # keep only the discrete columns (plus the label)
    data = data[:, [0, 1, 2, 3, 4, 5, 8]]
    # punch a few holes to simulate missing values
    data[0, 0] = None
    data[4, 0] = None
    data[12, 0] = None
    data[7, 3] = None
    data[9, 3] = None
    print(data)
    # gain with missing values, equal weights (skip the label column itself)
    for i in range(data.shape[1] - 1):
        print(gain_queshi_equal_weight(data[:, i], data[:, -1]))
    # splitting the samples on an attribute with missing values
    split = split_node_queshi(data, 3)
    print(split)
    # gain with missing values, unequal weights
    # weight = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1/3, 1/3])
    # gain_queshi_no_equal_weight(data[:, 0], weight, data[:, -1])
    # e.g. color (色泽, column 0) inside the third child produced above
    gain = gain_queshi_no_equal_weight(split[2][1][:, 0], split[2][2], split[2][1][:, -1])
    print(gain)
```
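As a quick sanity check (my toy example, not from the original post), an attribute that predicts the label perfectly but is missing on one of five samples should score $\rho \cdot 1 = 0.8$ bits instead of a full 1. With `gain_queshi_equal_weight` from the listing above:

```python
import numpy as np

# attribute 0/1 predicts the label exactly, but sample 4 is missing it
attr = np.array([0., 0., 1., 1., np.nan])
label = np.array([0., 0., 1., 1., 0.])

# rho = 4/5 and the gain on the non-missing subset is 1 bit,
# so the discounted gain comes out as 0.8
print(gain_queshi_equal_weight(attr, label))  # -> 0.8
```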