包含了离散变量与连续变量
def calculate_ent(Dataset):
    """Return the base-2 Shannon entropy of the class labels in ``Dataset``.

    Each row's class label is read from its last element. An empty
    ``Dataset`` has no labels and yields ``0``.
    """
    sample_count = len(Dataset)

    # Tally how many rows carry each class label.
    tally = {}
    for row in Dataset:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over the observed classes.
    result = 0
    for occurrences in tally.values():
        prob = float(occurrences) / sample_count
        result -= prob * log(prob, 2)
    return result
# 得到某特征取值为某值的子集
def GetSubset(Dataset, feature, value):
    """Select the rows whose column ``feature`` equals ``value``.

    The matching column is removed from every selected row: the parts
    before and after it are joined with ``np.hstack``. Returns a list of
    the resulting arrays (empty when no row matches).
    """
    return [
        np.hstack((row[:feature], row[feature + 1:]))
        for row in Dataset
        if row[feature] == value
    ]
# 连续情形获取子集
def GetSubset_continue(Dataset, feature, value):
    """Partition ``Dataset`` on a continuous feature at threshold ``value``.

    Returns ``(left, right)``: ``left`` holds the rows whose column
    ``feature`` is ``<= value`` and ``right`` the rows where it is
    ``> value``. Rows are kept whole (the column is not removed). A row
    satisfying neither comparison (e.g. NaN) lands in neither list.
    """
    at_or_below, above = [], []
    for row in Dataset:
        cell = row[feature]
        if cell <= value:
            at_or_below.append(row)
            continue
        if cell > value:
            above.append(row)
    return at_or_below, above
# 计算信息增益
def calculate_gain(Dataset, feature, continue_label=0):
    """Compute the information gain of splitting ``Dataset`` on ``feature``.

    Args:
        Dataset: sequence of rows; the class label is each row's last element.
        feature: column index of the feature to evaluate.
        continue_label: ``0`` to treat the feature as discrete (one branch
            per distinct value), ``1`` to treat it as continuous (binary
            split at the best threshold proposed by ``half_apart``).

    Returns:
        A ``(gain, t)`` tuple. For a discrete feature ``t`` is always ``0``.
        For a continuous feature ``t`` is the threshold that achieved the
        largest gain (the first one on ties); if there are no candidate
        thresholds the result is ``(-inf, 0)``.

    Raises:
        ValueError: if ``continue_label`` is neither ``0`` nor ``1``.
    """
    total = len(Dataset)
    # The entropy of the unsplit data is the same for every candidate
    # split, so compute it once.
    base_entropy = calculate_ent(Dataset)

    if continue_label == 0:
        gain = base_entropy
        for value in {row[feature] for row in Dataset}:
            Subset = GetSubset(Dataset, feature, value)
            gain -= (len(Subset) / total) * calculate_ent(Subset)
        return gain, 0

    if continue_label == 1:
        candidates = half_apart(sorted(row[feature] for row in Dataset))
        # float("-inf") instead of np.Inf: that alias no longer exists in
        # NumPy 2.0.
        best_gain = float("-inf")
        best_threshold = 0
        for value in candidates:
            Subset_left, Subset_right = GetSubset_continue(Dataset, feature, value)
            candidate_gain = (
                base_entropy
                - len(Subset_left) / total * calculate_ent(Subset_left)
                - len(Subset_right) / total * calculate_ent(Subset_right)
            )
            # Strict comparison keeps the first threshold among ties.
            if best_gain < candidate_gain:
                best_gain = candidate_gain
                best_threshold = value
        return best_gain, best_threshold

    raise ValueError(
        "continue_label must be 0 (discrete) or 1 (continuous), got %r"
        % (continue_label,)
    )
# 选择连续变量可能的分点
def half_apart(T):
    """Return candidate split points for a continuous feature.

    Args:
        T: the feature's values, already sorted in ascending order.

    Returns:
        A 1-D float array holding the midpoint of every pair of
        neighbouring values, i.e. ``len(T) - 1`` candidates. Fewer than two
        values give an empty array.

    The input is left untouched. Only midpoints between real neighbours are
    produced; no artificial ``0`` is paired with the smallest value, since
    the resulting ``T[0] / 2`` candidate is not a boundary between two
    samples.
    """
    values = np.asarray(T, dtype=float)
    if values.size < 2:
        return np.empty(0, dtype=float)
    return (values[:-1] + values[1:]) / 2.0
# 选择最优特征
def ChooseBestfeature(Dataset, Featurelabel):
    """Return the index of the feature with the highest information gain.

    Args:
        Dataset: sequence of rows; the class label is each row's last element.
        Featurelabel: per-feature type flags passed on to ``calculate_gain``
            (``0`` discrete, ``1`` continuous); entry ``i`` describes
            column ``i``.

    Returns:
        The index of the best feature. When several features tie for the
        highest gain, the last one wins.

    Raises:
        ValueError: if no feature could be selected (``Featurelabel`` is
            empty or no gain was comparable).
    """
    # Start below any real gain so that every feature is judged with its
    # own type flag, rather than seeding the search with a gain computed
    # for a fixed column index using another column's flag.
    BestGain = float("-inf")
    BestFeature = None
    for i, label in enumerate(Featurelabel):
        Gain, _ = calculate_gain(Dataset, i, label)
        if Gain >= BestGain:
            BestFeature = i
            BestGain = Gain
    if BestFeature is None:
        raise ValueError("ChooseBestfeature() found no feature to choose from")
    return BestFeature
# 投票表决
import operator
def MostChoice(Dataset):
    """Return the most frequent class label in ``Dataset`` (majority vote).

    The class label of each row is its last element. When several labels
    share the highest count, the one that appears first in ``Dataset`` wins.

    Raises:
        IndexError: if ``Dataset`` is empty.
    """
    from collections import Counter

    # most_common(1) yields [(label, count)] for the top label, or [] when
    # there are no rows, in which case the indexing below raises IndexError.
    return Counter(row[-1] for row in Dataset).most_common(1)[0][0]
# 可划分性
def Severability(Dataset):
    """Tell whether the samples can still be separated by their features.

    The last column (the class label) is ignored. The data is considered
    separable when at least one row differs from the first row in some
    feature value. If every row has identical feature values (or there is
    only one row), no split can tell the samples apart and the result is
    ``False``.

    A single pair of duplicate rows among otherwise distinct rows does NOT
    make the data inseparable.

    Args:
        Dataset: 2-D array-like of rows whose last column is the class label.

    Returns:
        ``True`` if some row differs from the others on the feature
        columns, ``False`` otherwise.
    """
    data = np.delete(Dataset, -1, axis=1)
    # Comparing every row with the first one is enough: all rows are
    # identical exactly when none of them differs from row 0.
    return bool(np.any(data[1:] != data[0]))
# Recursively build the decision tree
def CreatTree(DataSet, Featurelabel, Feature_names):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        DataSet: sequence of rows; the class label is each row's last element.
        Featurelabel: per-feature type flags (``0`` discrete, ``1``
            continuous), aligned with the feature columns of ``DataSet``.
        Feature_names: feature names, aligned with ``Featurelabel``.

    Returns:
        A class label when the node is a leaf; otherwise a dict of the form
        ``{feature_name: {branch_key: subtree}}``. For a discrete feature
        the branch keys are the feature's values. For a continuous feature
        they are the strings ``'<=' + str(t)`` and ``'>' + str(t)``, where
        ``t`` is the chosen threshold.

    Raises:
        IndexError: if ``DataSet`` is empty.

    ``Featurelabel`` and ``Feature_names`` are never modified; the function
    works on copies when it has to drop a consumed discrete feature.
    """
    LabelList = [FeatureVector[-1] for FeatureVector in DataSet]
    # Only one class left: this node is a leaf.
    if LabelList.count(LabelList[0]) == len(LabelList):
        return LabelList[0]
    # The samples cannot be told apart, or no feature column is left:
    # fall back to a majority vote.
    if not Severability(DataSet) or len(DataSet[0]) == 1:
        return MostChoice(DataSet)

    # Pick the feature to split on.
    bestFeature = ChooseBestfeature(DataSet, Featurelabel)
    bestFeaturename = Feature_names[bestFeature]
    bestFeaturelabel = Featurelabel[bestFeature]
    MyTree = {bestFeaturename: {}}

    if bestFeaturelabel == 0:
        # A discrete feature is consumed by the split: GetSubset removes
        # its column, so drop it from copies of the bookkeeping lists and
        # leave the caller's lists intact.
        SubLabels = list(Featurelabel)
        Subnames = list(Feature_names)
        del SubLabels[bestFeature]
        del Subnames[bestFeature]
        bestFeaturevalue = set(FeatureVector[bestFeature] for FeatureVector in DataSet)
        # One branch per observed value of the feature.
        for value in bestFeaturevalue:
            Subset = GetSubset(DataSet, bestFeature, value)
            MyTree[bestFeaturename][value] = CreatTree(Subset, SubLabels, Subnames)
    elif bestFeaturelabel == 1:
        # A continuous feature stays available for later splits; re-run the
        # gain computation to obtain the threshold.
        _, t = calculate_gain(DataSet, bestFeature, continue_label=1)
        Subsetleft, Subsetright = GetSubset_continue(DataSet, bestFeature, t)
        # A split that leaves one side empty separates nothing; recursing
        # would hit an empty data set, so end the branch with a majority
        # vote instead.
        if len(Subsetleft) == 0 or len(Subsetright) == 0:
            return MostChoice(DataSet)
        branches = (('<=' + str(t), Subsetleft), ('>' + str(t), Subsetright))
        for value, Subset in branches:
            MyTree[bestFeaturename][value] = CreatTree(
                Subset, list(Featurelabel), list(Feature_names)
            )
    return MyTree