The implementation of the basic decision-tree algorithm is shown below (adapted from Machine Learning in Action by Peter Harrington).
import math
x=[[0,1,"no"],[0,1,"no"],[1,0,"no"],[1,1,"yes"],[1,1,"no"],[1,1,"no"],[1,1,"maybe"],[1,1,"maybe"],[1,1,"maybe"]]
# x=[[0,1,"no"],[0,1,"no"],[1,0,"no"],[1,1,"yes"],[1,1,"yes"],]
def majorityCnt(classList):
    # Return the label that occurs most often in classList.
    classCount = {}
    for label in classList:
        if label not in classCount:
            classCount[label] = 0
        classCount[label] += 1
    max_value = max(classCount.values())
    for label in classCount:
        if classCount[label] == max_value:
            return label
def splitDataset(dataset, BestFeature, feature_i_j):
    # Keep the examples whose BestFeature column equals feature_i_j,
    # and drop that column from each of them.
    new_set = []
    for example in dataset:
        if example[BestFeature] == feature_i_j:
            new_set.append(example[:BestFeature] + example[BestFeature + 1:])
    return new_set
def computeEntropy(subData, feature_i_j):
    # Shannon entropy of the class labels in subData
    # (the second argument is unused and kept only for the call sites).
    value_class_statistics = {}
    for sample in subData:
        if sample[-1] not in value_class_statistics:
            value_class_statistics[sample[-1]] = 0
        value_class_statistics[sample[-1]] += 1
    entropy = 0
    for count in value_class_statistics.values():
        prob = count / float(len(subData))
        entropy -= prob * math.log(prob, 2)
    return entropy
def chooseBestFeature(dataset):
    # ID3: choose the feature whose split gives the smallest weighted
    # average entropy, i.e. the largest information gain.
    info_entropy = []
    base_entropy = computeEntropy(dataset, 0)  # entropy before splitting (same for every feature)
    feat_len = len(dataset[0][:-1])
    for feature_i in range(feat_len):
        # distinct values of this feature across all examples
        feature_i_species = set(example[feature_i] for example in dataset)
        entropy = 0
        for feature_i_j in feature_i_species:
            subData = [example for example in dataset if example[feature_i] == feature_i_j]
            prob = float(len(subData)) / float(len(dataset))
            entropy += prob * computeEntropy(subData, feature_i_j)  # weighted average entropy
        info_entropy.append(entropy)
    return info_entropy.index(min(info_entropy))
def createTree(dataset, labels=None):
    classList = [example[-1] for example in dataset]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                  # all samples share one class
    if len(dataset[0]) == 1:
        return majorityCnt(classList)        # no features left to split on
    BestFeature = chooseBestFeature(dataset)
    feature_i_species = set(example[BestFeature] for example in dataset)
    DTree = {BestFeature: {}}                # the node key is the feature index
    for feature_i_j in feature_i_species:
        if labels is not None:
            sublabels = labels[:]
            DTree[BestFeature][feature_i_j] = createTree(
                splitDataset(dataset, BestFeature, feature_i_j), sublabels)
        else:
            DTree[BestFeature][feature_i_j] = createTree(
                splitDataset(dataset, BestFeature, feature_i_j))
    return DTree
print(createTree(x))
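As a quick sanity check of computeEntropy, the base entropy of the nine-sample toy dataset above (five "no", one "yes", three "maybe") can be verified by hand:

import math
# label counts in the toy dataset x: no=5, yes=1, maybe=3
counts = {"no": 5, "yes": 1, "maybe": 3}
n = sum(counts.values())
H = -sum((c / n) * math.log(c / n, 2) for c in counts.values())
print(H)  # about 1.35 bits, the same value computeEntropy(x, 0) returns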
With the principle understood, let's work through an example: a ten-class classification of red-wine quality based on 11 wine features. For convenience, every feature is binarized: values above the column median become 1, and values at or below it become 0. The red wine dataset can be downloaded from
http://archive.ics.uci.edu/ml/datasets/Wine+Quality
The result is an accuracy of around 45%.
If you have any questions, leave a comment and I will reply as soon as I can.
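Before the full listing, here is a minimal sketch of just the loading and median-binarization step using pandas; it assumes the semicolon-delimited UCI file and a local path dataset/winequality-red.csv, and is only an alternative view of what the cleaning code below does by hand:

import pandas as pd

# Load the red-wine file (semicolon-delimited) and binarize each
# feature against its column median: >median -> 1, otherwise 0.
df = pd.read_csv("dataset/winequality-red.csv", sep=";")
features = df.drop(columns=["quality"])
binarized = (features > features.median()).astype(int)
dataset = binarized.assign(quality=df["quality"]).values.tolist()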
import math
import pandas as pd
import numpy as np
import csv
x=[[0,1,"no"],[0,1,"no"],[1,0,"no"],[1,1,"yes"],[1,1,"no"],[1,1,"no"],[1,1,"maybe"],[0,1,"maybe"],[2,0,"maybe"],[1,1,"yes"],[0,0,"no"],[1,1,"no"],[1,1,"maybe"],[1,1,"maybe"],[1,1,"maybe"]]
# x=[[0,1,"no"],[0,1,"no"],[1,0,"no"],[1,1,"yes"],[1,1,"yes"],]
def majorityCnt(classList):
    # Return the label that occurs most often in classList.
    classCount = {}
    for label in classList:
        if label not in classCount:
            classCount[label] = 0
        classCount[label] += 1
    max_value = max(classCount.values())
    for label in classCount:
        if classCount[label] == max_value:
            return label
def splitDataset(dataset, BestFeature, feature_i_j):
    # Keep the examples whose BestFeature column equals feature_i_j,
    # and drop that column from each of them.
    new_set = []
    for example in dataset:
        if example[BestFeature] == feature_i_j:
            new_set.append(example[:BestFeature] + example[BestFeature + 1:])
    return new_set
def computeEntropy(subData, feature_i_j):
    # Shannon entropy of the class labels in subData
    # (the second argument is unused and kept only for the call sites).
    value_class_statistics = {}
    for sample in subData:
        if sample[-1] not in value_class_statistics:
            value_class_statistics[sample[-1]] = 0
        value_class_statistics[sample[-1]] += 1
    entropy = 0
    for count in value_class_statistics.values():
        prob = count / float(len(subData))
        entropy -= prob * math.log(prob, 2)
    return entropy
def chooseBestFeature(dataset):
    # ID3: choose the feature whose split gives the smallest weighted
    # average entropy, i.e. the largest information gain.
    info_entropy = []
    base_entropy = computeEntropy(dataset, 0)  # entropy before splitting (same for every feature)
    feat_len = len(dataset[0][:-1])
    for feature_i in range(feat_len):
        # distinct values of this feature (0/1 after binarization),
        # derived from the data rather than hard-coded
        feature_i_species = set(example[feature_i] for example in dataset)
        entropy = 0
        for feature_i_j in feature_i_species:
            subData = [example for example in dataset if example[feature_i] == feature_i_j]
            prob = float(len(subData)) / float(len(dataset))
            entropy += prob * computeEntropy(subData, feature_i_j)  # weighted average entropy
        info_entropy.append(entropy)
    return info_entropy.index(min(info_entropy))
def createTree(dataset, labels0):
    labels = labels0[:]                       # copy so the caller's label list is untouched
    classList = [example[-1] for example in dataset]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                   # all samples share one class
    if len(dataset[0]) == 1:
        return majorityCnt(classList)         # no features left to split on
    BestFeature_index = chooseBestFeature(dataset)
    BestFeature = labels[BestFeature_index]   # use the feature name as the node key
    del labels[BestFeature_index]             # this column is removed by splitDataset below
    feature_i_species = set(example[BestFeature_index] for example in dataset)
    DTree = {BestFeature: {}}
    for feature_i_j in feature_i_species:
        sublabels = labels[:]
        DTree[BestFeature][feature_i_j] = createTree(
            splitDataset(dataset, BestFeature_index, feature_i_j), sublabels)
    return DTree
def dichotomy(data, median=None):
    # Binarize each feature column: 1 if the value is above the column's
    # median, 0 otherwise.  If a median vector is passed in, reuse it
    # (useful for cleaning additional data with the same thresholds).
    if median is not None:
        for i in range(len(data)):
            for j in range(len(data[0])):
                data[i][j] = 0 if data[i][j] <= median[j] else 1
        return data
    new_data = np.sort(data, axis=0)
    median = new_data[data.shape[0] // 2, :]   # per-column median (middle row of the sorted columns)
    for i in range(len(data)):
        for j in range(len(data[0])):
            data[i][j] = 0 if data[i][j] <= median[j] else 1
    return data, median
def classify(inputTree, featLabels, testVec):  # classify one test vector at a time
    classLabel = None                          # None means the vector falls outside the tree
    feature = list(inputTree.keys())[0]
    secondDict = inputTree[feature]
    featIndex = featLabels.index(feature)      # note: map the feature name back to a column index
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == "dict":
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
def cleanData(path='dataset\\winequality-red.csv'):
    # Read the semicolon-delimited UCI file: the first row is the header,
    # the remaining rows are numeric values.
    f = csv.reader(open(path, 'r'))
    flag = 0
    Data = []
    for i in f:
        data = i[0].split(";")
        if flag != 0:
            data = [float(j) for j in data]
        flag += 1
        Data.append(data)
    numbers_data = np.array(Data[1:])
    bio_data, media = dichotomy(numbers_data[:, :-1])                 # binarize the 11 features
    numbers_data = np.append(bio_data, numbers_data[:, -1:], axis=1)  # keep the quality column as-is
    Data = [Data[0]] + numbers_data.tolist()
    return Data, media
Data, media = cleanData()  # clean the data, then split into train and test
# Note: with this slicing the test rows overlap the training rows almost entirely.
Train_data, Test_data = Data[1:int(len(Data) * 1599 / 1600)], Data[int(len(Data) * 1 / 1600):]
Mytree = createTree(Train_data, Data[0])
print(Mytree)
correct_n = 0
for Test_vec in Test_data:
    x = classify(Mytree, Data[0], Test_vec)
    if x == Test_vec[-1]:
        correct_n += 1
    print(x, Test_vec[-1])
print(correct_n, correct_n / len(Test_data))   # accuracy over the test set
The lab report is attached below.
Experiment 1: A decision-tree-based classification algorithm
- Objective
This experiment has students implement a decision-tree algorithm to become familiar with the relevant concepts. By building a custom dataset from raw data and using the decision tree to assemble a simple classifier, students learn how decision trees work in practice.
- Tasks
1. Build a decision tree in Python.
2. Use the tree to classify red wine and evaluate how well it works.
- Principle
A decision tree is a decision-analysis method that, given the probabilities of the possible outcomes, builds a tree to estimate the probability that the expected net present value is non-negative, thereby evaluating project risk and judging feasibility; it is an intuitive, graphical application of probabilistic analysis. Because the decision branches, when drawn out, look like the limbs of a tree, the method is called a decision tree. In machine learning, a decision tree is a predictive model representing a mapping between object attributes and object values. Entropy measures the disorder of a system; the tree-growing algorithms ID3, C4.5, and C5.0 all rely on it, a quantity taken from information theory (the formulas are given below).
A decision tree is a tree structure in which each internal node is a test on an attribute, each branch is an outcome of that test, and each leaf node is a class.
A classification tree (decision tree) is a very common classification method. It is a form of supervised learning: given a set of samples, each with its attributes and a class label fixed in advance, a classifier is learned that can assign the correct class to newly seen objects.
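In the notation of the code above, for a dataset D whose classes occur with proportions p_k, the entropy and the information gain of splitting on a feature A with values v are

H(D) = -Σ_k p_k · log2(p_k)
g(D, A) = H(D) - Σ_v (|D_v| / |D|) · H(D_v)

Since H(D) is the same for every candidate feature, picking the feature with the largest gain g(D, A) is equivalent to picking the one with the smallest weighted average entropy, which is exactly what chooseBestFeature does.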
- Procedure
- Data preparation: download the red-wine dataset; since most of its values are continuous, discretize them by taking the median of each continuous feature and binarizing against it.
- Build and train the decision tree: implement a decision tree in Python, read in the data, and train it.
- Results and analysis
After repeated runs, the ten-class accuracy on red wine settles at roughly 45%-50%. Although this is low, it is still well above the 10% expected from random guessing, so the decision tree is at least partly correct.
Likely reasons for the low accuracy: 1. the median binarization is too crude and discards fine-grained feature information; 2. the training set is small and the features are not very discriminative, so the tree sometimes outputs None on a test sample, meaning that sample is not covered by the constructed tree (a sketch of one possible fallback follows below).
With a larger dataset and finer-grained feature processing, the accuracy could be improved further.
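A minimal, hypothetical sketch of such a fallback (not part of the original code): reuse the traversal in classify, but return a caller-supplied default class, e.g. the majority class of the training data, whenever the test vector takes a branch the tree has never seen:

def classify_with_fallback(inputTree, featLabels, testVec, default=None):
    # Same traversal as classify(), but fall back to `default` when the
    # test value has no matching branch in the current node.
    feature = list(inputTree.keys())[0]
    secondDict = inputTree[feature]
    featIndex = featLabels.index(feature)
    value = testVec[featIndex]
    if value not in secondDict:
        return default
    subtree = secondDict[value]
    if isinstance(subtree, dict):
        return classify_with_fallback(subtree, featLabels, testVec, default)
    return subtree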