First, let's look at the differences and connections between the ID3 and C4.5 algorithms.
ID3 (information gain): biased toward attributes with many possible values.
C4.5 (gain ratio): biased toward attributes with few possible values.
In practice a compromise is used: first keep the attributes whose information gain is above the average (the ID3 criterion), then among those choose the attribute with the highest gain ratio (the C4.5 criterion) as the splitting rule; see the sketch after the C4.5 code below.
CART: Gini index.
The following is an implementation of the ID3 building blocks (entropy, dataset splitting, information gain):
from math import log

# Compute the Shannon entropy of a dataset (the class label is the last column)
def ID3Ent(data):
    m = len(data)
    labelCount = {}
    for vect in data:
        currentLabel = vect[-1]
        labelCount[currentLabel] = labelCount.get(currentLabel, 0) + 1
    ent = 0.0
    for key in labelCount:
        p = labelCount[key] / m
        ent -= p * log(p, 2)
    return ent
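A quick check on a hypothetical two-class toy set (rows are made up for illustration); two equally likely classes give entropy 1.0:
print(ID3Ent([[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']]))   # -> 1.0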
# Return the subset of rows whose feature `axis` equals `value`,
# with that feature column removed
def splitDataSet(data, axis, value):
    reductData = []
    for feat in data:
        if feat[axis] == value:
            reductFeat = feat[:axis]
            reductFeat.extend(feat[axis + 1:])
            reductData.append(reductFeat)
    return reductData
# Information gain of feature `axis`: base entropy minus the weighted
# entropy of the subsets produced by splitting on that feature
def cla_info_gain(data, axis):
    base_Ent = ID3Ent(data)
    Ent = 0.0
    feat = set([example[axis] for example in data])
    for value in feat:
        reductData = splitDataSet(data, axis, value)
        p = len(reductData) / len(data)
        Ent += p * ID3Ent(reductData)
    return base_Ent - Ent
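A quick sanity check on a made-up toy set (one binary feature plus a class label; the numbers are for illustration only):
toy = [[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'yes']]
print(splitDataSet(toy, 0, 1))   # -> [['yes'], ['yes']]
print(cla_info_gain(toy, 0))     # -> about 0.311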
The following implements the C4.5 gain ratio:
# Gain ratio of feature `axis`: information gain divided by the
# intrinsic value (IV) of the feature
def Gain_ratio(data, axis):
    feat = set([example[axis] for example in data])
    info_Gain = cla_info_gain(data, axis)
    IV = 0.0
    for value in feat:
        reductData = splitDataSet(data, axis, value)
        p = len(reductData) / len(data)
        IV += -p * log(p, 2)
    if IV == 0.0:   # feature takes a single value; gain ratio is undefined
        return 0.0
    return info_Gain / IV
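With both criteria in hand, here is a minimal sketch of the compromise heuristic mentioned at the top of this note (above-average information gain first, highest gain ratio second); chooseBestFeature is a hypothetical helper name, not part of the original code:
def chooseBestFeature(data):
    n = len(data[0]) - 1                                  # number of features
    gains = [cla_info_gain(data, ax) for ax in range(n)]
    avg = sum(gains) / n
    # ID3 step: keep only features with above-average information gain
    candidates = [ax for ax in range(n) if gains[ax] >= avg]
    # C4.5 step: among those, pick the feature with the highest gain ratio
    return max(candidates, key=lambda ax: Gain_ratio(data, ax))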
The following is the CART part (Gini index):
################# CART ###################################
# Gini value of a dataset: 1 minus the sum of squared class probabilities
def Gini(data):
    Gini_value = 0.0
    labelCount = {}
    for vec in data:
        label = vec[-1]
        labelCount[label] = labelCount.get(label, 0) + 1
    for key in labelCount:
        p = labelCount[key] / len(data)
        Gini_value += p ** 2          # accumulate p_k^2, not (sum p_k)^2
    return 1 - Gini_value
# Gini index of feature `axis`: weighted Gini value of the split subsets
def Gini_index(data, axis):
    feat = set([example[axis] for example in data])
    Gini_index_value = 0.0
    for value in feat:
        reductData = splitDataSet(data, axis, value)
        p = len(reductData) / len(data)
        Gini_index_value += p * Gini(reductData)
    return Gini_index_value
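Toy checks with made-up rows: a pure node has Gini 0, a 50/50 node has Gini 0.5, and a perfectly separating feature has Gini index 0:
print(Gini([[1, 'yes'], [0, 'yes']]))   # -> 0.0
print(Gini([[1, 'yes'], [0, 'no']]))    # -> 0.5
print(Gini_index([[1, 'yes'], [1, 'yes'], [0, 'no'], [0, 'no']], 0))  # -> 0.0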
The following is the hands-on part (CART regression trees):
# -*- coding: utf-8 -*-
import numpy as np
# Load a tab-delimited text file into a list of lists of floats
def loadData(file):
    data = []
    with open(file) as fr:
        for line in fr:
            lineArr = [float(i) for i in line.strip().split('\t')]
            data.append(lineArr)
    return data
# Test
#import pandas as pd
#file = r"C:\Users\Administrator\Desktop\python\data\ex0.txt"
#data = loadData(file)
#pd.DataFrame(data).plot(x = 0, y= 1, kind = 'scatter')
# Three parameters: <dataset, feature to split on, a value of that feature>
def binSplitDataSet(dataSet, feature, value):
mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
mat1 = dataSet[np.nonzero(dataSet[: , feature] <= value)[0], :]
return mat0, mat1
# Four parameters: <dataset>, plus three optional ones <leafType: function that
# builds a leaf node; errType: error-calculation function; ops: tuple of extra
# parameters needed for tree construction>
# chooseBestSplit() -- the splitting function
'''Pseudocode:
Find the best feature to split on:
    If the node cannot be split further, store it as a leaf node
    Perform the binary split
    Call createTree() on the right subtree
    Call createTree() on the left subtree
'''
# Generate a leaf node: the mean of the target values
def regLeaf(dataSet):
return np.mean(dataSet[:,-1])
# Total squared error of the target variable (variance times sample count)
def regErr(dataSet):
return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]
'''
chooseBestSplit() finds the best place to split the dataset and, failing that,
generates the corresponding leaf node. It iterates over every feature and every
possible value to find the split threshold that minimizes error. Pseudocode:
For every feature:
    For every value of that feature:
        Split the dataset in two
        Compute the error of the split
        If this error is below the current minimum, record this split as the
        best one and update the minimum error
Return the feature and threshold of the best split
'''
def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    tolS = ops[0]   # minimum error reduction required to keep a split
    tolN = ops[1]   # minimum number of samples in a split
    # if all target values are equal, stop and return a leaf
if len(set(dataSet[:, -1])) == 1:
return None, leafType(dataSet)
m, n = np.shape(dataSet)
S = errType(dataSet)
    bestS = float('inf')   # start from infinity
bestIndex = 0; bestValue = 0
for featIndex in range(n-1):
for splitVal in set(dataSet[:, featIndex]):
mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # skip splits that leave too few samples on either side
if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue
newS = errType(mat0) + errType(mat1)
if newS < bestS:
bestIndex = featIndex
bestValue = splitVal
bestS = newS
    # exit if the best split does not reduce the error by at least tolS
    if (S - bestS) < tolS:
return None, leafType(dataSet)
mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # if the best split leaves a subset that is too small, make a leaf
if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
return None, leafType(dataSet)
return bestIndex, bestValue
# Recursively build the tree: stop when chooseBestSplit returns a leaf value
def createTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None: return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
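As a quick end-to-end check, a sketch on synthetic data (values made up for illustration): the target jumps at x = 0.5, so the root split should land near that value.
if __name__ == '__main__':
    # hypothetical data: y steps from 0 to 1 at x = 0.5, plus small noise
    np.random.seed(0)
    x = np.random.rand(40, 1)
    y = (x > 0.5).astype(float) + np.random.randn(40, 1) * 0.01
    demoTree = createTree(np.hstack((x, y)))
    print(demoTree['spInd'], demoTree['spVal'])   # expect feature 0, near 0.5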
# Test
#testMat = np.mat(np.eye(4))
#mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
if __name__ == '__main__':
import matplotlib.pyplot as plt
file = r"C:\Users\Administrator\Desktop\python\data\ex0.txt"
dataSet = loadData(file)
dataSet = np.array(dataSet)
plt.scatter(dataSet[:, 0], dataSet[:, 1])
    re = createTree(dataSet)
'''
Prune the tree using test data:
    If either subtree is itself a tree, recurse into that subtree to prune it
    Compute the error after merging the two current leaf nodes
    Compute the error without merging
    If merging would lower the error on the test data, merge the leaf nodes
'''
# A node is an internal (sub)tree if it is stored as a dict
def isTree(obj):
    return isinstance(obj, dict)
# Recursive function: collapse the tree bottom-up, replacing each pair of leaves with their mean
def getMean(tree):
if isTree(tree['right']):
tree['right'] = getMean(tree['right'])
if isTree(tree['left']):
tree['left'] = getMean(tree['left'])
return (tree['left'] + tree['right'])/2.0
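A tiny check of getMean on a hand-built one-split tree (values are hypothetical):
print(getMean({'spInd': 0, 'spVal': 0.5, 'left': 1.0, 'right': 3.0}))   # -> 2.0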
def prune(tree, testData):
    '''
    tree: the tree to be pruned
    testData: the test data used for pruning
    '''
    # if the test data is empty, collapse the tree to the mean of its leaves
if np.shape(testData)[0] == 0:
return getMean(tree)
if (isTree(tree['right']) or isTree(tree['left'])):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
if isTree(tree['left']) :
tree['left'] = prune(tree['left'], lSet)
if isTree(tree['right']) :
tree['right'] = prune(tree['right'] , rSet)
if not isTree(tree['left']) and not isTree(tree['right']):
lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
errorNoMerge = np.sum(np.power(lSet[:, -1] - tree['left'] , 2)) +\
np.sum(np.power(rSet[:, -1] - tree['right'] , 2))
treeMean = (tree['left'] + tree['right'])/2.0
errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
print("merging")
return treeMean
else:
return tree
else:
return tree
# Test
if __name__ == '__main__':
ex2 = r'C:\Users\Administrator\Desktop\python\data\ex2.txt'
myDat2 = loadData(ex2)
myDat2 = np.array(myDat2)
plt.scatter(myDat2[:, 0], myDat2[:, 1])
    myTree = createTree(myDat2, ops = (0, 1))
ex2test = r'C:\Users\Administrator\Desktop\python\data\ex2test.txt'
myDataTest = loadData(ex2test)
myDataTest = np.array(myDataTest)
plt.scatter(myDataTest[:, 0], myDataTest[:, 1])
testData = myDataTest.copy()
prune(myTree, myDataTest)
# Format the dataset into the target variable Y and independent variables X
def linearSolve(dataSet):
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n)))          # first column stays 1 (intercept term)
    X[:, 1:n] = dataSet[:, 0:n-1]        # remaining columns: the features
    Y = np.mat(dataSet[:, -1]).reshape(m, 1)
    xTx = X.T * X
    # if the determinant is zero the matrix is singular and cannot be inverted
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n\
        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)               # normal-equation solution
    return ws, X, Y
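A sanity check with made-up data: for y = 3x + 1 the recovered weights should be close to [1, 3]:
if __name__ == '__main__':
    xs = np.arange(0, 1, 0.1).reshape(-1, 1)
    ys = 3 * xs + 1
    ws, X, Y = linearSolve(np.hstack((xs, ys)))
    print(ws.T)   # -> approximately [[1. 3.]]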
# Generate a leaf node: the fitted regression coefficients ws
def modelLeaf(dataSet):
ws, X, Y = linearSolve(dataSet)
return ws
# Compute the squared error of the linear model on the given dataset
def modelErr(dataSet):
ws, X, Y = linearSolve(dataSet)
yHat = X*ws
return np.sum(np.power(Y - yHat, 2))
# Test
if __name__ == '__main__':
ex2 = r'C:\Users\Administrator\Desktop\python\data\ex2.txt'
myDat2 = loadData(ex2)
myDat2 = np.array(myDat2)
plt.scatter(myDat2[:, 0], myDat2[:, 1])
    myTree = createTree(myDat2, modelLeaf, modelErr, ops = (1, 10))
ex2test = r'C:\Users\Administrator\Desktop\python\data\ex2test.txt'
myDataTest = loadData(ex2test)
myDataTest = np.array(myDataTest)
plt.scatter(myDataTest[:, 0], myDataTest[:, 1])
testData = myDataTest.copy()
    # note: prune() assumes constant leaves; applying it to model-tree leaves (ws vectors) is questionable
    prune(myTree, myDataTest)
# For a regression tree the leaf model is just a constant prediction
def regTreeEval(model, inDat):
    return float(model)
# For a model tree, evaluate the leaf's linear model on the input data
def modelTreeEval(model, inDat):
n = np.shape(inDat)[1]
X = np.mat(np.ones((1, n+1)))
X[:, 1: n+1] = inDat
return float(X*model)
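A toy check with made-up weights: ws = [[1], [3]] encodes y = 1 + 3x, so x = 2 evaluates to 7:
print(modelTreeEval(np.mat([[1.0], [3.0]]), np.mat([[2.0]])))   # -> 7.0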
# Walk the tree top-down until a leaf node is hit; once at a leaf, call modelEval() on the input data. The default modelEval is regTreeEval().
def treeForeCast(tree, inData, modelEval = regTreeEval):
if not isTree(tree):
return modelEval(tree, inData)
if inData[tree['spInd']] > tree['spVal']:
if isTree(tree['left']):
            return treeForeCast(tree['left'], inData, modelEval)
else:
return modelEval(tree['left'], inData)
else:
if isTree(tree['right']):
return treeForeCast(tree['right'], inData, modelEval)
else:
return modelEval(tree['right'], inData)
# Produce a vector of forecasts, one per row of testData
def createForeCast(tree, testData, modelEval = regTreeEval):
m = len(testData)
yHat = np.mat(np.zeros((m, 1)))
for i in range(m):
yHat[i, 0] = treeForeCast(tree, np.mat(testData[i]), modelEval)
return yHat
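Example use, assuming the tree re and the array dataSet built in the first main block above; corrcoef gives a quick goodness-of-fit check:
if __name__ == '__main__':
    yHat = createForeCast(re, dataSet[:, 0])
    print(np.corrcoef(np.asarray(yHat).ravel(), dataSet[:, 1])[0, 1])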