前言:
目录:
- CART
- 模型树
- 树剪枝
- GUI 使用
一 CART
流程:
CreateTree
寻找最佳的切分特征:
如果该节点不能再分,将该节点存为叶节点
执行二元切分
右子树调用 CreateTree() 方法
左子树调用CreateTree()方法
最佳的切分特征:
遍历每个特征值
将数据切分为两份
计算切分的误差
如果当前的误差小于当前的最小误差,那将当前切分设定为最佳切分并更新最小误差
返回最佳的特征和阈值
预剪枝:提前终止条件也是方法之一
树剪枝(后剪枝)
基于已有的树切分测试数据
如果存在任一子集是一棵树,则在该子集递归剪枝过程
计算将当前两个叶结点合并后的误差
计算不合并后的误差
如果合并后能降低误差,则将两个叶节点合并
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 9 14:27:46 2019
@author: chengxf2
"""
import numpy as np
from time import sleep
import os
import matplotlib.pyplot as plt
class treeNone():
    """Binary-tree node holding a split feature, split value and two branches.

    NOTE(review): the name is probably a typo for ``treeNode``; it is kept
    unchanged so any external reference keeps working. (This class is not
    used by the rest of the script, which builds trees as nested dicts.)
    """

    def __init__(self, feat, val, right, left):
        # Index of the feature this node splits on, and the split value.
        self.feature = feat
        self.value = val
        # Child subtrees produced by the binary split.
        self.leftBranch = left
        self.rightBranch = right
def LoadDataSet(fileName):
    """Load a tab-separated numeric data file into a numpy matrix.

    Args:
        fileName: path to a text file, one sample per line, fields
            separated by tabs; every field must parse as float.

    Returns:
        np.matrix of shape (n_lines, n_fields).
    """
    dataMat = []
    # BUG FIX: the original opened the file and never closed it; `with`
    # guarantees the handle is released even when a field fails to parse.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            floatLine = list(map(float, curLine))
            dataMat.append(floatLine)
    return np.mat(dataMat)
def BinSplitDataSet(dataSet, feature, value):
    """Binary-split `dataSet` on one feature.

    Rows where dataSet[:, feature] >  value go into the first matrix,
    rows where dataSet[:, feature] <= value go into the second.
    """
    column = dataSet[:, feature]
    greaterRows = np.nonzero(column > value)[0]
    lessEqRows = np.nonzero(column <= value)[0]
    return dataSet[greaterRows, :], dataSet[lessEqRows, :]
"""
当前的均值
Args
dataSet: 当前的样本集
return
均值
"""
def RegLeaf(dataSet):
y = np.mean(dataSet[:,-1])
return y
"""
当前的方差
Args
dataSet: 样本集
return
方差
"""
def RegErr(dataSet):
m,n = np.shape(dataSet)
q = np.var(dataSet[:,-1])
return q*m
"""
选择最优特征去切分
Args
dataSet: 数据集
leafType: 当前均值
errType: 方差
return
最优属性
"""
def chooseBestSplit(dataSet, leafType=RegLeaf, errType = RegErr, ops=(1,4)):
minDiff = ops[0]; minNum = ops[1]
#print("\n dataSet ",dataSet)
n = len(set(dataSet[:,-1].T.tolist()[0]))
print("\n ****chooseBestSplit**** \n ",n)
if n == 1: ##只有一类
return None, leafType(dataSet)
m,n = np.shape(dataSet)
S = errType(dataSet)
bestS = np.inf ; bestIndex = -1; bestValue = -1
for featIndex in range(n-1): ##最后一列是y
for val in set(dataSet[:,featIndex].T.tolist()[0]):
# print("\n val ",val)
matL, matR = BinSplitDataSet(dataSet, featIndex, val)
m1,n1 = np.shape(matL)
m2,n2 = np.shape(matR)
if m1<minNum or m2<minNum:
continue
newS = errType(matL)+errType(matR)
if newS<bestS:
bestS = newS
bestIndex = featIndex
bestValue = val
if (S-bestS)<minDiff:##以前无法划分了
return None,leafType(dataSet)
#############有个预剪枝操作#############3
m1,n1 = np.shape(matL)
m2,n2 = np.shape(matR)
if m1<minNum or m2<minNum:
print("\n *********m1 ****************")
return None,leafType(dataSet)
return bestIndex, bestValue
"""
创建树
Args
dataMat 数据集
leafType: 叶节点,均值
errType: 方差
ops() tuple[0] 终止条件1,当前切分后变化下限 tuple[1] 叶节点个数
"""
def createTree(dataMat, leafType = RegLeaf, errType = RegErr, ops=(1,4)):
print("\n ********Enter******* \n ")
feat, val = chooseBestSplit(dataMat, leafType, errType, ops)
print("\n createTree feat ",feat, "\t val: ",val)
if feat is None:
print("\n None Feat")
return val
retTree ={}
retTree['spInd'] = feat
retTree['spVal'] = val
lSet, rSet = BinSplitDataSet(dataMat, feat, val)
retTree['left']= createTree(lSet, leafType,errType,ops)
retTree['right']= createTree(rSet, leafType,errType,ops)
return retTree
"""
判断是否是树
"""
def IsTree(obj):
return type(obj).__name__=='dict'
"""
获得误差
Args
tree: 树
"""
def GetMean(tree):
if IsTree(tree['left']):
tree['left']=GetMean(tree['left'])
if IsTree(tree['right']):
tree['right']=GetMean(tree['right'])
return (tree['left']+tree['right'])/2.0
"""
后剪枝
Args
tree: 已经用train 数据集生成的数据
testData: 用测试集去后剪枝
"""
def Prune(tree, testData):
m,n = np.shape(testData)
if m == 0: ##数据集为Null
return GetMean(tree)
if IsTree(tree['left']) or IsTree(tree['right']): ##分割数据
lMat, rMat = BinSplitDataSet(testData, tree['spInd'], tree['spVal'])
if IsTree(tree['left']):
tree['left']= Prune(tree['left'],lMat)
if IsTree(tree['right']):
tree['right']= Prune(tree['right'],rMat)
if not IsTree(tree['left']) and not IsTree(tree['right']):
lMat, rMat = BinSplitDataSet(testData, tree['spInd'], tree['spVal'])
#print("\n tree['left']",tree['left'],"\t tree['right']",tree['right'])
errorNoMerge = np.sum(np.power(lMat[:,-1]-tree['left'],2))+ np.sum(np.power(rMat[:,-1]-tree['right'],2))
#print("\n errorM ::",errorNoMerge)
treeMean = (tree['left']+tree['right'])/2.0
errorMerge = np.sum(np.power(testData[:,-1]-treeMean,2))
if errorMerge< errorNoMerge:
print("\n Merget: ", errorNoMerge-errorMerge)
return treeMean
else:
return tree
else:
return tree
"""
预测
Args
data: 数据集
tree: 模型
modelEval : 预测树
"""
def TreeForeCast(tree, data):
if not IsTree(tree): ##已经是叶节点
y = float(tree)
return y
#print("\n tree['spVal']",tree['spInd'], "\t ",data)
key = tree['spInd']; value =tree['spVal']
if data[0,key]> value:
if IsTree(tree['left']):
return TreeForeCast(tree['left'],data)
else:
return float(tree['left'])
else:
if IsTree(tree['right']):
return TreeForeCast(tree['right'],data)
else:
return float(tree['right'])
"""
预测
Args
data: 数据集
tree: 模型
"""
def ForeCast(tree, testData):
m,n = np.shape(testData)
yHat = np.mat(np.zeros((m,1)))
X = np.arange(0,m)
Y =[]
for i in range(m):
yHat[i,0]= TreeForeCast(tree, testData[i,0:n-1])
print("\n 预测值 ",yHat[i,0],"\t 实际值: ", testData[i,-1])
Y.append(yHat[i,0]-testData[i,-1])
Y.sort()
plt.scatter(X,Y,c='r')
# plt.scatter(X,Y2,c='g')
return yHat
def Test():
    # Driver: build a tree from ex2.txt, post-prune it with ex2test.txt,
    # then print/plot the pruned tree's predictions on the test set.
    path = os.path.abspath("ex2.txt")
    trainData = LoadDataSet(path)
    myTree = createTree(trainData,ops=(1,2))
    print("\n ***MyTree ******** \n ", myTree)
    path2 = os.path.abspath("ex2test.txt")
    testData = LoadDataSet(path2)
    newTree = Prune(myTree, testData)
    print("\n prune Tree: \n ",newTree)
    ForeCast(newTree, testData)

# Script entry point: runs immediately when the file is executed.
Test()
二 模型树
流程与上面的 CART 相同,所不同的是,切分点的误差计算改用线性回归模型。
1: 叶节点存储的是线性模型的权值系数,不再是单个预测值
预测值为:
myTree = createTree(trainData, ModelLeaf, ModelErr, ops=(1,5))
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 11 16:56:19 2019
@author: chengxf2
"""
import numpy as np
import os
import matplotlib.pyplot as plt
"""
加载数据集
"""
def LoadDataSet(fileName):
dataMat = []
fr = open(fileName)
for line in fr.readlines():
curLine = line.strip().split('\t')
floatLine = list(map(float, curLine))
dataMat.append(floatLine)
#print("\n dataMat \n ",dataMat)
return np.mat(dataMat)
"""
判断是否是树
"""
def IsTree(obj):
return type(obj).__name__=='dict'
def BinSplitDataSet(dataSet, feature, value):
    """Binary-split `dataSet` on one feature.

    Returns (matGreater, matLessEq): rows whose feature column exceeds
    `value`, and the remaining rows.
    """
    mask = dataSet[:, feature] > value
    rowsAbove = np.nonzero(mask)[0]
    rowsBelow = np.nonzero(~mask)[0]
    return dataSet[rowsAbove, :], dataSet[rowsBelow, :]
"""
线性模型
Args
dataSet: 数据集
return
ws: 权值系数
X: 样本集
Y: 标签集
"""
def LinearSolve(dataSet):
m,n = np.shape(dataSet)
X = np.mat(np.ones((m,n))); Y = np.mat(np.ones((m,1)))
X[:,1:n]=dataSet[:,0:n-1]
Y = dataSet[:,-1]
A = X.T*X
if np.linalg.det(A)==0: ##也可以通过岭回归解决 需要增加第个值,或者正规化
raise NameError("\n ********非线性**************")
W = np.linalg.inv(A)*(X.T*Y)
return W,X,Y
"""
生成叶
Args
dataSet: 数据集
"""
def ModelLeaf(dataSet):
W, X,Y = LinearSolve(dataSet)
print("\n ****** W **********\n",W)
return W
"""
模型错误
Args
dataSet: 数据集
return
None
"""
def ModelErr(dataSet):
W,X,Y = LinearSolve(dataSet)
yHat = X*W## 预测值
return np.sum(np.power(Y-yHat,2))
"""
选择最优特征去切分
Args
dataSet: 数据集
leafType: 当前均值
errType: 方差
return
最优属性
"""
def chooseBestSplit(dataSet, leafType=RegLeaf, errType = RegErr, ops=(1,4)):
minDiff = ops[0]; minNum = ops[1]
#print("\n dataSet ",dataSet)
n = len(set(dataSet[:,-1].T.tolist()[0]))
print("\n ****chooseBestSplit**** \n ",n)
if n == 1: ##只有一类
return None, leafType(dataSet)
m,n = np.shape(dataSet)
S = errType(dataSet)
bestS = np.inf ; bestIndex = -1; bestValue = -1
for featIndex in range(n-1): ##最后一列是y
for val in set(dataSet[:,featIndex].T.tolist()[0]):
# print("\n val ",val)
matL, matR = BinSplitDataSet(dataSet, featIndex, val)
m1,n1 = np.shape(matL)
m2,n2 = np.shape(matR)
if m1<minNum or m2<minNum:
continue
newS = errType(matL)+errType(matR)
if newS<bestS:
bestS = newS
bestIndex = featIndex
bestValue = val
if (S-bestS)<minDiff:##以前无法划分了
return None,leafType(dataSet)
""" 把预剪枝部分删除
m1,n1 = np.shape(matL)
m2,n2 = np.shape(matR)
if m1<minNum or m2<minNum:
print("\n *********m1 ****************")
return None,leafType(dataSet)
"""
return bestIndex, bestValue
"""
创建树
Args
dataMat 数据集
leafType: 叶节点,均值
errType: 方差
ops() tuple[0] 终止条件1,当前切分后变化下限 tuple[1] 叶节点个数
"""
def createTree(dataMat, leafType = RegLeaf, errType = RegErr, ops=(1,4)):
print("\n ********createTree******* \n ")
feat, val = chooseBestSplit(dataMat, leafType, errType, ops)
print("\n chooseBestSplit End feat ",feat, "\t val: ",val)
if feat is None:
print("\n Feat is None return val")
return val
retTree ={}
retTree['spInd'] = feat
retTree['spVal'] = val
lSet, rSet = BinSplitDataSet(dataMat, feat, val)
retTree['left']= createTree(lSet, leafType,errType,ops)
retTree['right']= createTree(rSet, leafType,errType,ops)
return retTree
"""
Cart的输出值
Args
model: 叶节点的值
"""
def regTreeEval(model,data):
return float(model)
"""
树回归的输出值
Args:
dataMat: 数据集
W: 权值系数
return:
Y: 输出值=X*W+B
"""
def modelTreeEval(W, dataMat):
m,n = np.shape(dataMat)
X = np.mat(np.ones((1,n+1)))
X[:,1:n+1]=dataMat
return float(X*W)
"""
预测
Args
data: 数据集
tree: 模型
modelEval : 预测树
"""
def TreeForeCast(tree, data, modelEval= modelTreeEval):
if not IsTree(tree):
y = modelEval(tree, data)
return y
#print("\n tree['spVal']",tree['spInd'], "\t ",data)
key = tree['spInd']; value =tree['spVal']
if data[0,key]> value:
if IsTree(tree['left']):
return TreeForeCast(tree['left'],data, modelEval)
else:
return modelEval(tree['left'], data)
else:
if IsTree(tree['right']):
return TreeForeCast(tree['right'],data, modelEval)
else:
return modelEval(tree['right'], data)
"""
预测
Args
data: 数据集
tree: 模型
"""
def CreateForeCast(tree, testData, modelEval = modelTreeEval):
m,n = np.shape(testData)
yHat = np.mat(np.zeros((m,1)))
X = np.arange(0,m)
Y1 =[]
Y2 =[]
for i in range(m):
yHat[i,0]= TreeForeCast(tree, testData[i,0:n-1], modelEval)
print("\n 预测值 ",yHat[i,0],"\t 实际值: ", testData[i,-1])
Y1.append(yHat[i,0]-testData[i,-1])
Y2.append(testData[i,-1])
Y1.sort()
plt.scatter(X,Y1,c='r')
# plt.scatter(X,Y2,c='g')
return yHat
def Test():
    # Driver: build a model tree from Modeltrain.txt, then print/plot its
    # predictions on ModelTest.txt.
    path = os.path.abspath("Modeltrain.txt")
    trainData = LoadDataSet(path)
    # print("\n trainData ",trainData)
    myTree = createTree(trainData, ModelLeaf, ModelErr, ops=(1,20))
    print("\n myTree ",myTree)
    print("\n **********开始预测*******************\n")
    path2 = os.path.abspath("ModelTest.txt")
    testData = LoadDataSet(path2)
    CreateForeCast(myTree, testData, modelTreeEval)

# Script entry point: runs immediately when the file is executed.
Test()
参考文档:
《机器学习实战》