树回归_机器学习

前言:

 

目录:

  1.    CART
  2.    模型树
  3.    树剪枝
  4.    GUI 使用

一 CART 

      流程:

      CreateTree

       寻找最佳的切分特征

                      如果该节点不能再分,将该节点存为叶节点

                      执行二元切分

                      右子树调用 CreateTree() 方法

                       左子树调用CreateTree()方法

      最佳的切分特征:

                       遍历每个特征值

                               将数据切分为两份

                                计算切分的误差

                                 如果当前的误差小于当前的最小误差,那将当前切分设定为最佳切分并更新最小误差

                           返回最佳的特征和阈值

                           预剪枝:提前终止条件也是方法之一

     树剪枝(后剪枝)

                          基于已有的树切分测试数据

                                      如果存在任一子集是一棵树,则在该子集递归剪枝过程

                                      计算将当前两个叶结点合并后的误差

                                      计算不合并后的误差

                                     如果合并后的误差低于不合并的误差,则将两个叶节点合并

# -*- coding: utf-8 -*-
"""
Created on Mon Dec  9 14:27:46 2019

@author: chengxf2
"""

import numpy as np
from time import sleep
import os
import matplotlib.pyplot as plt

class treeNone():
      """A binary tree node for CART regression trees.

      NOTE(review): the name is presumably a typo for "treeNode"; it is kept
      because it is the public class name. The main code below stores nodes
      as plain dicts instead, so this class is currently unused.
      """

      def __init__(self, feat, val, right, left):
          """Store the split feature index, split value, and the two branches."""
          self.feature, self.value = feat, val
          self.rightBranch, self.leftBranch = right, left
          

def LoadDataSet(fileName):
    """Load a tab-separated numeric data file into a numpy matrix.

    Args:
        fileName: path to a file where each line is tab-separated floats.
    Returns:
        np.matrix of shape (rows, cols), all values converted to float.
    """
    dataMat = []
    # `with` guarantees the file handle is closed; the original leaked it.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            floatLine = list(map(float, curLine))
            dataMat.append(floatLine)
    return np.mat(dataMat)

def BinSplitDataSet(dataSet, feature, value):
    """Binary-split dataSet on one feature.

    Rows with dataSet[:, feature] >  value go into the first matrix;
    rows with dataSet[:, feature] <= value go into the second.

    Args:
        dataSet: np.matrix of samples.
        feature: column index to split on.
        value: split threshold.
    Returns:
        (matL, matR) row-selected matrices.
    """
    upperRows = np.nonzero(dataSet[:, feature] > value)[0]
    lowerRows = np.nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[upperRows, :], dataSet[lowerRows, :]

"""
当前的均值
Args
    dataSet: 当前的样本集
return 
    均值
"""
def RegLeaf(dataSet):
    
    y = np.mean(dataSet[:,-1])
    return y

"""
当前的方差
Args
  dataSet: 样本集
return 
  方差
"""
def RegErr(dataSet):
    m,n = np.shape(dataSet)
    q = np.var(dataSet[:,-1])
    
    return q*m

"""
选择最优特征去切分
Args
  dataSet: 数据集
  leafType: 当前均值
  errType: 方差

return
   最优属性
"""
def chooseBestSplit(dataSet, leafType=RegLeaf, errType = RegErr, ops=(1,4)):
    minDiff = ops[0]; minNum = ops[1]
    
    #print("\n dataSet ",dataSet)
    
    n = len(set(dataSet[:,-1].T.tolist()[0]))
    print("\n  ****chooseBestSplit**** \n ",n)
    if n == 1: ##只有一类
        return None, leafType(dataSet)
    
    m,n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = np.inf ; bestIndex = -1; bestValue = -1
    
    for featIndex in range(n-1): ##最后一列是y
        
        for val in set(dataSet[:,featIndex].T.tolist()[0]):
            # print("\n val ",val)
             matL, matR =  BinSplitDataSet(dataSet, featIndex, val)
             
             m1,n1 = np.shape(matL)
             m2,n2 = np.shape(matR)
             
             if m1<minNum or m2<minNum:
                 continue
             
             newS = errType(matL)+errType(matR)
             
             if newS<bestS:
                 bestS = newS
                 bestIndex = featIndex
                 bestValue = val
                 
    
    if (S-bestS)<minDiff:##以前无法划分了
        
        return None,leafType(dataSet)
    
   #############有个预剪枝操作#############3
    m1,n1 = np.shape(matL)
    m2,n2 = np.shape(matR)
    if m1<minNum or m2<minNum:
        print("\n *********m1    ****************")
        return None,leafType(dataSet)
    return bestIndex, bestValue
        
        
    


"""
创建树
Args
   dataMat 数据集
   leafType: 叶节点,均值
   errType: 方差
   ops() tuple[0] 终止条件1,当前切分后变化下限  tuple[1] 叶节点个数
   
"""
def createTree(dataMat, leafType = RegLeaf, errType = RegErr, ops=(1,4)):
    print("\n ********Enter******* \n ")
    feat, val = chooseBestSplit(dataMat, leafType, errType, ops)
    print("\n createTree feat ",feat, "\t val: ",val)
    if feat is None:
        print("\n None Feat")
        return val
    retTree ={}
    
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = BinSplitDataSet(dataMat, feat, val)
    retTree['left']= createTree(lSet, leafType,errType,ops)
    retTree['right']= createTree(rSet, leafType,errType,ops)
    
    return retTree

"""
判断是否是树
"""
def IsTree(obj):
    return type(obj).__name__=='dict'

"""
获得误差
Args
   tree: 树
"""
def GetMean(tree):
    if IsTree(tree['left']):
        tree['left']=GetMean(tree['left'])
    
    if IsTree(tree['right']):
        tree['right']=GetMean(tree['right'])
        
    return (tree['left']+tree['right'])/2.0

"""
后剪枝
Args
   tree: 已经用train 数据集生成的数据
   testData: 用测试集去后剪枝
"""   
def Prune(tree, testData):
    """Post-prune `tree` bottom-up against a held-out test set.

    At each internal node whose children are both leaves, compare the test
    error of keeping the split vs. merging the two leaves into their mean;
    merge when it lowers the error. Mutates `tree` in place and returns the
    (possibly collapsed) node.

    Args:
        tree: dict node built by createTree, or a leaf value.
        testData: np.matrix of held-out samples, last column is y.
    Returns:
        The pruned subtree (dict) or a merged leaf value (float).
    """
    m,n = np.shape(testData)
    if m == 0: ## empty test set: no evidence either way, collapse to the mean
        return GetMean(tree)
    
    # Route the test rows down the same split the training data used.
    if IsTree(tree['left']) or IsTree(tree['right']): ## split the test data
       lMat, rMat = BinSplitDataSet(testData, tree['spInd'], tree['spVal'])
       
    if IsTree(tree['left']):
        tree['left']= Prune(tree['left'],lMat)
    
    if IsTree(tree['right']):
        tree['right']= Prune(tree['right'],rMat)
        
    # Both children are leaves now (possibly after pruning above):
    # decide whether merging them reduces the test error.
    if not IsTree(tree['left']) and not IsTree(tree['right']):
        lMat, rMat =  BinSplitDataSet(testData, tree['spInd'], tree['spVal'])
        
        #print("\n tree['left']",tree['left'],"\t tree['right']",tree['right'])
        errorNoMerge = np.sum(np.power(lMat[:,-1]-tree['left'],2))+ np.sum(np.power(rMat[:,-1]-tree['right'],2))
        #print("\n errorM ::",errorNoMerge)
        
        treeMean = (tree['left']+tree['right'])/2.0
        errorMerge = np.sum(np.power(testData[:,-1]-treeMean,2))
        if errorMerge< errorNoMerge:
            print("\n Merget:  ", errorNoMerge-errorMerge)
            return treeMean
        else:
            return tree
    else: 
        return tree
 
"""
预测
Args
   data: 数据集
   tree: 模型
   modelEval : 预测树
"""
def TreeForeCast(tree, data):
     
    if not IsTree(tree): ##已经是叶节点
        y = float(tree)
        return y
    
    #print("\n tree['spVal']",tree['spInd'], "\t ",data)
    key = tree['spInd']; value =tree['spVal']
    if data[0,key]> value:  
        if IsTree(tree['left']):
            return TreeForeCast(tree['left'],data)
        else:
            return  float(tree['left'])
    
    else:
        if IsTree(tree['right']):
            return TreeForeCast(tree['right'],data)
        else:
            return float(tree['right'])
        
"""
预测
Args
   data: 数据集
   tree: 模型
   
"""
def ForeCast(tree, testData):
    m,n = np.shape(testData)
    
    yHat = np.mat(np.zeros((m,1)))
    X = np.arange(0,m)
    Y =[]
   
    for i in range(m):
        yHat[i,0]= TreeForeCast(tree, testData[i,0:n-1])
        print("\n 预测值  ",yHat[i,0],"\t 实际值: ", testData[i,-1])
        Y.append(yHat[i,0]-testData[i,-1])
    
    
    Y.sort()
    plt.scatter(X,Y,c='r')
   # plt.scatter(X,Y2,c='g')
    return yHat

    
def Test():
    """Driver: train a regression tree on ex2.txt, post-prune it with
    ex2test.txt, then print/plot predictions on the test set.

    NOTE(review): expects ex2.txt and ex2test.txt in the current working
    directory — confirm before running.
    """
    
    path = os.path.abspath("ex2.txt")
    trainData = LoadDataSet(path)
    myTree = createTree(trainData,ops=(1,2))
    print("\n ***MyTree ******** \n ", myTree)
    
    
    path2 = os.path.abspath("ex2test.txt")
    testData = LoadDataSet(path2)
    newTree = Prune(myTree, testData)
    print("\n prune Tree: \n ",newTree)
    
    ForeCast(newTree, testData)

# Runs at import time (script-style module).
Test()

       

二 模型树

   

      流程与上面的 CART 相同,所不同的是:最优切分点的误差计算使用线性回归模型。

      1: 叶节点是一个权值系数,不再是预测值

                 预测值为:    y=X^TW+b

   

    myTree = createTree(trainData, ModelLeaf, ModelErr, ops=(1,5))

 

  

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 11 16:56:19 2019

@author: chengxf2
"""

import numpy as np
import os
import matplotlib.pyplot as plt

"""
加载数据集
"""
def LoadDataSet(fileName):
    
    dataMat = []
    
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        floatLine = list(map(float, curLine))
        
        dataMat.append(floatLine)
    #print("\n dataMat \n ",dataMat)
    return np.mat(dataMat)
"""
判断是否是树
"""
def IsTree(obj):
    return type(obj).__name__=='dict'


def BinSplitDataSet(dataSet, feature, value):
    """Binary-split dataSet on one feature column.

    Args:
        dataSet: np.matrix of samples.
        feature: column index to split on.
        value: split threshold.
    Returns:
        (matL, matR): rows where column > value, and rows where
        column <= value, respectively.
    """
    upperRows = np.nonzero(dataSet[:, feature] > value)[0]
    lowerRows = np.nonzero(dataSet[:, feature] <= value)[0]
    return dataSet[upperRows, :], dataSet[lowerRows, :]
"""
线性模型
Args
    dataSet: 数据集
return
    ws: 权值系数
    X: 样本集
    Y: 标签集
"""    
def LinearSolve(dataSet):
    
    
    m,n = np.shape(dataSet)
    X = np.mat(np.ones((m,n))); Y = np.mat(np.ones((m,1)))  
    
    X[:,1:n]=dataSet[:,0:n-1]
    Y = dataSet[:,-1]
    
    A = X.T*X
    if np.linalg.det(A)==0: ##也可以通过岭回归解决 需要增加第个值,或者正规化
        raise NameError("\n ********非线性**************")
    W = np.linalg.inv(A)*(X.T*Y)
    return W,X,Y

"""
生成叶
Args
    dataSet: 数据集
"""
def ModelLeaf(dataSet):
    W, X,Y = LinearSolve(dataSet)
    print("\n ****** W **********\n",W)
    return W

"""
模型错误
Args
   dataSet: 数据集
return
   None
"""
def ModelErr(dataSet):
    W,X,Y = LinearSolve(dataSet)
    
    yHat = X*W## 预测值

    return np.sum(np.power(Y-yHat,2))


"""
选择最优特征去切分
Args
  dataSet: 数据集
  leafType: 当前均值
  errType: 方差

return
   最优属性
"""
def chooseBestSplit(dataSet, leafType=RegLeaf, errType = RegErr, ops=(1,4)):
    minDiff = ops[0]; minNum = ops[1]
    
    #print("\n dataSet ",dataSet)
    
    n = len(set(dataSet[:,-1].T.tolist()[0]))
    print("\n  ****chooseBestSplit**** \n ",n)
    if n == 1: ##只有一类
        return None, leafType(dataSet)
    
    m,n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = np.inf ; bestIndex = -1; bestValue = -1
    
    for featIndex in range(n-1): ##最后一列是y
        
        for val in set(dataSet[:,featIndex].T.tolist()[0]):
            # print("\n val ",val)
             matL, matR =  BinSplitDataSet(dataSet, featIndex, val)
             
             m1,n1 = np.shape(matL)
             m2,n2 = np.shape(matR)
             
             if m1<minNum or m2<minNum:
                 continue
             
             newS = errType(matL)+errType(matR)
             
             if newS<bestS:
                 bestS = newS
                 bestIndex = featIndex
                 bestValue = val
                 
    
    if (S-bestS)<minDiff:##以前无法划分了
        
        return None,leafType(dataSet)
    """  把预剪枝部分删除
    m1,n1 = np.shape(matL)
    m2,n2 = np.shape(matR)
    if m1<minNum or m2<minNum:
        print("\n *********m1    ****************")
        return None,leafType(dataSet)
    """
    return bestIndex, bestValue


"""
创建树
Args
   dataMat 数据集
   leafType: 叶节点,均值
   errType: 方差
   ops() tuple[0] 终止条件1,当前切分后变化下限  tuple[1] 叶节点个数
   
"""
def createTree(dataMat, leafType = RegLeaf, errType = RegErr, ops=(1,4)):
    print("\n ********createTree******* \n ")
    feat, val = chooseBestSplit(dataMat, leafType, errType, ops)
    print("\n chooseBestSplit End feat ",feat, "\t val: ",val)
    if feat is None:
        print("\n Feat is None return val")
        return val
    retTree ={}
    
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = BinSplitDataSet(dataMat, feat, val)
    retTree['left']= createTree(lSet, leafType,errType,ops)
    retTree['right']= createTree(rSet, leafType,errType,ops)
    
    return retTree


"""
Cart的输出值
Args
   model: 叶节点的值
   
"""
def regTreeEval(model,data):
    
    return float(model)

"""
树回归的输出值
Args:
    dataMat: 数据集
    W: 权值系数
return:
    Y: 输出值=X*W+B
"""
def modelTreeEval(W, dataMat):
    
    m,n = np.shape(dataMat)
    
   
    X = np.mat(np.ones((1,n+1)))
    X[:,1:n+1]=dataMat
    return float(X*W)

"""
预测
Args
   data: 数据集
   tree: 模型
   modelEval : 预测树
"""
def TreeForeCast(tree, data, modelEval= modelTreeEval):
     
    if not IsTree(tree):
        y = modelEval(tree, data)
        return y
    
    #print("\n tree['spVal']",tree['spInd'], "\t ",data)
    key = tree['spInd']; value =tree['spVal']
    if data[0,key]> value:  
        if IsTree(tree['left']):
            return TreeForeCast(tree['left'],data, modelEval)
        else:
            return modelEval(tree['left'], data)
    else:
        if IsTree(tree['right']):
            return TreeForeCast(tree['right'],data, modelEval)
        else:
            return modelEval(tree['right'], data)
        
"""
预测
Args
   data: 数据集
   tree: 模型
   
"""
def CreateForeCast(tree, testData, modelEval = modelTreeEval):
    m,n = np.shape(testData)
    
    yHat = np.mat(np.zeros((m,1)))
    X = np.arange(0,m)
    Y1 =[]
    Y2 =[]
    for i in range(m):
        yHat[i,0]= TreeForeCast(tree, testData[i,0:n-1], modelEval)
        print("\n 预测值  ",yHat[i,0],"\t 实际值: ", testData[i,-1])
        Y1.append(yHat[i,0]-testData[i,-1])
        Y2.append(testData[i,-1])
    
    Y1.sort()
    plt.scatter(X,Y1,c='r')
   # plt.scatter(X,Y2,c='g')
    return yHat


def Test():
    """Driver: train a model tree on Modeltrain.txt, then print/plot
    predictions on ModelTest.txt.

    NOTE(review): expects Modeltrain.txt and ModelTest.txt in the current
    working directory — confirm before running.
    """
    
    path = os.path.abspath("Modeltrain.txt")
    trainData = LoadDataSet(path)
   # print("\n trainData ",trainData)
    myTree = createTree(trainData, ModelLeaf, ModelErr, ops=(1,20))
    
    print("\n myTree ",myTree)
    
    print("\n **********开始预测*******************\n")
    
    path2 = os.path.abspath("ModelTest.txt")
    testData = LoadDataSet(path2)
    CreateForeCast(myTree, testData, modelTreeEval)

# Runs at import time (script-style module).
Test()

参考文档:

   《机器学习实战》

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值