# Hand-written first version of XGBoost; uploading as-is, bugs to be fixed over time.
#coding=UTF-8
from collections import Counter
from functools import reduce

import numpy as np
var_std = .5
def gini(fea, val, data):
    """
    Weighted gini impurity after splitting `data` on categorical column `fea`
    into the subset equal to `val` and the subset not equal to `val`.
    The label is assumed to be the last column of `data`.
    """
    def _impurity(labels):
        # gini impurity 1 - sum(p_k^2); an empty subset contributes 0
        if labels.shape[0] == 0:
            return .0
        # bug fix: in Python 3, Counter(...).values() is a dict_values view and
        # np.asarray() turns it into a 0-d object array — wrap in list() first
        counts = np.asarray(list(Counter(labels).values()), dtype=float)
        probs = counts / labels.shape[0]
        return 1 - np.sum(probs**2)
    dataSize = data.shape[0]
    assert dataSize > 0, "data is None"
    # split the rows into fea == val and fea != val
    feaCol = data[:, fea]
    equalValSubset = data[np.where(feaCol == val)]
    notEqualValSubset = data[np.where(~(feaCol == val))]
    p1 = 1.0 * equalValSubset.shape[0] / dataSize
    p2 = 1.0 * notEqualValSubset.shape[0] / dataSize
    # impurity of each side, weighted by its share of the samples
    return p1 * _impurity(equalValSubset[:, -1]) + p2 * _impurity(notEqualValSubset[:, -1])
def lsrt(fea, val, data):
    """
    Least-squares split criterion for a continuous feature: total sum of squared
    errors of the two subsets obtained by thresholding column `fea` at `val`
    (<= goes left, > goes right).  The label is the last column of `data`.
    """
    dataSize = data.shape[0]
    assert dataSize > 0, "data is None"
    feaCol = data[:, fea]
    # bug fix: the original used [-1] which selects the last ROW of the subset;
    # the label column is [:, -1]
    leftY = data[np.where(feaCol <= val)][:, -1]
    rightY = data[np.where(feaCol > val)][:, -1]
    def _sse(y):
        # an empty side contributes no error (mean of an empty array is nan)
        return 0.0 if y.shape[0] == 0 else np.sum((y - y.mean())**2)
    return _sse(leftY) + _sse(rightY)
def constructLeafNode(idx, isRegress, datas, splitFea=None, splitVal=None, featureSet=None, rightNode=None, leftNode=None, parent=None):
    """
    Build a leaf node for the sample subset `idx` (row indices into `datas`).
    Classification : leaf value is the mode of the subset's labels.
    Regression     : leaf value is the mean of the subset's labels.
    """
    labels = datas[idx][:, -1]
    if isRegress:
        # bug fix: the original averaged over ALL columns (datas[idx].mean()),
        # not just the label column
        return leafNode(idx, None, None, None, labels.mean(), parent=parent)
    else:
        return leafNode(idx, None, None, None, Counter(labels).most_common(1)[0][0], parent=parent)
def predict(root, x):
    """
    Walk the tree from `root` down to the leaf matching sample `x` and return
    that leaf's stored prediction.  Returns None when root is None.
    """
    node = root
    if node:
        # bug fix: the original compared type(node) to the STRING 'innerNode',
        # which is always False, so the loop never ran and root.y was returned.
        # Exact-type test on purpose: leafNode subclasses innerNode, so an
        # isinstance() check would never terminate at a leaf.
        while type(node) is innerNode:
            if x[node.splitFea] <= node.splitVal:
                node = node.leftNode
            else:
                node = node.rightNode
        return node.y
class innerNode(object):
    """
    Internal node of a CART tree.  Records the sample subset it covers, the
    feature set still available below it, the chosen split (feature, value)
    and links to children and parent.
    """
    def __init__(self, sampleIdx, featureSet, splitFea=None, splitVal=None, rightNode=None, leftNode=None, parent=None):
        # indices of the training rows that reached this node
        self.sampleIdx = sampleIdx
        # feature columns still usable for splits in this subtree
        self.featureSet = featureSet
        # the split decision: column index and threshold / category value
        self.splitFea, self.splitVal = splitFea, splitVal
        # tree links
        self.leftNode, self.rightNode = leftNode, rightNode
        self.parent = parent

    def setSplitAttr(self, tup):
        """Assign (splitFea, splitVal) from a 2-tuple."""
        self.splitFea, self.splitVal = tup

    def setRightNode(self, rightNode):
        """Attach the right child."""
        self.rightNode = rightNode

    def setLeftNode(self, leftNode):
        """Attach the left child."""
        self.leftNode = leftNode
class leafNode(innerNode):
    """
    Leaf of a CART tree.  `y` holds the prediction stored at the leaf: the
    label mode for a classification tree, the label mean for a regression tree.
    """
    def __init__(self, sampleIdx, featureSet, splitFea, splitVal, y, rightNode=None, leftNode=None, parent=None):
        super(leafNode, self).__init__(sampleIdx, featureSet,
                                       splitFea=splitFea, splitVal=splitVal,
                                       rightNode=rightNode, leftNode=leftNode,
                                       parent=parent)
        # the value returned for any sample that reaches this leaf
        self.y = y
class cart:
    """
    CART decision tree.  Splits are scored with gini (categorical columns) or
    least squares (continuous columns); growth stops when a node is pure, its
    label variance falls below var_std, no features remain, or the best split's
    impurity reduction drops below `epsilon`.
    """
    def __init__(self, epsilon=.1):
        self.root = None        # tree root, set by build()
        self.datas = None       # training matrix; last column is y
        self.isRegress = None   # True when labels are floats (regression tree)
        self.epsilon = epsilon  # minimum impurity reduction to keep splitting

    def __build(self, idx, featureSet, parent=None):
        """
        Recursively grow the subtree covering sample indices `idx` (row indices
        into self.datas) using the candidate columns in `featureSet`.
        """
        if not featureSet:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the variance stop read the WHOLE dataset (self.datas[:, -1]);
        # it must look at this node's subset only
        if np.unique(self.datas[idx][:, -1]).shape[0] == 1 or self.datas[idx][:, -1].var() < var_std:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        currentData = self.datas[idx]
        # every (feature, distinct value) pair is a candidate split
        candidates = [(f, v) for f in featureSet for v in np.unique(currentData[:, f])]
        # impurity of the unsplit node, needed to turn scores into gains
        y = currentData[:, -1]
        sseParent = np.sum((y - y.mean()) ** 2)
        counts = np.asarray(list(Counter(y).values()), dtype=float)
        giniParent = 1 - np.sum((counts / y.shape[0]) ** 2)
        # bug fix: gini/lsrt return impurities (smaller is better) but the
        # original took max() over them, selecting the WORST split.  Convert
        # each score to a gain (parent impurity - split impurity) and maximize.
        gains = [(sseParent - lsrt(f, v, currentData))
                 if isinstance(currentData[:, f][0], float)
                 else (giniParent - gini(f, v, currentData))
                 for f, v in candidates]
        maxGain = max(gains)
        if maxGain < self.epsilon:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        bestFea, bestVal = candidates[gains.index(maxGain)]
        currentNode = innerNode(idx, featureSet, splitFea=bestFea, splitVal=bestVal, parent=parent)
        # bug fix: list.remove() mutates in place and returns None — build a copy
        childFeatures = [f for f in featureSet if f != bestFea]
        col = currentData[:, bestFea]
        if isinstance(col[0], float):
            leftMask = col <= bestVal   # continuous feature: threshold split
        else:
            leftMask = col == bestVal   # categorical feature: equality split
        leftIdx = idx[np.where(leftMask)]
        rightIdx = idx[np.where(~leftMask)]
        # guard: a degenerate split would recurse on an empty subset
        if leftIdx.shape[0] == 0 or rightIdx.shape[0] == 0:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the recursion called self.build (wrong method, wrong signature)
        currentNode.leftNode = self.__build(leftIdx, childFeatures, currentNode)
        currentNode.rightNode = self.__build(rightIdx, childFeatures, currentNode)
        return currentNode

    def build(self, datas):
        """Train on `datas`; the last column of `datas` is the label y."""
        self.datas = datas
        # np.float64 is a subclass of float, so this detects float label matrices
        self.isRegress = isinstance(datas[:, -1][0], float)
        # bug fix: the feature set must exclude the label column
        self.root = self.__build(np.arange(datas.shape[0]), list(range(datas.shape[1] - 1)))

    def predict(self, x):
        """Return the tree's prediction for feature vector `x`."""
        if not self.root:
            # bug fix: raising a plain string is a TypeError in Python 3
            raise RuntimeError("the model isn't trained")
        currentNode = self.root
        # exact-type test on purpose: leafNode subclasses innerNode, so an
        # isinstance() check would never stop; the original compared type()
        # to the STRING 'innerNode' (always False) and returned root.y
        while type(currentNode) is innerNode:
            # NOTE(review): categorical splits are built with ==, but traversal
            # uses <= — fine for the 0/1/2 codes used below; confirm for other data
            if x[currentNode.splitFea] <= currentNode.splitVal:
                currentNode = currentNode.leftNode
            else:
                currentNode = currentNode.rightNode
        return currentNode.y

    def prune(self, root):
        """Post-pruning — not implemented yet."""
        pass
class trees(object):
    """Container for the boosted ensemble's base trees (their root nodes)."""
    def __init__(self):
        self.treeList = []  # root nodes, in the order they were added
        self.treeNum = 0    # number of trees currently in the ensemble

    def addTree(self, newTree):
        """Append one tree root and bump the counter."""
        self.treeList.append(newTree)
        self.treeNum += 1

    def fkFun(self, x):
        """
        Additive ensemble score F_k(x): sum of every tree's prediction for x.
        Returns None while the ensemble is empty.
        """
        if not self.treeList:
            return None
        total = 0
        for root in self.treeList:
            total += predict(root, x)
        return total
class Xgboost(object):
    """
    Gradient-boosted trees with logistic loss.

    gamma      : per-split complexity penalty (xgboost's gamma)
    lam        : L2 regularization on leaf weights (xgboost's lambda)
    maxNumTree : maximum number of base trees
    stopCon    : early-stop threshold (currently unused)
    epsilon    : minimum structural gain required to split
    """
    def __init__(self, gamma=.9, lam=.7, maxNumTree=100, stopCon=.3, epsilon=.1):
        self.lam = lam
        self.datas = None
        self.gamma = gamma
        self.forest = trees()
        self.isRegress = None
        self.stopCon = stopCon
        self.epsilon = epsilon
        self.maxNumTree = maxNumTree

    def __gains(self, fea, val, idx, ht, gt, data):
        """
        xgboost structural gain for splitting samples `idx` (GLOBAL row indices
        into `data`) on column `fea` at threshold `val`:
            0.5 * (GL^2/(HL+lam) + GR^2/(HR+lam) - G^2/(H+lam)) - gamma
        """
        gtV = gt[idx].sum()
        htV = ht[idx].sum()
        mask = data[idx][:, fea] <= val
        leftIdx = idx[np.where(mask)]
        rightIdx = idx[np.where(~mask)]
        lgtV, lhtV = gt[leftIdx].sum(), ht[leftIdx].sum()
        rgtV, rhtV = gt[rightIdx].sum(), ht[rightIdx].sum()
        # bug fixes: the right denominator used rgtV (gradient) instead of rhtV
        # (hessian), and `htv` was an undefined name (case typo for htV)
        return .5 * (lgtV**2 / (lhtV + self.lam)
                     + rgtV**2 / (rhtV + self.lam)
                     - gtV**2 / (htV + self.lam)) - self.gamma

    def __build(self, idx, featureSet, gt, ht, parent=None):
        """Grow one boosted tree over sample indices `idx` from gradients/hessians."""
        if not featureSet:
            return constructLeafNode(idx, self.isRegress, self.datas, parent=parent)
        # bug fix: the variance stop must look at this node's subset, not all data
        if np.unique(self.datas[idx][:, -1]).shape[0] == 1 or self.datas[idx][:, -1].var() < var_std:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        currentData = self.datas[idx]
        # every (feature, distinct value) pair observed at this node is a candidate
        candidates = [(f, v) for f in featureSet for v in np.unique(currentData[:, f])]
        # bug fix: __gains indexes with the GLOBAL indices idx, so it must receive
        # the full matrix self.datas, not the already-subset currentData
        gains = [self.__gains(f, v, idx, ht, gt, self.datas) for f, v in candidates]
        maxGain = max(gains)
        if maxGain < self.epsilon:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        bestFea, bestVal = candidates[gains.index(maxGain)]
        currentNode = innerNode(idx, featureSet, splitFea=bestFea, splitVal=bestVal, parent=parent)
        # bug fix: list.remove() returns None; build a copy instead
        childFeatures = [f for f in featureSet if f != bestFea]
        mask = currentData[:, bestFea] <= bestVal
        leftIdx = idx[np.where(mask)]
        rightIdx = idx[np.where(~mask)]
        # guard against degenerate splits that would recurse on an empty subset
        if leftIdx.shape[0] == 0 or rightIdx.shape[0] == 0:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the recursion called self.build, which doesn't exist on this class
        currentNode.leftNode = self.__build(leftIdx, childFeatures, gt, ht, currentNode)
        currentNode.rightNode = self.__build(rightIdx, childFeatures, gt, ht, currentNode)
        return currentNode

    def train(self, datas):
        """
        Fit the ensemble on `datas`, whose last column is y (0/1 labels).
        The first tree is a plain CART; each later tree fits the logistic-loss
        gradients/hessians of the current ensemble score.
        """
        self.datas = datas
        self.isRegress = isinstance(datas[:, -1][0], float)
        firstTree = cart()
        firstTree.build(datas)
        self.forest.addTree(firstTree.root)
        featureSet = list(range(datas.shape[1] - 1))  # bug fix: exclude label column
        while self.forest.treeNum < self.maxNumTree:
            scores = np.array([self.forest.fkFun(row[0:-1]) for row in datas], dtype=float)
            # sigmoid via np.exp(-s) — equivalent to the original e**f forms but
            # avoids overflow for large positive scores
            prob = 1.0 / (1.0 + np.exp(-scores))
            gtList = prob - datas[:, -1]   # dL/df of logistic loss (same as 1-1/(1+e^f)-y)
            htList = prob * (1.0 - prob)   # d2L/df2 (same as e^f/(1+e^f)^2)
            self.forest.addTree(self.__build(np.arange(datas.shape[0]), featureSet, gtList, htList))

    def predict(self, x):
        """Return P(y=1 | x): sigmoid of the ensemble's additive score (was a stub)."""
        if self.forest.treeNum == 0:
            raise RuntimeError("the model isn't trained")
        return 1.0 / (1.0 + np.exp(-self.forest.fkFun(x)))
if __name__ == "__main__":
samples = np.array([[0,0,0,0,0],
[0,0,0,1,0],
[0,1,0,1,1],
[0,1,1,0,1],
[0,0,0,0,0],
[1,0,0,0,0],
[1,0,0,1,0],
[1,1,1,1,1],
[1,0,1,2,1],
[1,0,1,2,1],
[2,0,1,2,1],
[2,0,1,1,1],
[2,1,0,1,1],
[2,1,0,2,1],
[2,0,0,0,0]])
xg = Xgboost()
xg.train(samples)
print(xg.forest.treeNum)