# Hand-written first version of XGBoost; uploading as-is, bugs to be fixed over time.
#coding=UTF-8
from collections import Counter
from functools import reduce

import numpy as np
var_std = .5
def gini(fea, val, data):
    """
    Weighted gini impurity after splitting `data` on categorical column `fea`
    into the subset equal to `val` and the subset not equal to `val`.
    The label is assumed to be the last column of `data`.
    """
    def _impurity(labels):
        # gini impurity 1 - sum(p_k^2); an empty subset contributes 0
        if labels.shape[0] == 0:
            return .0
        # bug fix: in Python 3, Counter(...).values() is a dict_values view and
        # np.asarray() turns it into a 0-d object array — wrap in list() first
        counts = np.asarray(list(Counter(labels).values()), dtype=float)
        probs = counts / labels.shape[0]
        return 1 - np.sum(probs**2)
    dataSize = data.shape[0]
    assert dataSize > 0, "data is None"
    # split the rows into fea == val and fea != val
    feaCol = data[:, fea]
    equalValSubset = data[np.where(feaCol == val)]
    notEqualValSubset = data[np.where(~(feaCol == val))]
    p1 = 1.0 * equalValSubset.shape[0] / dataSize
    p2 = 1.0 * notEqualValSubset.shape[0] / dataSize
    # impurity of each side, weighted by its share of the samples
    return p1 * _impurity(equalValSubset[:, -1]) + p2 * _impurity(notEqualValSubset[:, -1])
def lsrt(fea, val, data):
    """
    Least-squares split criterion for a continuous feature: total sum of squared
    errors of the two subsets obtained by thresholding column `fea` at `val`
    (<= goes left, > goes right).  The label is the last column of `data`.
    """
    dataSize = data.shape[0]
    assert dataSize > 0, "data is None"
    feaCol = data[:, fea]
    # bug fix: the original used [-1] which selects the last ROW of the subset;
    # the label column is [:, -1]
    leftY = data[np.where(feaCol <= val)][:, -1]
    rightY = data[np.where(feaCol > val)][:, -1]
    def _sse(y):
        # an empty side contributes no error (mean of an empty array is nan)
        return 0.0 if y.shape[0] == 0 else np.sum((y - y.mean())**2)
    return _sse(leftY) + _sse(rightY)
def constructLeafNode(idx, isRegress, datas, splitFea=None, splitVal=None, featureSet=None, rightNode=None, leftNode=None, parent=None):
    """
    Build a leaf node for the sample subset `idx` (row indices into `datas`).
    Classification : leaf value is the mode of the subset's labels.
    Regression     : leaf value is the mean of the subset's labels.
    """
    labels = datas[idx][:, -1]
    if isRegress:
        # bug fix: the original averaged over ALL columns (datas[idx].mean()),
        # not just the label column
        return leafNode(idx, None, None, None, labels.mean(), parent=parent)
    else:
        return leafNode(idx, None, None, None, Counter(labels).most_common(1)[0][0], parent=parent)
def predict(root, x):
    """
    Walk the tree from `root` down to the leaf matching sample `x` and return
    that leaf's stored prediction.  Returns None when root is None.
    """
    node = root
    if node:
        # bug fix: the original compared type(node) to the STRING 'innerNode',
        # which is always False, so the loop never ran and root.y was returned.
        # Exact-type test on purpose: leafNode subclasses innerNode, so an
        # isinstance() check would never terminate at a leaf.
        while type(node) is innerNode:
            if x[node.splitFea] <= node.splitVal:
                node = node.leftNode
            else:
                node = node.rightNode
        return node.y
class innerNode(object):
    """
    Internal node of a CART tree.  Records the sample subset it covers, the
    feature set still available below it, the chosen split (feature, value)
    and links to children and parent.
    """
    def __init__(self, sampleIdx, featureSet, splitFea=None, splitVal=None, rightNode=None, leftNode=None, parent=None):
        # indices of the training rows that reached this node
        self.sampleIdx = sampleIdx
        # feature columns still usable for splits in this subtree
        self.featureSet = featureSet
        # the split decision: column index and threshold / category value
        self.splitFea, self.splitVal = splitFea, splitVal
        # tree links
        self.leftNode, self.rightNode = leftNode, rightNode
        self.parent = parent

    def setSplitAttr(self, tup):
        """Assign (splitFea, splitVal) from a 2-tuple."""
        self.splitFea, self.splitVal = tup

    def setRightNode(self, rightNode):
        """Attach the right child."""
        self.rightNode = rightNode

    def setLeftNode(self, leftNode):
        """Attach the left child."""
        self.leftNode = leftNode
class leafNode(innerNode):
    """
    Leaf of a CART tree.  `y` holds the prediction stored at the leaf: the
    label mode for a classification tree, the label mean for a regression tree.
    """
    def __init__(self, sampleIdx, featureSet, splitFea, splitVal, y, rightNode=None, leftNode=None, parent=None):
        super(leafNode, self).__init__(sampleIdx, featureSet,
                                       splitFea=splitFea, splitVal=splitVal,
                                       rightNode=rightNode, leftNode=leftNode,
                                       parent=parent)
        # the value returned for any sample that reaches this leaf
        self.y = y
class cart:
    """
    CART decision tree.  Splits are scored with gini (categorical columns) or
    least squares (continuous columns); growth stops when a node is pure, its
    label variance falls below var_std, no features remain, or the best split's
    impurity reduction drops below `epsilon`.
    """
    def __init__(self, epsilon=.1):
        self.root = None        # tree root, set by build()
        self.datas = None       # training matrix; last column is y
        self.isRegress = None   # True when labels are floats (regression tree)
        self.epsilon = epsilon  # minimum impurity reduction to keep splitting

    def __build(self, idx, featureSet, parent=None):
        """
        Recursively grow the subtree covering sample indices `idx` (row indices
        into self.datas) using the candidate columns in `featureSet`.
        """
        if not featureSet:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the variance stop read the WHOLE dataset (self.datas[:, -1]);
        # it must look at this node's subset only
        if np.unique(self.datas[idx][:, -1]).shape[0] == 1 or self.datas[idx][:, -1].var() < var_std:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        currentData = self.datas[idx]
        # every (feature, distinct value) pair is a candidate split
        candidates = [(f, v) for f in featureSet for v in np.unique(currentData[:, f])]
        # impurity of the unsplit node, needed to turn scores into gains
        y = currentData[:, -1]
        sseParent = np.sum((y - y.mean()) ** 2)
        counts = np.asarray(list(Counter(y).values()), dtype=float)
        giniParent = 1 - np.sum((counts / y.shape[0]) ** 2)
        # bug fix: gini/lsrt return impurities (smaller is better) but the
        # original took max() over them, selecting the WORST split.  Convert
        # each score to a gain (parent impurity - split impurity) and maximize.
        gains = [(sseParent - lsrt(f, v, currentData))
                 if isinstance(currentData[:, f][0], float)
                 else (giniParent - gini(f, v, currentData))
                 for f, v in candidates]
        maxGain = max(gains)
        if maxGain < self.epsilon:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        bestFea, bestVal = candidates[gains.index(maxGain)]
        currentNode = innerNode(idx, featureSet, splitFea=bestFea, splitVal=bestVal, parent=parent)
        # bug fix: list.remove() mutates in place and returns None — build a copy
        childFeatures = [f for f in featureSet if f != bestFea]
        col = currentData[:, bestFea]
        if isinstance(col[0], float):
            leftMask = col <= bestVal   # continuous feature: threshold split
        else:
            leftMask = col == bestVal   # categorical feature: equality split
        leftIdx = idx[np.where(leftMask)]
        rightIdx = idx[np.where(~leftMask)]
        # guard: a degenerate split would recurse on an empty subset
        if leftIdx.shape[0] == 0 or rightIdx.shape[0] == 0:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the recursion called self.build (wrong method, wrong signature)
        currentNode.leftNode = self.__build(leftIdx, childFeatures, currentNode)
        currentNode.rightNode = self.__build(rightIdx, childFeatures, currentNode)
        return currentNode

    def build(self, datas):
        """Train on `datas`; the last column of `datas` is the label y."""
        self.datas = datas
        # np.float64 is a subclass of float, so this detects float label matrices
        self.isRegress = isinstance(datas[:, -1][0], float)
        # bug fix: the feature set must exclude the label column
        self.root = self.__build(np.arange(datas.shape[0]), list(range(datas.shape[1] - 1)))

    def predict(self, x):
        """Return the tree's prediction for feature vector `x`."""
        if not self.root:
            # bug fix: raising a plain string is a TypeError in Python 3
            raise RuntimeError("the model isn't trained")
        currentNode = self.root
        # exact-type test on purpose: leafNode subclasses innerNode, so an
        # isinstance() check would never stop; the original compared type()
        # to the STRING 'innerNode' (always False) and returned root.y
        while type(currentNode) is innerNode:
            # NOTE(review): categorical splits are built with ==, but traversal
            # uses <= — fine for the 0/1/2 codes used below; confirm for other data
            if x[currentNode.splitFea] <= currentNode.splitVal:
                currentNode = currentNode.leftNode
            else:
                currentNode = currentNode.rightNode
        return currentNode.y

    def prune(self, root):
        """Post-pruning — not implemented yet."""
        pass
class trees(object):
    """Container for the boosted ensemble's base trees (their root nodes)."""
    def __init__(self):
        self.treeList = []  # root nodes, in the order they were added
        self.treeNum = 0    # number of trees currently in the ensemble

    def addTree(self, newTree):
        """Append one tree root and bump the counter."""
        self.treeList.append(newTree)
        self.treeNum += 1

    def fkFun(self, x):
        """
        Additive ensemble score F_k(x): sum of every tree's prediction for x.
        Returns None while the ensemble is empty.
        """
        if not self.treeList:
            return None
        total = 0
        for root in self.treeList:
            total += predict(root, x)
        return total
class Xgboost(object):
    """
    Gradient-boosted trees with logistic loss.

    gamma      : per-split complexity penalty (xgboost's gamma)
    lam        : L2 regularization on leaf weights (xgboost's lambda)
    maxNumTree : maximum number of base trees
    stopCon    : early-stop threshold (currently unused)
    epsilon    : minimum structural gain required to split
    """
    def __init__(self, gamma=.9, lam=.7, maxNumTree=100, stopCon=.3, epsilon=.1):
        self.lam = lam
        self.datas = None
        self.gamma = gamma
        self.forest = trees()
        self.isRegress = None
        self.stopCon = stopCon
        self.epsilon = epsilon
        self.maxNumTree = maxNumTree

    def __gains(self, fea, val, idx, ht, gt, data):
        """
        xgboost structural gain for splitting samples `idx` (GLOBAL row indices
        into `data`) on column `fea` at threshold `val`:
            0.5 * (GL^2/(HL+lam) + GR^2/(HR+lam) - G^2/(H+lam)) - gamma
        """
        gtV = gt[idx].sum()
        htV = ht[idx].sum()
        mask = data[idx][:, fea] <= val
        leftIdx = idx[np.where(mask)]
        rightIdx = idx[np.where(~mask)]
        lgtV, lhtV = gt[leftIdx].sum(), ht[leftIdx].sum()
        rgtV, rhtV = gt[rightIdx].sum(), ht[rightIdx].sum()
        # bug fixes: the right denominator used rgtV (gradient) instead of rhtV
        # (hessian), and `htv` was an undefined name (case typo for htV)
        return .5 * (lgtV**2 / (lhtV + self.lam)
                     + rgtV**2 / (rhtV + self.lam)
                     - gtV**2 / (htV + self.lam)) - self.gamma

    def __build(self, idx, featureSet, gt, ht, parent=None):
        """Grow one boosted tree over sample indices `idx` from gradients/hessians."""
        if not featureSet:
            return constructLeafNode(idx, self.isRegress, self.datas, parent=parent)
        # bug fix: the variance stop must look at this node's subset, not all data
        if np.unique(self.datas[idx][:, -1]).shape[0] == 1 or self.datas[idx][:, -1].var() < var_std:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        currentData = self.datas[idx]
        # every (feature, distinct value) pair observed at this node is a candidate
        candidates = [(f, v) for f in featureSet for v in np.unique(currentData[:, f])]
        # bug fix: __gains indexes with the GLOBAL indices idx, so it must receive
        # the full matrix self.datas, not the already-subset currentData
        gains = [self.__gains(f, v, idx, ht, gt, self.datas) for f, v in candidates]
        maxGain = max(gains)
        if maxGain < self.epsilon:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        bestFea, bestVal = candidates[gains.index(maxGain)]
        currentNode = innerNode(idx, featureSet, splitFea=bestFea, splitVal=bestVal, parent=parent)
        # bug fix: list.remove() returns None; build a copy instead
        childFeatures = [f for f in featureSet if f != bestFea]
        mask = currentData[:, bestFea] <= bestVal
        leftIdx = idx[np.where(mask)]
        rightIdx = idx[np.where(~mask)]
        # guard against degenerate splits that would recurse on an empty subset
        if leftIdx.shape[0] == 0 or rightIdx.shape[0] == 0:
            return constructLeafNode(idx, self.isRegress, self.datas, featureSet=featureSet, parent=parent)
        # bug fix: the recursion called self.build, which doesn't exist on this class
        currentNode.leftNode = self.__build(leftIdx, childFeatures, gt, ht, currentNode)
        currentNode.rightNode = self.__build(rightIdx, childFeatures, gt, ht, currentNode)
        return currentNode

    def train(self, datas):
        """
        Fit the ensemble on `datas`, whose last column is y (0/1 labels).
        The first tree is a plain CART; each later tree fits the logistic-loss
        gradients/hessians of the current ensemble score.
        """
        self.datas = datas
        self.isRegress = isinstance(datas[:, -1][0], float)
        firstTree = cart()
        firstTree.build(datas)
        self.forest.addTree(firstTree.root)
        featureSet = list(range(datas.shape[1] - 1))  # bug fix: exclude label column
        while self.forest.treeNum < self.maxNumTree:
            scores = np.array([self.forest.fkFun(row[0:-1]) for row in datas], dtype=float)
            # sigmoid via np.exp(-s) — equivalent to the original e**f forms but
            # avoids overflow for large positive scores
            prob = 1.0 / (1.0 + np.exp(-scores))
            gtList = prob - datas[:, -1]   # dL/df of logistic loss (same as 1-1/(1+e^f)-y)
            htList = prob * (1.0 - prob)   # d2L/df2 (same as e^f/(1+e^f)^2)
            self.forest.addTree(self.__build(np.arange(datas.shape[0]), featureSet, gtList, htList))

    def predict(self, x):
        """Return P(y=1 | x): sigmoid of the ensemble's additive score (was a stub)."""
        if self.forest.treeNum == 0:
            raise RuntimeError("the model isn't trained")
        return 1.0 / (1.0 + np.exp(-self.forest.fkFun(x)))
if __name__ == "__main__":
samples = np.array([[0,0,0,0,0],
[0,0,0,1,0],
[0,1,0,1,1],
[0,1,1,0,1],
[0,0,0,0,0],
[1,0,0,0,0],
[1,0,0,1,0],
[1,1,1,1,1],
[1,0,1,2,1],
[1,0,1,2,1],
[2,0,1,2,1],
[2,0,1,1,1],
[2,1,0,1,1],
[2,1,0,2,1],
[2,0,0,0,0]])
xg = Xgboost()
xg.train(samples)
print(xg.forest.treeNum)