1. The CART algorithm
When a dataset has many features with complex interactions, or when the relationship to be modeled is nonlinear, a single global linear model cannot fit the data well. A practical alternative is to first partition the data into pieces that are easy to model, and then fit a regression model within each piece.
CART (Classification And Regression Trees) is a well-known tree-building algorithm that can be used for both classification and regression. It handles continuous variables with binary splits: if a feature value is greater than the given threshold, the sample goes to the left subtree; otherwise it goes to the right subtree.
import numpy as np

def loadDataSet(fileName):
    """Load a tab-delimited file into a list of float rows (callers wrap the result with np.mat)."""
    dataSet = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # list() is required in Python 3, where map is lazy
        dataSet.append(fltLine)
    return dataSet
def binSplitDataSet(dataSet, feature, value):
    """Binary split of the data matrix: rows with feature > value go to mat0, the rest to mat1."""
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1
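A quick way to sanity-check binSplitDataSet (a toy example of my own, not from the original text): split the 4x4 identity matrix on feature 1 with threshold 0.5.

testMat = np.mat(np.eye(4))                    # toy dataset: 4x4 identity matrix
mat0, mat1 = binSplitDataSet(testMat, 1, 0.5)
print(mat0)  # the one row whose feature 1 is > 0.5: [[0. 1. 0. 0.]]
print(mat1)  # the remaining three rows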
def regLeaf(dataSet):
    """Leaf model for a regression tree: the mean of the target values."""
    return np.mean(dataSet[:, -1])

def regErr(dataSet):
    """Total squared error of the targets: variance times the number of samples."""
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]
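For intuition (a toy check, not from the original text): regErr returns the total squared deviation sum((y_i - mean)^2), because variance times the sample count is exactly that sum.

demo = np.mat([[0., 1.], [0., 2.], [0., 3.]])
print(regLeaf(demo))  # 2.0, the mean of the last column
print(regErr(demo))   # 2.0 = (1-2)^2 + (2-2)^2 + (3-2)^2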
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    tolS = ops[0]; tolN = ops[1]  # tolS: minimum error reduction; tolN: minimum samples per side
    # if all target values are equal, stop splitting and return a leaf
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = np.shape(dataSet)
    S = errType(dataSet)
    bestS = np.inf
    bestIndex = 0
    bestValue = 0
    for featIndex in range(n - 1):
        # try every distinct value of this feature as a candidate threshold
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)  # total error of the two halves (sum, not product)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # pre-pruning: give up if the best split barely reduces the error
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # pre-pruning: give up if either side of the best split is too small
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None: return val  # no good split: return the leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree
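To see createTree end to end, here is a minimal sketch on synthetic piecewise-constant data; the data generator and seed are illustrative assumptions, not part of the original code. Because mat0 holds the rows with feature value greater than the threshold, the left leaf ends up near 3.0 and the right leaf near 1.0.

np.random.seed(0)                              # hypothetical seed, for reproducibility only
x = np.sort(np.random.rand(200, 1), axis=0)
y = np.where(x < 0.5, 1.0, 3.0) + 0.1 * np.random.randn(200, 1)  # step function plus noise
myMat = np.mat(np.hstack((x, y)))
tree = createTree(myMat)
print(tree)  # nested dict, e.g. {'spInd': 0, 'spVal': ..., 'left': ~3.0, 'right': ~1.0}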
A regression tree can overfit the training data, and pruning guards against this. Pruning comes in two flavors: pre-pruning, which constrains tree growth while the tree is being built (the tolS and tolN conditions above), and post-pruning, which simplifies the tree after it has been built.
Post-pruning compares the test-set error before and after collapsing a pair of leaves into their mean; if merging reduces the error, the subtree is pruned.
def isTree(obj):
    """A subtree is stored as a dict; a leaf is a plain value."""
    return (type(obj).__name__ == 'dict')

def getMean(tree):
    """Collapse a subtree bottom-up into the mean of its leaf values."""
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    return (tree['left'] + tree['right']) / 2.0
def prune(tree, testData):
    # no test data reaches this subtree: collapse it to its mean
    if np.shape(testData)[0] == 0: return getMean(tree)
    if isTree(tree['left']) or isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
        if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    # both children are now leaves: test whether merging them lowers the test error
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = np.sum(np.power(lSet[:, -1] - tree['left'], 2)) + \
                       np.sum(np.power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = np.sum(np.power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print('merge')
            return treeMean
        else:
            return tree
    else:
        return tree
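A usage sketch for post-pruning, assuming a hypothetical noisy-sine data generator: grow a deliberately overgrown tree by disabling pre-pruning, then prune it against held-out data.

def noisySine(n):                              # hypothetical generator, not from the original
    x = np.sort(np.random.rand(n, 1), axis=0)
    y = np.sin(4 * x) + 0.1 * np.random.randn(n, 1)
    return np.mat(np.hstack((x, y)))

np.random.seed(1)
trainMat, testMat = noisySine(200), noisySine(100)
bigTree = createTree(trainMat, ops=(0, 1))     # tolS=0, tolN=1: almost no pre-pruning
prunedTree = prune(bigTree, testMat)           # prints 'merge' each time two leaves collapse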
2. Model trees
Compared with a regression tree, a model tree stores a linear model at each leaf instead of the mean of the target values.
def linearSolve(dataSet):
    """Fit a linear model to the data; helper used by both modelLeaf and modelErr."""
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n))); Y = np.mat(np.ones((m, 1)))
    X[:, 1:n] = dataSet[:, 0:n-1]; Y = dataSet[:, -1]  # prepend a column of 1s for the intercept, strip out Y
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)  # ordinary least squares: ws = (X^T X)^-1 X^T Y
    return ws, X, Y

def modelLeaf(dataSet):
    """Leaf model for a model tree: the fitted linear coefficients."""
    ws, X, Y = linearSolve(dataSet)
    return ws

def modelErr(dataSet):
    """Error for a model tree: sum of squared residuals of the linear fit."""
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return np.sum(np.power(Y - yHat, 2))
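To close the section, a sketch that grows a model tree on piecewise-linear data and predicts one point. modelTreeForecast is a hypothetical helper written for this example; note that in this implementation, samples with feature value greater than spVal go to the left subtree.

def modelTreeForecast(tree, inDat):            # hypothetical helper: predict one 1xN sample
    if not isTree(tree):                       # at a leaf, evaluate the linear model X * ws
        X = np.mat(np.ones((1, np.shape(inDat)[1] + 1)))
        X[:, 1:] = inDat
        return float(X * tree)
    if inDat[0, tree['spInd']] > tree['spVal']:
        return modelTreeForecast(tree['left'], inDat)
    return modelTreeForecast(tree['right'], inDat)

np.random.seed(2)
x = np.sort(np.random.rand(200, 1), axis=0)
y = np.where(x < 0.5, 1.0 + 3.0 * x, 4.0 - 2.0 * x) + 0.05 * np.random.randn(200, 1)
myMat = np.mat(np.hstack((x, y)))
modelTree = createTree(myMat, modelLeaf, modelErr, ops=(1, 10))
print(modelTreeForecast(modelTree, np.mat([[0.25]])))  # roughly 1 + 3 * 0.25 = 1.75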