def binSplitDataSet(dataSet, feature, value):
    """Split a 2-D data set into two row subsets on one feature column.

    Every row is assigned by comparing its entry in column ``feature``
    against ``value``.

    Args:
        dataSet: Two-dimensional container that supports
            ``dataSet[:, feature]`` and row selection by an index array
            (presumably a NumPy array or matrix -- confirm against callers).
        feature: Index of the column to split on.
        value: Threshold compared against every entry of that column.

    Returns:
        A ``(small, big)`` tuple. ``small`` holds the rows whose feature
        entry is ``<= value``; ``big`` holds the rows whose entry is
        ``> value``. Rows keep their original relative order, and either
        subset may be empty.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] gives the row indices of the True entries; taking
    # element 0 works both when the comparison result is 1-D and when it
    # is a single column of a 2-D matrix.
    # Two explicit comparisons are used instead of negating one mask so
    # that rows satisfying neither test (e.g. NaN) land in neither subset.
    big = dataSet[nonzero(column > value)[0], :]
    small = dataSet[nonzero(column <= value)[0], :]
    return small, big
第一个函数 binSplitDataSet 有三个参数：数据集合、待切分的特征（列索引）和该特征的某个取值。在给定特征和特征值的情况下，该函数通过数组过滤的方式将上述数据集合切分为两个子集并返回：先返回该特征小于等于给定值的子集，再返回该特征大于给定值的子集。
def chooseBestSplit(dataSet,leafType=regLeaf,errType=regErr,ops=(1,4)):
if len(set(dataSet[:,-1].T.A.tolist()[0]))==1:
return None,leafType(dataSet)
eS=ops[0]
minSample=ops[1]
bestError=errType(dataSet)
originError=errType(dataSet)
bestFeatIndex=0
bestSplitValue=0
m,n=shape(dataSet)
for featIndex in range (n-1):
for splitVa