import numpy as np
class Node:
    """Internal decision node: routes an example left when
    ``x[feature] <= threshold`` and right otherwise."""

    def __init__(self, left, right, rule):
        # `rule` is a (feature_index, threshold) pair produced by the
        # tree's split search.
        self.feature = rule[0]
        self.threshold = rule[1]
        self.left = left
        self.right = right


class Leaf:
    """Terminal node holding the prediction for examples that reach it
    (a class-probability vector for classifiers, a scalar for regressors)."""

    def __init__(self, value):
        self.value = value
class DecisionTree:
    """A CART-style binary decision tree supporting classification
    (criterion ``"entropy"`` or ``"gini"``) and regression (``"mse"``).

    The tree is grown greedily: at each node the (feature, threshold)
    pair maximizing the impurity gain over a random subset of features
    is selected.
    """

    def __init__(
        self,
        classifier=True,
        max_depth=None,
        n_feats=None,
        criterion="entropy",
        seed=None,
    ):
        """
        Parameters
        ----------
        classifier : bool
            If True, treat targets as integer class labels; otherwise as
            real-valued regression targets. Default is True.
        max_depth : int or None
            Depth at which to stop growing the tree. If None, grow until
            every leaf is pure. Default is None.
        n_feats : int or None
            Number of features to sample (without replacement) when
            searching for a split. If None, use all features. Default is
            None.
        criterion : {"entropy", "gini", "mse"}
            Impurity measure used to score candidate splits. "entropy"
            and "gini" are only valid when ``classifier`` is True; "mse"
            only when it is False. Default is "entropy".
        seed : int or None
            Seed for the global NumPy RNG (used for feature
            subsampling). Default is None (no seeding).

        Raises
        ------
        ValueError
            If ``criterion`` is incompatible with ``classifier``.
        """
        # Compare against None rather than truthiness so that seed=0 and
        # max_depth=0 are honored instead of being silently ignored.
        if seed is not None:
            np.random.seed(seed)

        self.depth = 0
        self.root = None
        self.n_feats = n_feats
        self.criterion = criterion
        self.classifier = classifier
        self.max_depth = max_depth if max_depth is not None else np.inf

        if not classifier and criterion in ["gini", "entropy"]:
            raise ValueError(
                "{} is a valid criterion only when classifier = True.".format(
                    criterion
                )
            )
        if classifier and criterion == "mse":
            raise ValueError(
                "`mse` is a valid criterion only when classifier = False."
            )

    def fit(self, X, Y):
        """Grow the tree on training data.

        Parameters
        ----------
        X : ndarray of shape (N, M)
            Feature matrix of N examples with M features each.
        Y : ndarray of shape (N,)
            Integer class labels (classifier) or real targets (regressor).
        """
        self.n_classes = max(Y) + 1 if self.classifier else None
        self.n_feats = X.shape[1] if not self.n_feats else min(self.n_feats, X.shape[1])
        self.root = self._grow(X, Y)

    def predict(self, X):
        """Return the predicted label (classifier) or value (regressor)
        for each row of X."""
        return np.array([self._traverse(x, self.root) for x in X])

    def predict_class_probs(self, X):
        """Return the predicted class-probability vector for each row of
        X. Only defined when ``classifier`` is True."""
        assert self.classifier, "`predict_class_probs` undefined for classifier = False"
        return np.array([self._traverse(x, self.root, prob=True) for x in X])

    def _grow(self, X, Y, cur_depth=0):
        """Recursively grow the subtree for the examples (X, Y)."""
        # If all labels are the same, return a pure leaf.
        if len(set(Y)) == 1:
            if self.classifier:
                prob = np.zeros(self.n_classes)
                prob[Y[0]] = 1.0
            return Leaf(prob) if self.classifier else Leaf(Y[0])

        # If we have reached max_depth, return a leaf holding the class
        # distribution (classifier) or the mean target (regressor).
        if cur_depth >= self.max_depth:
            if self.classifier:
                v = np.bincount(Y, minlength=self.n_classes) / len(Y)
            else:
                v = np.mean(Y, axis=0)
            return Leaf(v)

        cur_depth += 1
        self.depth = max(self.depth, cur_depth)

        N, M = X.shape
        feat_idxs = np.random.choice(M, self.n_feats, replace=False)

        # Greedily select the best split according to `criterion`.
        feat, thresh = self._segment(X, Y, feat_idxs)
        l = np.argwhere(X[:, feat] <= thresh).flatten()
        r = np.argwhere(X[:, feat] > thresh).flatten()

        # Grow the children that result from the split.
        left = self._grow(X[l, :], Y[l], cur_depth)
        right = self._grow(X[r, :], Y[r], cur_depth)
        return Node(left, right, (feat, thresh))

    def _segment(self, X, Y, feat_idxs):
        """Find the (feature, threshold) pair among `feat_idxs` that
        maximizes the impurity gain on (X, Y)."""
        best_gain = -np.inf
        split_idx, split_thresh = None, None
        for i in feat_idxs:
            vals = X[:, i]
            levels = np.unique(vals)
            # Candidate thresholds are midpoints between consecutive
            # unique feature values.
            thresholds = (levels[:-1] + levels[1:]) / 2 if len(levels) > 1 else levels
            gains = np.array([self._impurity_gain(Y, t, vals) for t in thresholds])
            if gains.max() > best_gain:
                split_idx = i
                best_gain = gains.max()
                split_thresh = thresholds[gains.argmax()]
        return split_idx, split_thresh

    def _impurity_gain(self, Y, split_thresh, feat_values):
        """
        Compute the impurity gain associated with a given split.

        IG(split) = loss(parent) - weighted_avg[loss(left_child), loss(right_child)]
        """
        if self.criterion == "entropy":
            loss = entropy
        elif self.criterion == "gini":
            loss = gini
        elif self.criterion == "mse":
            loss = mse

        parent_loss = loss(Y)

        # Generate the split.
        left = np.argwhere(feat_values <= split_thresh).flatten()
        right = np.argwhere(feat_values > split_thresh).flatten()

        # A split that leaves either side empty provides no gain.
        if len(left) == 0 or len(right) == 0:
            return 0

        # Weighted average of the loss for the two children.
        n = len(Y)
        n_l, n_r = len(left), len(right)
        e_l, e_r = loss(Y[left]), loss(Y[right])
        child_loss = (n_l / n) * e_l + (n_r / n) * e_r

        # Impurity gain is the difference in loss before vs. after split.
        return parent_loss - child_loss

    def _traverse(self, X, node, prob=False):
        """Route a single example X down the subtree rooted at `node` and
        return the leaf prediction (probability vector if `prob`)."""
        if isinstance(node, Leaf):
            if self.classifier:
                return node.value if prob else node.value.argmax()
            return node.value
        if X[node.feature] <= node.threshold:
            return self._traverse(X, node.left, prob)
        return self._traverse(X, node.right, prob)


def mse(y):
    """Mean squared error of `y` around its mean (regression impurity)."""
    return np.mean((y - np.mean(y)) ** 2)


def entropy(y):
    """Shannon entropy (in bits) of the integer labels in `y`."""
    hist = np.bincount(y)
    ps = hist / np.sum(hist)
    return -np.sum([p * np.log2(p) for p in ps if p > 0])


def gini(y):
    """Gini impurity of the integer labels in `y`."""
    hist = np.bincount(y)
    N = np.sum(hist)
    return 1 - sum([(i / N) ** 2 for i in hist])