RegressionTree.
main函数:训练样本,最大深度,叶子节点最小样本数,特征采样,数据采样。加载训练样本,利用样本构建regressiontree,对训练样本(这里有些问题,实际上应该对样本进行划分)进行预测。
RegressionTree:
fit:选取采样特征,递归构建树,选取切分节点(特征)。
split_node: 选取切分点,如果深度达到最大深度或者切分之后左右树没有样本或者左右孩子样本数量和小于叶子节点最小样本数,返回预测值(预测值是均值)。左右子树递归进行split_node
search_best_split: 选取切分点,选择随机采样特征,按照特征均值划分每个特征,计算被划分后的方差,选择最小方差的特征作为划分点。得到划分的特征id,划分值,划分后的左右节点样本。
predict:eval_instance,对一个样本利用tree来预测。
eval_instance:如果tree无法继续划分,就返回当前节点预测值。如果样本特征值大于划分值,就在右子树递归计算,否则在左子树递归计算。
import numpy as np
import random
class RegressionTree():
    """A CART-style regression tree.

    Each internal node splits on the sampled feature whose mean-value
    threshold minimizes the summed label variance of the two children;
    each leaf predicts the mean label of the samples that reach it.
    """

    def __init__(self,
                 max_depth=16,
                 min_leaf_sample_cnt=1,
                 feat_sample_ratio=0.8,
                 data_sample_ratio=0.8):
        self.max_depth = max_depth                      # maximum recursion depth
        self.min_leaf_sample_cnt = min_leaf_sample_cnt  # stop splitting at/below this sample count
        self.feat_sample_ratio = feat_sample_ratio      # fraction of features sampled per split
        # NOTE(review): data_sample_ratio is stored but never used in this
        # class -- row subsampling is not implemented here.
        self.data_sample_ratio = data_sample_ratio
        self.feature_cnt = 0          # number of feature columns (set in fit)
        self.feature_id_list = []     # candidate feature ids for sampling
        self.feature_sampled_cnt = 0  # how many features to sample per split

    def fit(self, data_set):
        """Train the regression tree on the given data set.

        :param data_set: 2-D ndarray; each row is the feature values
            followed by the label in the last column.
        :return: the trained tree as a nested dict (also kept on self.tree).
        """
        self.feature_cnt = len(data_set[0]) - 1
        # Materialize the range so random.sample always sees a plain sequence.
        self.feature_id_list = list(range(0, self.feature_cnt))
        self.feature_sampled_cnt = int(self.feature_cnt * self.feat_sample_ratio)
        tree = dict()
        # Recursively split the data, filling the tree dict in place.
        self.split_node(data_set, tree, 0)
        self.tree = tree
        return tree

    def split_node(self, data_set, tree, current_depth):
        """Recursively split data_set, filling *tree* in place.

        Emits a leaf (mean label) when the depth budget is exhausted,
        no usable split was found, or too few samples remain.
        """
        tree['fid'], tree['split_val'], tree['left_data_set'], tree['right_data_set'] = \
            self.search_best_split(data_set)
        # When no sampled feature had positive variance, search_best_split
        # returns empty child arrays, so the size check below fires.
        if current_depth == self.max_depth or \
                tree['left_data_set'] is None or \
                tree['right_data_set'] is None or \
                (tree['left_data_set'][:, 0].size + tree['right_data_set'][:, 0].size) <= self.min_leaf_sample_cnt:
            # Leaf prediction is the mean label of this node's samples.
            tree['predict'] = data_set[:, -1].mean()
            return None
        tree['left_node'] = dict()
        self.split_node(tree['left_data_set'], tree['left_node'], current_depth + 1)
        tree['right_node'] = dict()
        self.split_node(tree['right_data_set'], tree['right_node'], current_depth + 1)
        return None

    def search_best_split(self, data_set):
        """Pick the best (feature, threshold) split among sampled features.

        Each candidate feature is thresholded at its mean value; the
        split whose two sides have the smallest summed label variance
        wins.

        :return: [feature_id, split_value, left_rows, right_rows].
            feature_id/split_value are None and both row sets are empty
            when no sampled feature has positive variance.
        """
        feature_mean = data_set[:, :-1].mean(0)
        feature_var = data_set[:, :-1].var(0)
        sampled_feat_ids = random.sample(self.feature_id_list, self.feature_sampled_cnt)
        min_metric = np.inf
        fid = None
        split_val = None
        mask_left = []
        mask_right = []
        for feat_id in sampled_feat_ids:
            if feature_var[feat_id] == 0:
                continue  # a constant feature cannot separate the samples
            m_l = data_set[:, feat_id] <= feature_mean[feat_id]
            m_r = data_set[:, feat_id] > feature_mean[feat_id]
            # Sum of the label variances of the two candidate children.
            metric = data_set[:, -1][m_l].var() + data_set[:, -1][m_r].var()
            if min_metric > metric:
                min_metric = metric
                fid = feat_id
                split_val = feature_mean[feat_id]
                mask_left = m_l
                mask_right = m_r
        return [fid, split_val, data_set[mask_left, :], data_set[mask_right, :]]

    def predict(self, instance):
        """Predict the label for one instance (1-D array of features)."""
        return self.eval_instance(self.tree, instance)

    def eval_instance(self, cur_node, instance):
        """Walk the tree from cur_node down to a leaf for *instance*."""
        if "left_node" not in cur_node or "right_node" not in cur_node:
            return cur_node["predict"]
        if instance[cur_node['fid']] > cur_node["split_val"]:
            return self.eval_instance(cur_node["right_node"], instance)
        else:
            return self.eval_instance(cur_node["left_node"], instance)
def load_data(data_file):
    """Load a comma-separated numeric data file into an ndarray.

    Each line is one sample: feature values followed by the label.

    :param data_file: path to the CSV-like text file.
    :return: 2-D float ndarray, one row per sample.
    """
    # Use a context manager so the file handle is always closed
    # (the original leaked the handle returned by open()).
    with open(data_file) as f:
        data_set = [[float(v) for v in line.rstrip().split(',')]
                    for line in f]
    return np.array(data_set)
def test(data_file_path, model):
    """Evaluate *model* on every sample in data_file_path.

    Prints a 0.5-threshold accuracy and the RMSE over all samples.

    :param data_file_path: path to a CSV file readable by load_data.
    :param model: object with a predict(instance) method.
    """
    data_set = load_data(data_file_path)
    correct = 0.0
    incorrect = 0.0
    sq_sum_error = 0.0
    for data in data_set:
        pv = model.predict(data)
        # Count a prediction as correct when it is within 0.5 of the label.
        if abs(pv - data[-1]) < 0.5:
            correct += 1
        else:
            incorrect += 1
        sq_sum_error += np.power((pv - data[-1]), 2)
    # BUGFIX: divide by the number of samples, not len(data_set[0])
    # (the column count), when computing the RMSE.
    rmse = np.sqrt(sq_sum_error / len(data_set))
    accuracy = correct / (incorrect + correct)
    print("test accuracy is: %s " % (accuracy))
    print("test rmse is : %s" % (rmse))
def main(data_file_path, max_depth, min_leaf_sample_cnt, feat_sample_ratio, data_sample_ratio):
    """Train a RegressionTree on the file and evaluate it.

    NOTE(review): evaluation reuses the training file; a proper
    train/test split is still TODO (as the header notes admit).

    :param data_file_path: path to the CSV training data.
    :param max_depth: maximum tree depth.
    :param min_leaf_sample_cnt: minimum samples required to keep splitting.
    :param feat_sample_ratio: fraction of features sampled per split.
    :param data_sample_ratio: row-sampling ratio (currently unused by the tree).
    """
    data_set = load_data(data_file_path)
    model = RegressionTree(max_depth=max_depth,
                           min_leaf_sample_cnt=min_leaf_sample_cnt,
                           feat_sample_ratio=feat_sample_ratio,
                           data_sample_ratio=data_sample_ratio)
    model.fit(data_set)
    print("train done!")
    test(data_file_path, model)
    print("test done!")
# Entry point: train and evaluate a single regression tree on the Iris file.
if __name__ == "__main__":
    main('Iris_150_4.txt', 9, 1, 0.9, 0.9)
GBDT:
main:最大深度,叶子节点最小样本数,特征采样,数据采样,shrink,树个数。
load_data:加载训练数据
GBDT-fit:利用数据生成模型trees,每次对残差利用regressiontree建树,新预测结果+=shrink*残差模型预测结果。初始的预测结果是均值。每次建树之后求一次loss,loss用的square loss。
test:利用模型进行预测。
import os
import sys
import json
import copy
from regression_tree import *
class GBDT():
    """Gradient-boosted regression trees with squared loss.

    model[0] is a constant base prediction (the mean label); every
    later element is a RegressionTree fitted to the current residuals.
    The final score is base + shrink_ratio * sum(tree predictions).
    """

    def __init__(self,
                 max_depth=16,
                 min_leaf_sample_cnt=1,
                 feat_sample_ratio=0.8,
                 data_sample_ratio=0.8,
                 shrink_ratio=1,
                 tree_num=10,
                 ):
        self.max_depth = max_depth
        self.min_leaf_sample_cnt = min_leaf_sample_cnt
        self.feat_sample_ratio = feat_sample_ratio
        self.data_sample_ratio = data_sample_ratio
        self.shrink_ratio = shrink_ratio  # learning rate applied to each tree's output
        self.tree_num = tree_num          # number of boosting rounds
        self.model = []                   # [base_score, tree_1, tree_2, ...]
        # Echo the configuration so training runs are reproducible from logs.
        print(self.max_depth)
        print(self.min_leaf_sample_cnt)
        print(self.feat_sample_ratio)
        print(self.data_sample_ratio)
        print(self.shrink_ratio)
        print(self.tree_num)

    def fit(self, data_set):
        """Fit tree_num boosted trees on data_set (last column = label).

        The label column of *data_set* is overwritten with residuals
        while each tree is fitted, and restored before returning.

        :param data_set: 2-D ndarray, features plus label column.
        """
        data_cnt = len(data_set[:, 0])
        print("data set has %s instance" % (data_cnt))
        h_0 = data_set[:, -1].mean()  # constant base prediction
        y_pred_accumulate_buff = [h_0 for i in range(0, data_cnt)]
        # Fancy indexing already copies, so a plain .copy() suffices to
        # preserve the original labels for later restoration.
        y_raw_buff = data_set[:, -1].copy()
        self.model.append(h_0)
        for i in range(0, self.tree_num):
            loss = 0
            if i != 0:
                # Fold the previous tree's shrunken predictions into the
                # accumulated score and measure the squared loss.
                for j in range(0, data_cnt):
                    s = self.shrink_ratio * self.model[i].predict(data_set[j, :])
                    y_pred_accumulate_buff[j] += s
                    loss += ((y_raw_buff[j] - y_pred_accumulate_buff[j]) *
                             (y_raw_buff[j] - y_pred_accumulate_buff[j]))
            print("tid: %d loss: %s " % (i, loss / (2 * data_cnt)))
            # Replace the label column with the current residuals so the
            # next tree regresses on them.
            for j in range(0, data_cnt):
                data_set[j, -1] = y_raw_buff[j] - y_pred_accumulate_buff[j]
            residual_data_set = data_set
            new_tree = RegressionTree(self.max_depth,
                                      self.min_leaf_sample_cnt,
                                      self.feat_sample_ratio,
                                      self.data_sample_ratio)
            new_tree.fit(residual_data_set)
            self.model.append(new_tree)
        print("tree num %s" % (len(self.model)))
        # Restore the original labels in the caller's array.
        for j in range(0, data_cnt):
            data_set[j, -1] = y_raw_buff[j]

    def predict(self, instance):
        """Score one instance: base score plus shrunken tree outputs."""
        score = None
        for i, m in enumerate(self.model):
            if i == 0:
                score = m  # constant base prediction
            else:
                score += self.shrink_ratio * m.predict(instance)
        return score
def load_data(data_file):
    """Load a comma-separated numeric data file into an ndarray.

    Each line is one sample: feature values followed by the label.

    :param data_file: path to the CSV-like text file.
    :return: 2-D float ndarray, one row per sample.
    """
    # Use a context manager so the file handle is always closed
    # (the original leaked the handle returned by open()).
    with open(data_file) as f:
        data_set = [[float(v) for v in line.rstrip().split(',')]
                    for line in f]
    return np.array(data_set)
def test(data_file_path, model):
    """Evaluate *model* on every sample in data_file_path.

    Prints a 0.5-threshold accuracy and the RMSE over all samples.

    :param data_file_path: path to a CSV file readable by load_data.
    :param model: object with a predict(instance) method.
    """
    data_set = load_data(data_file_path)
    correct = 0.0
    incorrect = 0.0
    sq_sum_error = 0.0
    # BUGFIX: evaluate every sample -- the original sliced data_set[0:2],
    # an apparent debugging leftover -- and divide the RMSE by the sample
    # count rather than len(data_set[0]) (the column count).
    for data in data_set:
        pv = model.predict(data)
        if abs(pv - data[-1]) < 0.5:
            correct += 1
        else:
            incorrect += 1
        sq_sum_error += np.power((pv - data[-1]), 2)
    rmse = np.sqrt(sq_sum_error / len(data_set))
    accuracy = correct / (incorrect + correct)
    print("test accuracy is: %s " % (accuracy))
    print("test rmse is : %s" % (rmse))
def main(data_file_path,
         max_depth,
         min_leaf_sample_cnt,
         feat_sample_ratio,
         data_sample_ratio,
         shrink_ratio,
         tree_num):
    """Train a GBDT model on the file and evaluate it.

    NOTE(review): evaluation reuses the training file; no held-out
    split is performed.

    :param data_file_path: path to the CSV training data.
    :param max_depth: maximum depth of each tree.
    :param min_leaf_sample_cnt: minimum samples required to keep splitting.
    :param feat_sample_ratio: fraction of features sampled per split.
    :param data_sample_ratio: row-sampling ratio (currently unused by the tree).
    :param shrink_ratio: learning rate applied to each tree's output.
    :param tree_num: number of boosting rounds.
    """
    data_set = load_data(data_file_path)
    model = GBDT(max_depth=max_depth,
                 min_leaf_sample_cnt=min_leaf_sample_cnt,
                 feat_sample_ratio=feat_sample_ratio,
                 data_sample_ratio=data_sample_ratio,
                 shrink_ratio=shrink_ratio,
                 tree_num=tree_num
                 )
    model.fit(data_set)
    print("train done!")
    test(data_file_path, model)
    print("test done!")
# Entry point: train a 1000-round GBDT on the Iris data file.
if __name__ == "__main__":
    # main('Jain_373_2.txt', 5, 3, 0.6, 0.6, 0.25, 100)
    main('Iris_150_4.txt',
         max_depth=2,
         min_leaf_sample_cnt=2,
         feat_sample_ratio=0.8,
         data_sample_ratio=0.8,
         shrink_ratio=0.25,
         tree_num=1000)
GBRank
重新构造损失函数,强化约束条件。
根据pair之间的关系,利用regressiontree每次重训负样例。
import os
import sys
import copy
import json
import numpy as np
import random
from regression_tree import *
class GBRank():
    """Pairwise ranking via gradient-boosted regression trees (GBRank).

    Per query group, pairs whose current scores violate the label order
    (within a margin) become retargeted training instances; each round a
    RegressionTree is fitted to those instances, and scores are the
    running average of the shrunken per-tree outputs.
    """

    def __init__(self,
                 max_depth=5,
                 min_leaf_sample_cnt=1,
                 feat_sample_ratio=1.0,
                 data_sample_ratio=1.0,
                 learning_ratio=0.1,
                 tree_num=10,
                 margin=0.1,
                 ):
        self.max_depth = max_depth
        self.min_leaf_sample_cnt = min_leaf_sample_cnt
        self.feat_sample_ratio = feat_sample_ratio
        self.data_sample_ratio = data_sample_ratio
        self.learning_ratio = learning_ratio  # shrinkage applied to each tree's output
        self.tree_num = tree_num              # number of boosting rounds
        self.margin = margin                  # required score gap between ordered pairs
        self.model = []                       # [base_score, tree_1, tree_2, ...]
        self.data_cnt = 0                     # NOTE(review): set but never used
        # Echo the configuration so training runs are reproducible from logs.
        print(self.max_depth)
        print(self.min_leaf_sample_cnt)
        print(self.feat_sample_ratio)
        print(self.data_sample_ratio)
        print(self.learning_ratio)
        print(self.tree_num)
        print(self.margin)

    def get_negative_data_set(self, model_id, model, qu_set, h_accumulate):
        """Collect mis-ordered pairs for one query group.

        For model_id > 0, first folds *model*'s shrunken predictions
        into h_accumulate (in place, as a running average). Then for
        every pair (x, y) whose accumulated scores violate the label
        order within self.margin, emits two retargeted instances whose
        label column is set to the score each side should have reached.

        :param model_id: index of *model* in self.model (0 = base score).
        :param model: previous round's tree (or the base score at id 0).
        :param qu_set: 2-D ndarray of instances for one query.
        :param h_accumulate: per-instance accumulated scores (mutated).
        :return: list of retargeted instances as plain lists.
        """
        neg_data_set = []
        qu_cnt = len(qu_set[:, 0])
        if model_id != 0:
            for u in range(0, qu_cnt):
                g = self.learning_ratio * model.predict(qu_set[u, :])
                # Running average of the per-tree contributions.
                h_accumulate[u] = (model_id * h_accumulate[u] + g) / (model_id + 1)
        for x in range(0, qu_cnt):
            for y in range(x + 1, qu_cnt):
                if (h_accumulate[x] < h_accumulate[y] + self.margin) and (qu_set[x, -1] > qu_set[y, -1]):
                    # x should outrank y but does not (within margin):
                    # retarget both sides of the violated pair.
                    neg_instance_x = qu_set[x, :].copy()
                    neg_instance_x[-1] = (h_accumulate[y] + self.margin)
                    neg_instance_y = qu_set[y, :].copy()
                    neg_instance_y[-1] = (h_accumulate[x] - self.margin)
                    neg_data_set.append(neg_instance_x.tolist())
                    neg_data_set.append(neg_instance_y.tolist())
        return neg_data_set

    def fit(self, data_set):
        """Train on a dict mapping query id -> 2-D ndarray of instances."""
        h_accumulate = dict()
        h_0 = 0.0  # base score: everything starts at zero
        for q in data_set.keys():
            cur_qu_cnt = len(data_set[q][:, 0])
            if cur_qu_cnt == 0:
                continue  # skip empty query groups
            h_accumulate[q] = [0.0 for i in range(0, cur_qu_cnt)]
        self.model.append(h_0)
        for i in range(0, self.tree_num):
            print("tree id ", i)
            neg_data_set = []
            for q in data_set.keys():
                new_set = self.get_negative_data_set(i, self.model[i], data_set[q], h_accumulate[q])
                neg_data_set.extend(new_set)
            # BUGFIX: the original crashed with IndexError on
            # np.array([])[:, 0] when no mis-ordered pairs remained;
            # stop boosting early instead.
            if not neg_data_set:
                print("no negative pairs at tree %d, stopping early" % (i))
                break
            neg_data_set = np.array(neg_data_set)
            print(len(neg_data_set[:, 0]))
            new_tree = RegressionTree(self.max_depth,
                                      self.min_leaf_sample_cnt,
                                      self.feat_sample_ratio,
                                      self.data_sample_ratio)
            new_tree.fit(neg_data_set)
            self.model.append(new_tree)
        print("tree num %s" % (len(self.model)))

    def predict(self, instance):
        """Score one instance with the same running average used in training."""
        score = None
        for i, m in enumerate(self.model):
            if i == 0:
                score = m  # base score
            else:
                score = (i * score + self.learning_ratio * m.predict(instance)) / (i + 1)
        return score
def test(data_file_path, model):
    """Evaluate *model* on every instance in the grouped data file.

    NOTE(review): this function was copied from the regression-tree
    script, but this file's load_data returns a dict mapping query id
    to a 2-D ndarray -- the original iterated the dict's keys (strings)
    and called predict on them. Rewritten to iterate the per-query
    instance rows; the 0.5-threshold "accuracy" is kept as-is, though
    its meaning for ranking scores should be confirmed.

    :param data_file_path: path to a file readable by this load_data.
    :param model: object with a predict(instance) method.
    """
    data_set = load_data(data_file_path)
    correct = 0.0
    incorrect = 0.0
    sq_sum_error = 0.0
    total = 0
    for qu_set in data_set.values():
        for data in qu_set:
            pv = model.predict(data)
            total += 1
            if abs(pv - data[-1]) < 0.5:
                correct += 1
            else:
                incorrect += 1
            sq_sum_error += np.power((pv - data[-1]), 2)
    # BUGFIX: divide by the instance count, not the column count.
    rmse = np.sqrt(sq_sum_error / total)
    accuracy = correct / (incorrect + correct)
    print("test accuracy is: %s " % (accuracy))
    print("test rmse is : %s" % (rmse))
def load_data(data_file):
    """Load tab-separated ranking data grouped by query id.

    Skips the first (header) line. For each remaining row, the
    instance is columns 4..89, then column 91, then column 0 as the
    final value; column 3 is the query id used as the grouping key.
    NOTE(review): these column positions encode the producer's file
    layout -- confirm against the data source.

    :param data_file: path to the tab-separated text file.
    :return: dict mapping query id (str) -> 2-D float ndarray.
    """
    data_set = {}
    with open(data_file) as df:
        cnt = -1
        for line in df:
            cnt += 1
            if cnt == 0:
                continue  # skip the header line
            items = line.rstrip('\n').split('\t')
            instance = items[4:90]
            instance.append(items[91])
            instance.append(items[0])
            instance = [float(i) for i in instance]
            if items[3] in data_set:
                data_set[items[3]].append(instance)
            else:
                data_set[items[3]] = [instance]
    # Convert each query group to an ndarray only after the whole file
    # is read (appending to an ndarray would fail mid-loop).
    for k in data_set.keys():
        data_set[k] = np.array(data_set[k])
    print("query", len(data_set.keys()), "qu", cnt)
    return data_set
def main(data_file_path,
         max_depth,
         min_leaf_sample_cnt,
         feat_sample_ratio,
         data_sample_ratio,
         learning_ratio,
         tree_num,
         margin):
    """Train a GBRank model on a grouped ranking data file.

    :param data_file_path: path to the tab-separated ranking data.
    :param max_depth: maximum depth of each tree.
    :param min_leaf_sample_cnt: minimum samples required to keep splitting.
    :param feat_sample_ratio: fraction of features sampled per split.
    :param data_sample_ratio: row-sampling ratio (currently unused by the tree).
    :param learning_ratio: shrinkage applied to each tree's output.
    :param tree_num: number of boosting rounds.
    :param margin: required score gap between correctly ordered pairs.
    """
    data_set = load_data(data_file_path)
    model = GBRank(max_depth=max_depth,
                   min_leaf_sample_cnt=min_leaf_sample_cnt,
                   feat_sample_ratio=feat_sample_ratio,
                   data_sample_ratio=data_sample_ratio,
                   learning_ratio=learning_ratio,
                   tree_num=tree_num,
                   margin=margin,
                   )
    model.fit(data_set)
    print("train done!")
    # Evaluation is disabled in the original; see test() for caveats.
    # test(data_file_path, model)
    # print("test done!")
# Entry point: train a 300-round GBRank model on the sample ranking file.
if __name__ == "__main__":
    # main('Jain_373_2.txt', 5, 3, 0.6, 0.6, 0.25, 100)
    main('dx_sample_data_1w.txt',
         max_depth=10,
         min_leaf_sample_cnt=50,
         feat_sample_ratio=0.6,
         data_sample_ratio=0.6,
         learning_ratio=1.0,
         tree_num=300,
         margin=0.5)