I implemented an AdaBoost that uses a binary tree as the base classifier. It only handles categorical features and only does classification; supporting numerical features or numerical prediction would just take small changes to this version (a hedged sketch of the needed splitter change appears inside the code, right after the tree trainer). I tested it on a dataset and compared it against sklearn's GBDT and XGBoost; the f1 scores are as follows:
GBDT: 0.933678670778
XGBoost: 0.933678670778
My AdaBoost implementation: 0.905342414024
The accuracy still falls a bit short, which makes sense since row sampling, column sampling, shrinkage, and regularization haven't been added yet (a rough sketch of the first three is appended after the code at the end of this post).
I'm sharing this for discussion; the test data is in the attachment, and feel free to leave a comment if you have questions.
Test data download link: http://archive.ics.uci.edu/ml/machine-learning-databases/car/
The code is as follows:
# coding=utf8
import os

import numpy as np

os.chdir(r'D:\Study\ML\MLAction')


def cal_info(y, weight):
    """
    calculate weighted entropy
    :param y: label, 1*m array
    :param weight: 1*m array
    :return: information value
    """
    if y is None or weight is None:
        return 0
    if y.shape != weight.shape:
        raise Exception("the length of weight is not equal to y")
    weight_sum = np.sum(weight)
    info = 0
    for c in np.unique(y):
        p = np.sum(weight[y == c]) * 1.0 / weight_sum
        info -= p * np.log(p)
    return info


def leaf_val(y, weight):
    """
    get leaf value: the class with the largest total weight
    :param y: 1*m array
    :param weight: 1*m array
    :return: leaf class
    """
    if y is None or weight is None:
        return None
    y_stat = {}
    for c in np.unique(y):
        y_stat[c] = np.sum(weight[y == c])
    y_stat = sorted(y_stat.items(), key=lambda x: x[1], reverse=True)
    return y_stat[0][0]


def chose_best_split(X, y, weight):
    """
    choose the best feature and value to split on (equality test on categorical values)
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array
    :return: split_feature, split_value
    """
    if X is None or y is None or weight is None:
        return None, None
    m, n = X.shape
    best_split_feat = None
    best_split_val = None
    best_info_gain = 0
    info_before = cal_info(y, weight)
    weight_sum = np.sum(weight)
    for split_feat in range(n):
        for split_val in np.unique(X[:, split_feat]):
            equal_bl = X[:, split_feat] == split_val
            left_y, right_y = y[equal_bl], y[~equal_bl]
            left_weight, right_weight = weight[equal_bl], weight[~equal_bl]
            info_left = cal_info(left_y, left_weight)
            info_right = cal_info(right_y, right_weight)
            info_gain = info_before \
                - np.sum(left_weight) * 1.0 / weight_sum * info_left \
                - np.sum(right_weight) * 1.0 / weight_sum * info_right
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_split_feat = split_feat
                best_split_val = split_val
    return best_split_feat, best_split_val


def classfy_tree_train(X, y, weight=None, min_split_gain=1e-6, min_leaf_data=3):
    """
    train a classification tree
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array
    :param min_split_gain: minimum information gain required to split
    :param min_leaf_data: minimum number of samples in a leaf
    :return: tree root node {splitfeat: 0, splitval: 0, left: node, right: node};
             in a leaf node, splitfeat is None and splitval holds the leaf value
    """
    # check input
    if X is None or y is None:
        raise Exception('X is None or y is None')
    m, n = X.shape
    if m != len(y):
        raise Exception('the length of X is not equal to y')
    if weight is None:
        weight = np.ones(m)
    node = {'splitfeat': None, 'splitval': None, 'left': None, 'right': None}
    if m < min_leaf_data:
        node['splitval'] = leaf_val(y, weight)
        return node
    # split, then check whether the split is acceptable
    split_feat, split_val = chose_best_split(X, y, weight)
    if split_feat is None or split_val is None:
        node['splitval'] = leaf_val(y, weight)
        return node
    equal_bl = X[:, split_feat] == split_val
    left_X, right_X = X[equal_bl], X[~equal_bl]
    left_y, right_y = y[equal_bl], y[~equal_bl]
    left_weight, right_weight = weight[equal_bl], weight[~equal_bl]
    if left_y.shape[0] < min_leaf_data or right_y.shape[0] < min_leaf_data:
        node['splitval'] = leaf_val(y, weight)
        return node
    info_before = cal_info(y, weight)
    info_left = cal_info(left_y, left_weight)
    info_right = cal_info(right_y, right_weight)
    weight_sum = np.sum(weight)
    info_gain = info_before \
        - np.sum(left_weight) * 1.0 / weight_sum * info_left \
        - np.sum(right_weight) * 1.0 / weight_sum * info_right
    if info_gain < min_split_gain:
        node['splitval'] = leaf_val(y, weight)
        return node
    # create an internal node and recurse
    node['splitfeat'] = split_feat
    node['splitval'] = split_val
    node['left'] = classfy_tree_train(left_X, left_y, left_weight, min_split_gain, min_leaf_data)
    node['right'] = classfy_tree_train(right_X, right_y, right_weight, min_split_gain, min_leaf_data)
    return node
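# ---------------------------------------------------------------------------
# A hedged sketch, not part of the tested code above: the splitter only tests
# categorical values for equality. As noted at the top, supporting numerical
# features means switching to a threshold test; one possible version follows.
# The name chose_best_numeric_split and the midpoint candidate thresholds are
# my own assumptions. Prediction would change the same way: x[splitfeat] <=
# splitval goes left instead of the equality test. For numerical prediction
# (regression), the same skeleton works with weighted variance in place of
# entropy and a weighted mean as the leaf value.
def chose_best_numeric_split(X, y, weight):
    """sketch: best threshold split (x <= t goes left) for numerical features"""
    m, n = X.shape
    best_feat, best_thresh, best_gain = None, None, 0
    info_before = cal_info(y, weight)
    weight_sum = np.sum(weight)
    for feat in range(n):
        values = np.unique(X[:, feat])
        # candidate thresholds: midpoints between consecutive sorted values
        for thresh in (values[:-1] + values[1:]) / 2.0:
            left_bl = X[:, feat] <= thresh
            left_w, right_w = weight[left_bl], weight[~left_bl]
            gain = info_before \
                - np.sum(left_w) * 1.0 / weight_sum * cal_info(y[left_bl], left_w) \
                - np.sum(right_w) * 1.0 / weight_sum * cal_info(y[~left_bl], right_w)
            if gain > best_gain:
                best_feat, best_thresh, best_gain = feat, thresh, gain
    return best_feat, best_thresh
# ---------------------------------------------------------------------------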
def classfy_tree_onepredict(x, tree_node):
    """
    predict the class of a single sample x
    :param x: 1*n array
    :param tree_node: a trained tree node
    :return: class
    """
    if x is None or tree_node is None:
        return None
    if tree_node['splitfeat'] is None:
        return tree_node['splitval']
    split_feat = tree_node['splitfeat']
    if x[split_feat] == tree_node['splitval']:
        return classfy_tree_onepredict(x, tree_node['left'])
    else:
        return classfy_tree_onepredict(x, tree_node['right'])


def classfy_tree_predict(X, tree_node):
    """
    predict the classes of X
    :param X: m*n array
    :param tree_node: a trained tree node
    :return: classes, 1*m array
    """
    m = X.shape[0]
    predict_classes = []
    for i in range(m):
        predict_classes.append(classfy_tree_onepredict(X[i], tree_node))
    return np.array(predict_classes)


def boost_tree_train(X, y, weight=None, min_split_gain=1e-6, min_leaf_data=3, num_iter=10):
    """
    train a sequence of boosted trees
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array of initial sample weights
    :param min_split_gain: minimum information gain required to split
    :param min_leaf_data: minimum number of samples in a leaf
    :param num_iter: the number of trees
    :return: list of trees [[tree1, weight1], [tree2, weight2], ...]
    """
    # check input
    if X is None or y is None:
        raise Exception('X is None or y is None')
    m, n = X.shape
    if m != len(y):
        raise Exception('the length of X is not equal to y')
    tree_list = []
    if weight is None:
        weight = np.ones(m)
    weight = weight / np.sum(weight)
    for i in range(num_iter):
        # train a base tree on the current weights
        tree = classfy_tree_train(X, y, weight, min_split_gain, min_leaf_data)
        predict_classes = classfy_tree_predict(X, tree)
        # weighted training error of this tree
        wrong = predict_classes != y
        error = np.sum(weight[wrong])
        if error > 0:
            tree_weight = 0.5 * (np.log(1 - error) - np.log(error))
        else:
            # perfect tree: give it a very large vote
            tree_weight = 1e10
        # multi-class update: raise the weight of misclassified samples
        # (the exp(-alpha*y*h(x)) product form only works for labels in {-1, +1})
        weight = weight * np.exp(tree_weight * wrong)
        weight = weight / np.sum(weight)
        tree_list.append([tree, tree_weight])
    return tree_list


def boost_tree_onepredict(x, tree_list):
    """
    predict the class of a single sample x by weighted vote
    :param x: 1*n array
    :param tree_list: boosted tree list
    :return: predicted class
    """
    votes = {}
    for tree, weight in tree_list:
        pred = classfy_tree_onepredict(x, tree)
        votes[pred] = votes.get(pred, 0) + weight
    votes = sorted(votes.items(), key=lambda v: v[1], reverse=True)
    return votes[0][0]


def boost_tree_predict(X, tree_list):
    """
    predict the classes of X
    :param X: m*n array
    :param tree_list: boosted tree list
    :return: classes, 1*m array
    """
    m = X.shape[0]
    predict_classes = []
    for i in range(m):
        predict_classes.append(boost_tree_onepredict(X[i], tree_list))
    return np.array(predict_classes)


# read data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

car = pd.read_csv('Ch07/car.data', header=None)
for i in range(car.shape[1]):
    car.iloc[:, i] = LabelEncoder().fit_transform(car.iloc[:, i])

# shuffle the rows, then cut into train and test data
train_size = int(car.shape[0] * 0.6)
select_index = np.random.permutation(car.shape[0])
car = car.iloc[select_index, :]
car_train = car.iloc[:train_size, :]
X_train = car_train.iloc[:, :-1]
y_train = car_train.iloc[:, -1]
# unbalanced data, so weight each sample by its class's inverse frequency
y_ct = y_train.value_counts()
y_weight = np.sum(y_ct) * 1.0 / y_ct
weight = y_weight[y_train]
car_test = car.iloc[train_size:, :]
X_test = car_test.iloc[:, :-1]
y_test = car_test.iloc[:, -1]

# GBDT
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

gbdt = GradientBoostingClassifier().fit(X_train, y_train, weight.values)
gbdt_pred = gbdt.predict(X_test)
print(f1_score(y_test, gbdt_pred, average='macro'))

# XGBoost
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier(objective='multi:softmax', n_estimators=30).fit(X_train, y_train, weight.values)
xgb_pred = xgb.predict(X_test)
print(f1_score(y_test, xgb_pred, average='macro'))
# self-defined adaboost
best_classifys = boost_tree_train(X_train.values, y_train.values, weight.values, num_iter=25)
b_pred = boost_tree_predict(X_test.values, best_classifys)
print(f1_score(y_test, b_pred, average='macro'))
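As a rough illustration of the missing pieces mentioned above, here is one way row sampling, column sampling, and shrinkage could be bolted onto the trainer. This is only a sketch under my own assumptions: the helpers remap_feats and boost_tree_train_sampled, the row_frac/col_frac/shrinkage parameters, and the uniform subsampling are hypothetical choices, not part of the tested code; shrinkage here simply damps each tree's vote weight.

def remap_feats(node, cols):
    """sketch: map split feature indices from the column subsample back to X"""
    if node is None or node['splitfeat'] is None:
        return
    node['splitfeat'] = cols[node['splitfeat']]
    remap_feats(node['left'], cols)
    remap_feats(node['right'], cols)


def boost_tree_train_sampled(X, y, weight=None, num_iter=10,
                             row_frac=0.8, col_frac=0.8, shrinkage=0.5):
    """sketch: boost_tree_train plus row/column subsampling and shrinkage"""
    m, n = X.shape
    if weight is None:
        weight = np.ones(m)
    weight = weight / np.sum(weight)
    tree_list = []
    for i in range(num_iter):
        # each round, fit the base tree on a random subset of rows and columns
        rows = np.random.choice(m, max(1, int(m * row_frac)), replace=False)
        cols = np.random.choice(n, max(1, int(n * col_frac)), replace=False)
        tree = classfy_tree_train(X[np.ix_(rows, cols)], y[rows], weight[rows])
        remap_feats(tree, cols)  # so splitfeat indexes into the full X again
        # the weight update itself runs on the full training set, as before
        wrong = classfy_tree_predict(X, tree) != y
        error = np.sum(weight[wrong])
        # stop once the weighted error reaches 0.5, where the vote weight
        # would turn non-positive, or the tree is already perfect
        if error <= 0 or error >= 0.5:
            break
        tree_weight = shrinkage * 0.5 * (np.log(1 - error) - np.log(error))
        weight = weight * np.exp(tree_weight * wrong)
        weight = weight / np.sum(weight)
        tree_list.append([tree, tree_weight])
    return tree_list

It plugs into the same predictors as before, e.g. boost_tree_predict(X_test.values, boost_tree_train_sampled(X_train.values, y_train.values, weight.values, num_iter=25)).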