分类代码
class AdaBoost:
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Stump outputs are ``+1`` / ``-1`` and are compared directly against the
    training labels, so labels are expected to use those two values.

    ``learning_rate`` is not a shrinkage factor in this implementation: it is
    the spacing between the candidate thresholds scanned along each feature.
    """

    # Direction tags stored in ``clf_sets``.  The spelling of the second tag is
    # kept unchanged so that consumers of ``clf_sets`` keep seeing the same
    # value; ``G`` treats every tag other than 'positive' as the inverted stump.
    _POSITIVE = 'positive'
    _NEGATIVE = 'negetive'

    # Keeps the weighted error strictly inside (0, 1) when computing alpha.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        """Store the hyper-parameters.

        Args:
            n_estimators: maximum number of boosting rounds (weak classifiers).
            learning_rate: step between candidate thresholds; must be > 0.

        Raises:
            ValueError: if ``learning_rate`` is not strictly positive.
        """
        if learning_rate <= 0:
            raise ValueError("learning_rate must be > 0 (it is the threshold step size)")
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Reset all training state for a data set of shape (M samples, N features)."""
        self.X = np.asarray(datasets)
        self.Y = np.asarray(labels)
        self.M, self.N = self.X.shape
        # Fitted weak classifiers as (feature index, threshold, direction).
        self.clf_sets = []
        # Every sample starts with the same weight.
        self.weights = [1.0 / self.M] * self.M
        # Coefficient (vote weight) of each weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump on one feature column.

        Candidate thresholds are ``min + k * learning_rate`` for k = 1, 2, ...
        up to the column maximum; candidates equal to a sample value are
        skipped.  For each candidate both orientations are evaluated and the
        one with the smaller weighted error is kept (ties go to the inverted
        orientation).

        Returns:
            ``(threshold, direction, weighted_error, predictions)``.  When no
            candidate threshold exists the result is ``(0.0, None, inf, None)``.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        lowest_error = float("inf")
        best_v = 0.0
        direct, compare_array = None, None

        features_min = features.min()
        features_max = features.max()
        n_step = (features_max - features_min + self.learning_rate) // self.learning_rate

        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            if np.any(features == v):
                continue

            predict_positive = np.where(features > v, 1, -1)
            predict_negative = -predict_positive
            error_positive = weights[predict_positive != labels].sum()
            error_negative = weights[predict_negative != labels].sum()

            if error_positive < error_negative:
                weight_error = error_positive
                candidate = predict_positive
                candidate_direct = self._POSITIVE
            else:
                weight_error = error_negative
                candidate = predict_negative
                candidate_direct = self._NEGATIVE

            # Strict '<' keeps the first threshold among equally good ones.
            if weight_error < lowest_error:
                lowest_error = weight_error
                compare_array = candidate
                best_v = v
                direct = candidate_direct

        return best_v, direct, lowest_error, compare_array

    def _alpha(self, error):
        """Return the classifier coefficient ``0.5 * ln((1 - error) / error)``.

        ``error`` is clipped to ``[_EPS, 1 - _EPS]`` so that a perfect (or
        perfectly wrong) weak classifier gives a large finite coefficient
        instead of a division by zero.
        """
        error = np.clip(error, self._EPS, 1.0 - self._EPS)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor: sum of ``w_i * exp(-a * y_i * G(x_i))``."""
        weights = np.asarray(weights, dtype=float)
        margins = np.asarray(self.Y) * np.asarray(clf)
        return np.sum(weights * np.exp(-1 * a * margins))

    def _w(self, a, clf, Z):
        """Update ``self.weights`` in place for the next boosting round."""
        margins = np.asarray(self.Y) * np.asarray(clf)
        updated = np.asarray(self.weights, dtype=float) * np.exp(-1 * a * margins) / Z
        # Slice assignment keeps the same list object that callers may hold.
        self.weights[:] = updated

    def G(self, x, v, direct):
        """Evaluate one stump on a scalar feature value ``x`` with threshold ``v``."""
        if direct == self._POSITIVE:
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Train up to ``clf_num`` stumps on ``X`` (2-D) and labels ``y``.

        Raises:
            ValueError: if no feature offers a usable split threshold.
        """
        self.init_args(X, y)

        for epoch in range(self.clf_num):
            best_clf_error, best_v, clf_result = float("inf"), None, None
            final_direct, axis = None, None

            # Pick the feature whose best stump has the smallest weighted error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j
                if best_clf_error == 0:
                    break

            if clf_result is None:
                raise ValueError(
                    "no candidate threshold found; every grid point coincides "
                    "with a sample value or the feature range is smaller than "
                    "learning_rate")

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            # A stump with zero error leaves the normalised weights unchanged,
            # so every later round would select the same stump again.
            if best_clf_error == 0:
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Classify one sample (a 1-D feature vector); returns ``1`` or ``-1``."""
        result = 0.0
        for coefficient, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += coefficient * self.G(feature[axis], clf_v, direct)
        # Sign of the weighted vote; a vote of exactly 0 maps to -1.
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted as ``y_test``.

        Raises:
            ZeroDivisionError: if ``X_test`` is empty.
        """
        total = len(X_test)
        right_count = sum(
            1 for i in range(total) if self.predict(X_test[i]) == y_test[i])
        return right_count / total
运行例子:
获取数据的方法
def create_data():
    """Return ``(features, labels)`` taken from the first 100 iris rows.

    ``features`` holds the first two feature columns; ``labels`` is the target
    column with every ``0`` replaced by ``-1``.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    # Keep columns 0 and 1 plus the label column of the first 100 rows.
    subset = np.array(frame.iloc[:100, [0, 1, -1]])
    # Relabel 0 as -1 with a boolean mask on the label column.
    zero_rows = subset[:, -1] == 0
    subset[zero_rows, -1] = -1
    return subset[:, :2], subset[:, -1]
运行
# Single run: hold out 33% of the samples, train 10 stumps with a threshold
# step of 0.2, then evaluate accuracy on the held-out part.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
clf = AdaBoost(n_estimators=10, learning_rate=0.2)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
当有100个分类器时
# Repeat the split/train/evaluate cycle and report the mean test accuracy as a
# percentage.
n_runs = 100
# create_data() has no random step in its body, so it is loaded once; only the
# train/test split changes between runs.
X, y = create_data()
result = []
for _ in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    clf = AdaBoost(n_estimators=100, learning_rate=0.2)
    clf.fit(X_train, y_train)
    r = clf.score(X_test, y_test)
    result.append(r)
# Each score is a fraction in [0, 1]; scale the mean explicitly rather than
# relying on the number of runs happening to be 100.
print('average score:{:.3f}%'.format(100.0 * sum(result) / len(result)))
运行例子