<1> 生成training data 和 testing data : GenerateTrainDataSet

% ========================================================================
% 功能：生成具有正弦分界线的training data set
%
% ========================================================================
% Purpose: generate training and testing sets whose two classes are
% separated by the sinusoid x2 = sin(x1) + 2, with x1 in [0, 2*pi] and
% x2 in [0, 4].
%
% Inputs:
%   nTrainSize - number of training samples
%   nTestSize  - number of testing samples
% Outputs:
%   TrainData, TestData - n-by-3 matrices, rows are [x1, x2, y], y in {0,1}
% Side effects: opens one figure per set and scatter-plots the points
% (blue '*' for class 1, red 'o' for class 0).
%
function [TrainData, TestData] = GenerateTrainDataSet(nTrainSize, nTestSize)
% The two sets are drawn from the same distribution; generate each with the
% shared helper instead of duplicating the sampling/plotting loop.
TrainData = local_make_set(nTrainSize, 'training data');
TestData  = local_make_set(nTestSize,  'testing data');

% Helper: draw n uniform points and label them against the sine boundary.
function Data = local_make_set(n, figTitle)
Data = zeros(n, 3);
figure;
hold on;                                             % one hold per figure is enough
title(figTitle);
for m = 1 : n
    Data(m,1) = 2 * pi * rand();                     % column 1: x1 in [0, 2*pi]
    Data(m,2) = 4 * rand();                          % column 2: x2 in [0, 4]
    if (Data(m,2) >= sin(Data(m,1)) + 2)             % column 3: class label y
        Data(m,3) = 1;
        plot(Data(m,1), Data(m,2), '*b', 'MarkerSize', 10);
    else
        Data(m,3) = 0;
        plot(Data(m,1), Data(m,2), 'or', 'MarkerSize', 10);
    end
end

<2> 设计演算法对training data进行学习

%=========================================================================%
% decision stump 模型
% 输入：（X，y, u) (re-weighted error : u)，(X,y) data set (supervised)
% u(i)标记X(i)的权重，或者是bootstrap中样本点采样次数的归一化结果
% 输出(feature i, threshold theta, direction s, error, label[n]):
% label[n]标记每个数据点分类是否犯错，犯错了标记为1，正确标记为0
% 构成Hypothesis h = s * sign(X(i) - theta)
% s为direction，s = +1表示xi>theta为输出标记为+1，s = -1 表示xi<theta输出+1
% i表示某个维度，decision stump通常只选择某一维度进行分割，类似水平线和竖直线
% theta代表分割阈值
% 下面秉持一种信念尝试一下：只要找到比随机猜要好的h就可以了
% 算法思路：随机选取 feature_i，direction s和threshold theta，计算加权误差
% 反复迭代直到error < 0.5 (比随机乱猜好一点，weak classifier)即可
%=========================================================================%
%=========================================================================%
% decision_stump: find a weak classifier h(x) = s * sign(x_i - theta).
% NOTE: renamed from "decison_stump" — the training loop calls
% decision_stump, so the original typo made the script unrunnable.
%
% Inputs:
%   X - N-by-2 data matrix
%   y - N-by-1 labels in {0,1}
%   u - 1-by-N sample weights (re-weighted error weights; after boosting
%       rounds these do NOT sum to 1, hence the normalization below)
% Outputs:
%   feature_i - dimension used for the split (1 or 2)
%   theta     - split threshold
%   s         - direction: s = +1 predicts 1 when x_i >= theta,
%               s = -1 predicts 1 when x_i <= theta
%   error     - weighted misclassification rate normalized by sum(u);
%               guaranteed < 0.5 on return (better than random guessing)
%   label     - 1-by-N mask, 1 where the stump misclassifies, 0 where correct
%
% Strategy: randomly sample (feature_i, s, theta) and keep the first
% hypothesis whose normalized weighted error beats random guessing — a
% weak classifier is all AdaBoost needs.
%=========================================================================%
function [feature_i, theta, s, error, label] = decision_stump(X, y, u)
N = size(X, 1);
label = zeros(1, N);                     % preallocate the mistake mask
u_total = sum(u);                        % boosted weights are unnormalized; fix the scale once
while(1)
    if (rand() > 0.5), feature_i = 1; else feature_i = 2; end   % random feature
    if (rand() > 0.5), s = 1; else s = -1; end                  % random direction
    % random threshold inside the observed range of the chosen feature
    theta = (max(X(:,feature_i)) - min(X(:,feature_i))) * rand() + min(X(:,feature_i));

    error = 0;
    for n = 1 : N
        if (s == 1)                      % s = +1: predict 1 when x_i >= theta
            label(n) = 1 - ((X(n,feature_i) >= theta) == y(n));  % 1 = mistake, 0 = correct
        else                             % s = -1: predict 1 when x_i <= theta
            label(n) = 1 - ((X(n,feature_i) <= theta) == y(n));
        end
        error = error + u(n) * label(n);
    end
    error = error / u_total;             % normalized: the 0.5 test is meaningless otherwise

    if (error < 0.5)
        break;
    end
end

%=========================================================================%
% 实际上是完成了bootstrap，对数据进行re-sample，得到放大错误分类数据的u
% 输入：u_0(u(k)), error(gk对应的分类误差), label(data是否分类错误的标签)
% 输出：u_1(u(k+1)), alpha(gk对应的融合权重)
% error = 0.5, 则delta = 1,alpha() = 0,随机猜测的g的权重为0
% error = 0，则delta = ∞,alpha() = ∞,有理由让完全分类对的g的权重为∞
%=========================================================================%
%=========================================================================%
% my_adaboost: re-weight the samples after one boosting round (a
% bootstrap-style emphasis on the misclassified points) and compute the
% aggregation weight of the current hypothesis.
%
% Inputs:
%   u_0   - 1-by-N current sample weights u(k)
%   error - weighted classification error of the current stump g_k
%   label - 1-by-N mistake mask (1 = misclassified, 0 = correct)
% Outputs:
%   u_1   - 1-by-N updated weights u(k+1): mistakes scaled up by delta,
%           correct points scaled down by delta
%   alpha - vote weight of g_k, log(delta): error = 0.5 gives delta = 1
%           and alpha = 0 (random guess gets no vote); error = 0 gives
%           delta = Inf and alpha = Inf (a perfect g deserves full weight)
%=========================================================================%
function [u_1, alpha] = my_adaboost(u_0, error, label)
delta = sqrt((1 - error) / error);       % scaling factor
wrong = (label == 1);                    % points the stump got wrong
u_1 = zeros(size(u_0));
u_1(wrong)  = u_0(wrong)  * delta;       % amplify the mistakes
u_1(~wrong) = u_0(~wrong) / delta;       % shrink the correctly classified
alpha = log(delta);

%=========================================================================%
% 利用生成的training data和testing data对AdaBoosting decision stump进行学习
% 设置在同一数据集下的演示测试为tTimes: 训练的迭代次数自增100，初始为T = 100
% 流程：生成数据集，开启循环训练-测试，并输出分类效果

%%
% Generate the data set and initialize parameters.
nTrainSize = 1000;
nTestSize = 100;
[TrainData, TestData] = GenerateTrainDataSet(nTrainSize, nTestSize);
tTimes = 10;                                       % number of train/test rounds
T = 0;                                             % ensemble size: grows by 100 each round
f = fopen('Testing Results.txt','w');              % results log (see fprintf(f, ...) below)
%%
% Run tTimes rounds of training and testing.
for times = 1 : tTimes
    T = T + 100;
    u0 = 1/nTrainSize * ones(1, nTrainSize);       % uniform initial bootstrap weights
    X = TrainData(:,1:2);                          % inputs X
    y = TrainData(:,3);                            % labels y
    g_set = zeros(4, T);                           % per-stump (feature_i, theta, s) plus its error
    label = zeros(1, nTrainSize);                  % misclassification mask
    alpha = zeros(1, T);                           % aggregation weights of g_set
    %%
    % Main loop: train T stumps, re-weighting the samples after each.
    for n = 1 : T
        [g_set(1,n), g_set(2,n), g_set(3,n), g_set(4,n), label] = ...
            decision_stump(X, y, u0);
        [u_1, alpha(n)] = my_adaboost(u0, g_set(4,n), label);
        u0 = u_1;
    end
    %%
    % Evaluate the aggregated classifier on the testing data: each stump
    % votes with its normalized alpha; predict 1 when the vote exceeds 0.5.
    X = TestData(:,1:2);
    y = TestData(:,3);
    vote = zeros(1, nTestSize);

    success_rate = 0;
    sum_alpha = sum(alpha);                        % hoisted: invariant over both loops below
    %%
    for m = 1 : T
        feature_i = g_set(1,m); theta = g_set(2,m); s = g_set(3,m);
        for n = 1 : length(X(:,feature_i))
            if (s == 1)                            % s = +1: predict 1 when x_i >= theta
                label(n) = (X(n,feature_i) >= theta);
            else
                label(n) = (X(n,feature_i) <= theta);
            end
            vote(n) = vote(n) + alpha(m) * label(n) / sum_alpha;   % alpha-weighted fusion
            if (m == T)                            % after the last stump, score the vote
                success_rate = success_rate + ((vote(n) > 0.5) == y(n));
            end
        end
    end
    % Log to the results file (this is why f was opened — the original
    % fprintf omitted the file id, leaving the file empty) and echo to console.
    fprintf(f, 'round %d: ensemble size %d, test accuracy %f\n', times, T, success_rate/nTestSize);
    fprintf('第%d次训练的g_set集大小为%d，测试数据分类成功率为%f\n', times, T, success_rate/nTestSize);
end
fclose(f);


*************************************************随时记录，随时分享****************************************************