88.6904095824422 | 4.29677132819266 | 2 | 0.0500 |
90.5048281577648 | 83.5507530641210 | 2 | 0.0500 |
49.8388955348408 | 36.9308179554190 | 2 | 0.0500 |
90.9700719119941 | 89.5719918011541 | 2 | 0.0500 |
57.8598446989567 | 27.4099065906944 | 2 | 0.0500 |
60.2589738655931 | 54.3673019266521 | 2 | 0.0500 |
86.1992146483479 | 59.5334513926939 | 2 | 0.0500 |
73.7482414756424 | 48.5612829189145 | 2 | 0.0500 |
%Author:Q
%Date:2013/8/27
%Describe: Weak learner used by AdaBoost. Searches every feature column for
%          the single threshold ("split point") with the smallest weighted
%          classification error and returns the predicted labels directly,
%          together with the description of the split.
%          Only the labels 1 and -1 are supported.
%Parameters:
%   Inputs (read from the enclosing function's arguments):
%       data     raw samples, one sample per row, one feature per column
%       labels   label of every sample (1 or -1), one entry per row of data
%       weights  weight of every sample, one entry per row of data
%
%   Outputs (variables assigned by this body):
%       r_labels      labels produced by the best split (row-by-1, values 1 / -1)
%       r_error       sum of the weights of the samples the best split gets
%                     wrong (older notes call this value r_weight)
%       r_point       the best split point (threshold)
%       r_pointdim    column of data in which the best split point lies
%       r_pointstate  label given to samples strictly below r_point;
%                     all other samples receive -r_pointstate
[row, column] = size(data);

% Seed the search with +inf so that the very first candidate is always
% accepted. Seeding with sum(weights) left min_error_dim at -1 whenever no
% candidate was strictly better (e.g. all-zero weights), and the label
% assignment below then failed on data(:,-1). For any weights whose sum is
% positive the selected split is the same as before.
min_error       = inf;  % smallest weighted error found so far
min_error_point = 0;    % threshold achieving it
min_error_state = -1;   % label assigned below the threshold
min_error_dim   = -1;   % column the threshold applies to

% Label tests do not depend on the candidate split, so evaluate them once.
is_pos = (labels == 1);
is_neg = (labels == -1);

for dim = 1:column
    feature = data(:, dim);
    sorted  = sort(feature, 1, 'ascend');

    % row samples give row + 1 candidate thresholds: one below the smallest
    % value, one above the largest, and the midpoints of sorted neighbours.
    for i = 1:(row + 1)
        if i == 1
            class_point = sorted(1) - 0.5;
        elseif i == row + 1
            class_point = sorted(row) + 0.5;
        else
            class_point = (sorted(i - 1) + sorted(i)) / 2;
        end

        below = feature < class_point;  % samples strictly below the threshold
        above = ~below;                 % samples at or above the threshold

        % Orientation -1: below -> -1, above -> 1.
        % Errors are positives below plus negatives above.
        sum_error = sum(weights(below & is_pos)) + sum(weights(above & is_neg));
        if sum_error < min_error
            min_error       = sum_error;
            min_error_point = class_point;
            min_error_state = -1;
            min_error_dim   = dim;
        end

        % Orientation 1: below -> 1, above -> -1.
        % Errors are negatives below plus positives above.
        sum_error = sum(weights(below & is_neg)) + sum(weights(above & is_pos));
        if sum_error < min_error
            min_error       = sum_error;
            min_error_point = class_point;
            min_error_state = 1;
            min_error_dim   = dim;
        end
    end
end

r_error      = min_error;
r_point      = min_error_point;
r_pointdim   = min_error_dim;
r_pointstate = min_error_state;

% Apply the winning split to produce the predicted labels.
label_ind1 = data(:, min_error_dim) < min_error_point;
label_ind2 = ~label_ind1;
r_labels   = zeros(row, 1);
if r_pointstate == -1
    r_labels(label_ind1) = -1;
    r_labels(label_ind2) = 1;
else
    r_labels(label_ind1) = 1;
    r_labels(label_ind2) = -1;
end
然后就是用这个函数去分类
88.6904 | 4.2968 | 2.0000 | 0.0500 |
57.8598 | 27.4099 | 2.0000 | 0.0500 |
49.8389 | 36.9308 | 2.0000 | 0.0500 |
18.4259 | 38.6681 | 1.0000 | 0.0500 |
73.7482 | 48.5613 | 2.0000 | 0.0500 |
60.2590 | 54.3673 | 2.0000 | 0.0500 |
86.1992 | 59.5335 | 2.0000 | 0.0500 |
21.9768 | 65.6613 | 1.0000 | 0.0500 |
52.9227 | 66.1339 | 1.0000 | 0.0500 |
12.5649 | 78.1641 | 1.0000 | 0.0500 |
43.7318 | 78.8254 | 1.0000 | 0.0500 |
46.9657 | 79.1344 | 1.0000 | 0.0500 |
19.7510 | 82.2232 | 1.0000 | 0.0500 |
66.1395 | 83.4353 | 1.0000 | 0.0500 |
90.5048 | 83.5508 | 2.0000 | 0.0500 |
61.2605 | 85.2921 | 1.0000 | 0.0500 |
30.3279 | 87.3756 | 1.0000 | 0.0500 |
90.9701 | 89.5720 | 2.0000 | 0.0500 |
64.5580 | 98.7675 | 1.0000 | 0.0500 |
77.5785 | 99.7898 | 1.0000 | 0.0500 |
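然后开始更新权值:这一轮有 3 个点被分错,每个点的权值是 0.05,所以错误权值和 x = 3 × 0.05 = 0.15,公式是 value = x/(1 - x) = 0.15/(1 - 0.15) = 0.1765。分类正确的点的权值乘以 value(0.05 × 0.1765 ≈ 0.0088),分类错误的点保持 0.05 不变,最后把所有权值归一化(使总和为 1),于是分对的点变成 0.0294,分错的点变成 0.1667。
这里理论上任何一个可以弱化权值的函数都是可以的,不一定只是这个函数。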
88.6904095824422 | 4.29677132819266 | 2 | 0.0294 | 2 |
90.5048281577648 | 83.5507530641210 | 2 | 0.1667 | 1 |
49.8388955348408 | 36.9308179554190 | 2 | 0.0294 | 2 |
52.9226950242885 | 66.1339306433348 | 1 | 0.0294 | 1 |
90.9700719119941 | 89.5719918011541 | 2 | 0.1667 | 1 |
57.8598446989567 | 27.4099065906944 | 2 | 0.0294 | 2 |
77.5785103656777 | 99.7898477049783 | 1 | 0.0294 | 1 |
66.1394630545105 | 83.4353381388180 | 1 | 0.0294 | 1 |
46.9657238940880 | 79.1343687458312 | 1 | 0.0294 | 1 |
21.9767619845275 | 65.6612708164405 | 1 | 0.0294 | 1 |
60.2589738655931 | 54.3673019266521 | 2 | 0.0294 | 2 |
18.4258996781770 | 38.6681329635985 | 1 | 0.1667 | 2 |
19.7509715511001 | 82.2232087530085 | 1 | 0.0294 | 1 |
86.1992146483479 | 59.5334513926939 | 2 | 0.0294 | 2 |
12.5649252286031 | 78.1641320694625 | 1 | 0.0294 | 1 |
64.5580255723501 | 98.7674556243708 | 1 | 0.0294 | 1 |
43.7318210302330 | 78.8253800442339 | 1 | 0.0294 | 1 |
61.2605067498194 | 85.2920886655010 | 1 | 0.0294 | 1 |
73.7482414756424 | 48.5612829189145 | 2 | 0.0294 | 2 |
30.3279447163244 | 87.3756072506857 | 1 | 0.0294 | 1 |
%Author:Q
%Date:2013/8/28
%Describe: Training routine of the AdaBoost implementation.
%          Every step is supplied as a function, so a step can be replaced
%          by writing a new function and passing it in, without touching
%          this code.
%Parameters:
%   Inputs (read from the enclosing function's arguments):
%       data            training samples, one sample per row
%       labels          label of every training sample
%       loopnum         number of boosting rounds
%       findBestPoint   function: finds the best split point for the
%                       current sample weights
%       calcTurnWeight  function: turns a round's error into that round's
%                       weight (the original notes give the default formula
%                       as log10((1 - x)/x))
%       getUpdateLabel  function: decides which sample weights need updating
%       updateWeights   function: updates the sample weights
%
%   Outputs (variables assigned by this body):
%       adaboost_model  struct holding the trained model
%           weights     1-by-loopnum, weight of each round's classifier
%           param       1-by-loopnum struct array, one entry per round:
%               error       summed weight of the misclassified samples
%               point       best split point
%               pointdim    column in which the split point lies
%               pointstate  orientation of the split
%           labels      row-by-loopnum, labels predicted in each round
%       re_labels       logical column, true where the combined prediction
%                       on the training data differs from labels
row = size(data, 1);

adaboost_model = struct('weights', zeros(1, loopnum), ...
                        'param',   [], ...
                        'labels',  []);

% Every sample starts with the same weight.
weights = ones(row, 1) / row;

tmpparam = struct('error',      [], ...
                  'point',      [], ...
                  'pointdim',   [], ...
                  'pointstate', []);

% Preallocate the per-round storage instead of growing it by concatenation
% inside the loop.
adaboost_model.param  = repmat(tmpparam, 1, loopnum);
adaboost_model.labels = zeros(row, loopnum);

for i = 1:loopnum
    % Best split under the current sample weights.
    [r_labels, r_error, r_point, r_pointdim, r_pointstate] = findBestPoint(data, labels, weights);

    adaboost_model.labels(:, i) = r_labels;
    adaboost_model.weights(i)   = calcTurnWeight(r_error);

    tmpparam.error      = r_error;
    tmpparam.point      = r_point;
    tmpparam.pointdim   = r_pointdim;
    tmpparam.pointstate = r_pointstate;
    adaboost_model.param(i) = tmpparam;

    % Re-weight the samples for the next round.
    tmplabel = getUpdateLabel(labels, r_labels);
    weights  = updateWeights(weights, tmplabel, r_error, @calcWeightFun, @simplifyWeights);
end

% Combine the per-round predictions and flag the training samples whose
% combined label differs from the given label.
finlabels = inteLabels(adaboost_model.labels, adaboost_model.weights, @negToLabels);
re_labels = finlabels ~= labels;
%Author:Q
%Date:2013/8/28
%Describe: Classifies data with a trained AdaBoost model.
%Parameters:
%   Inputs (read from the enclosing function's arguments):
%       data            samples to classify, one sample per row
%       adaboost_model  trained model; the fields used here are
%                       param(i).pointdim, param(i).point,
%                       param(i).pointstate and weights
%       num             number of weak classifiers of the model to apply
%
%   Outputs (variables assigned by this body):
%       re_labels       labels returned by inteLabels for the weighted
%                       combination of the weak classifiers' predictions
row = size(data, 1);

% One column of predictions per weak classifier.
stump_labels = zeros(row, num);

for i = 1:num
    % Use the i-th weak classifier. Reading param(1) on every iteration
    % applied only the first classifier num times.
    dim   = adaboost_model.param(i).pointdim;
    point = adaboost_model.param(i).point;
    state = adaboost_model.param(i).pointstate;

    ind1 = data(:, dim) < point;  % samples strictly below the split point
    ind2 = ~ind1;                 % samples at or above the split point

    tmp_labels = zeros(row, 1);
    if state == -1
        tmp_labels(ind1) = -1;
        tmp_labels(ind2) = 1;
    else
        tmp_labels(ind1) = 1;
        tmp_labels(ind2) = -1;
    end
    stump_labels(:, i) = tmp_labels;
end

% Pass only the weights of the classifiers that were evaluated, so the
% weight vector always matches the number of prediction columns.
re_labels = inteLabels(stump_labels, adaboost_model.weights(1:num), @negToLabels);
最后就是一个测试的Demo程序
%Author:Q
%Date:2013/8/29
%Describe: AdaBoost entry point. The work is split into two parts:
%          AdaBoost_Train, the trainer, and AdaBoost_Identify, the classifier.
%Parameters:
%   Inputs (read from the enclosing function's arguments):
%       data               training samples, one sample per row
%       labels             labels of the training samples
%       data_i             samples waiting to be classified
%       AdaBoost_Train     the AdaBoost training function
%       AdaBoost_Identify  the AdaBoost classification function
%       loopnum            number of training rounds; when it is omitted
%                          (five arguments given) the number of rows of
%                          data is used
%
%   Outputs (variables assigned by this body):
%       re_labels          labels assigned to the samples in data_i
if nargin == 5
    % loopnum was not supplied: default to one round per training sample.
    loopnum = size(data, 1);
end

% AdaBoost training. Only the model is needed here; the trainer's second
% output is not requested.
adaboost_model = AdaBoost_Train(data, labels, loopnum, @findBestPoint, @calcTurnWeight, @getUpdateLabel, @updateWeights);

% AdaBoost classification of the new samples.
re_labels = AdaBoost_Identify(data_i, adaboost_model, loopnum);
最后就是一个测试函数
%Date:2013/8/27
%Describe: Design and implementation of AdaBoost - demo / test script.
%          Trains on random 2-D points labelled by the line x = y, classifies
%          a second random set and displays the number of misclassified points.
close all
clear all
clc

%data = textread('Data.txt');   % alternative: read the samples from a file
datanum  = 300;   % number of training samples
loopnum  = 100;   % number of boosting rounds
idatanum = 50;    % number of samples to classify

% Random points in the square [0,100] x [0,100]; rand is already
% non-negative, so no abs() is needed.
data   = rand(datanum, 2) * 100;
data_i = rand(idatanum, 2) * 100;

% Ground truth: -1 when the first coordinate is larger than the second,
% 1 otherwise. One rule is shared by the training and the test data.
trueLabels = @(pts) 1 - 2 * (pts(:, 1) - pts(:, 2) > 0);

labels    = trueLabels(data);
re_labels = AdaBoost(data, labels, data_i, @AdaBoost_Train, @AdaBoost_Identify, loopnum);

% Compare the predictions with the ground truth of the test samples.
labels_i = trueLabels(data_i);
finlabel = re_labels ~= labels_i;
sum(finlabel)