(1, 2) X. Y. Liu, J. Wu and Z. H. Zhou, “Exploratory Undersampling for Class-Imbalance Learning,” in IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, April 2009.
Matlab: 代码
function ensemble = EasyEnsemble(trainset, traintarget, catidx, T, rounds)
% EasyEnsemble  Train an EasyEnsemble classifier on class-imbalanced data.
%
% Repeatedly pairs all positive examples with a random subset of the
% negative examples, trains one AdaBoost classifier per subset, and merges
% the weak learners of all of them into a single ensemble.
%
% Input:
%   trainset:    n-by-d training set
%   traintarget: n-by-1 training target (1 = positive, 0 = negative)
%   catidx:      indicates which attributes are discrete ones (Note: start from 1)
%   T:           number of negative-example subsets to sample
%   rounds:      number of iterations used to train each AdaBoost classifier
% Output:
%   ensemble: EasyEnsemble classifier, a structure variable with fields
%             'trees', 'alpha' and 'thresh'

posset = trainset(traintarget==1,:);   % positive examples
negset = trainset(traintarget==0,:);   % negative examples
poscount = size(posset,1);             % number of positive examples
% Count the rows that were actually selected as negatives, so the
% permutation below can never index past the end of negset even when
% traintarget contains labels other than 0 and 1.
negcount = size(negset,1);             % number of negative examples
% Each subset is as large as the positive class, but cannot hold more
% negatives than exist.
subsetsize = min(poscount, negcount);

% randperm samples without replacement (a permutation); use
% randi(n,1,k) instead if sampling with replacement is wanted.
negset = negset(randperm(negcount),:); % shuffle the negatives

ensemble = struct('trees',{},'alpha',{},'thresh',{});
for node = 1:T
    nset = negset(1:subsetsize,:);              % a random subset of negative examples
    curtrainset = [posset; nset];               % positives first, then the sampled negatives
    curtarget = zeros(size(curtrainset,1),1);   % labels for the current training set
    curtarget(1:poscount) = 1;                  % positives are 1, negatives stay 0
    ensemble(node) = AdaBoost(curtrainset,curtarget,catidx,rounds); % node classifier
    negset = negset(randperm(negcount),:);      % reshuffle for the next subset
end

% Combine all weak learners to form the final ensemble. The values are
% wrapped in cells so that struct() always builds a 1-by-1 structure, even
% if a field value happens to be a cell array.
ens = struct('trees', {vertcat(ensemble.trees)}, ...
             'alpha', {vertcat(ensemble.alpha)}, ...
             'thresh', {[]});
ens.thresh = sum(ens.alpha)/2;   % threshold is half of the summed learner weights
ensemble = ens;