Matlab实现决策树C4.5算法并执行

最新推荐文章于 2023-06-13 09:37:12 发布

William_Dong

最新推荐文章于 2023-06-13 09:37:12 发布

阅读量2w

点赞数 13

分类专栏： Matlab

本文链接：https://blog.csdn.net/dongweionly/article/details/25351979

版权

Matlab 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

注：C4.5实现部分非原创，乃参考网上程序

第一部分：程序执行例子

traindata=[5	3	1.6	0.2	1
5	3.4	1.6	0.4	1
5.2	3.5	1.5	0.2	1
5.2	3.4	1.4	0.2	1
4.7	3.2	1.6	0.2	1
4.8	3.1	1.6	0.2	1
5.4	3.4	1.5	0.4	1
5.2	4.1	1.5	0.1	1
5.5	4.2	1.4	0.2	1
4.9	3.1	1.5	0.2	1
5	3.2	1.2	0.2	1
5.5	3.5	1.3	0.2	1
4.9	3.6	1.4	0.1	1
4.4	3	1.3	0.2	1
5.1	3.4	1.5	0.2	1
5	3.5	1.3	0.3	1
4.5	2.3	1.3	0.3	1
4.4	3.2	1.3	0.2	1
5	3.5	1.6	0.6	1
5.1	3.8	1.9	0.4	1
4.8	3	1.4	0.3	1
5.1	3.8	1.6	0.2	1
4.6	3.2	1.4	0.2	1
5.3	3.7	1.5	0.2	1
5	3.3	1.4	0.2	1
6.6	3	4.4	1.4	2
6.8	2.8	4.8	1.4	2
6.7	3	5	1.7	2
6	2.9	4.5	1.5	2
5.7	2.6	3.5	1	2
5.5	2.4	3.8	1.1	2
5.5	2.4	3.7	1	2
5.8	2.7	3.9	1.2	2
6	2.7	5.1	1.6	2
5.4	3	4.5	1.5	2
6	3.4	4.5	1.6	2
6.7	3.1	4.7	1.5	2
6.3	2.3	4.4	1.3	2
5.6	3	4.1	1.3	2
5.5	2.5	4	1.3	2
5.5	2.6	4.4	1.2	2
6.1	3	4.6	1.4	2
5.8	2.6	4	1.2	2
5	2.3	3.3	1	2
5.6	2.7	4.2	1.3	2
5.7	3	4.2	1.2	2
5.7	2.9	4.2	1.3	2
6.2	2.9	4.3	1.3	2
5.1	2.5	3	1.1	2
5.7	2.8	4.1	1.3	2
7.2	3.2	6	1.8	3
6.2	2.8	4.8	1.8	3
6.1	3	4.9	1.8	3
6.4	2.8	5.6	2.1	3
7.2	3	5.8	1.6	3
7.4	2.8	6.1	1.9	3
7.9	3.8	6.4	2	3
6.4	2.8	5.6	2.2	3
6.3	2.8	5.1	1.5	3
6.1	2.6	5.6	1.4	3
7.7	3	6.1	2.3	3
6.3	3.4	5.6	2.4	3
6.4	3.1	5.5	1.8	3
6	3	4.8	1.8	3
6.9	3.1	5.4	2.1	3
6.7	3.1	5.6	2.4	3
6.9	3.1	5.1	2.3	3
5.8	2.7	5.1	1.9	3
6.8	3.2	5.9	2.3	3
6.7	3.3	5.7	2.5	3
6.7	3	5.2	2.3	3
6.3	2.5	5	1.9	3
6.5	3	5.2	2	3
6.2	3.4	5.4	2.3	3
5.9	3	5.1	1.8	3
];
testdata=[5.1	3.5	1.4	0.2	1
4.9	3	1.4	0.2	1
4.7	3.2	1.3	0.2	1
4.6	3.1	1.5	0.2	1
5	3.6	1.4	0.2	1
5.4	3.9	1.7	0.4	1
4.6	3.4	1.4	0.3	1
5	3.4	1.5	0.2	1
4.4	2.9	1.4	0.2	1
4.9	3.1	1.5	0.1	1
5.4	3.7	1.5	0.2	1
4.8	3.4	1.6	0.2	1
4.8	3	1.4	0.1	1
4.3	3	1.1	0.1	1
5.8	4	1.2	0.2	1
5.7	4.4	1.5	0.4	1
5.4	3.9	1.3	0.4	1
5.1	3.5	1.4	0.3	1
5.7	3.8	1.7	0.3	1
5.1	3.8	1.5	0.3	1
5.4	3.4	1.7	0.2	1
5.1	3.7	1.5	0.4	1
4.6	3.6	1	0.2	1
5.1	3.3	1.7	0.5	1
4.8	3.4	1.9	0.2	1
7	3.2	4.7	1.4	2
6.4	3.2	4.5	1.5	2
6.9	3.1	4.9	1.5	2
5.5	2.3	4	1.3	2
6.5	2.8	4.6	1.5	2
5.7	2.8	4.5	1.3	2
6.3	3.3	4.7	1.6	2
4.9	2.4	3.3	1	2
6.6	2.9	4.6	1.3	2
5.2	2.7	3.9	1.4	2
5	2	3.5	1	2
5.9	3	4.2	1.5	2
6	2.2	4	1	2
6.1	2.9	4.7	1.4	2
5.6	2.9	3.6	1.3	2
6.7	3.1	4.4	1.4	2
5.6	3	4.5	1.5	2
5.8	2.7	4.1	1	2
6.2	2.2	4.5	1.5	2
5.6	2.5	3.9	1.1	2
5.9	3.2	4.8	1.8	2
6.1	2.8	4	1.3	2
6.3	2.5	4.9	1.5	2
6.1	2.8	4.7	1.2	2
6.4	2.9	4.3	1.3	2
6.3	3.3	6	2.5	3
5.8	2.7	5.1	1.9	3
7.1	3	5.9	2.1	3
6.3	2.9	5.6	1.8	3
6.5	3	5.8	2.2	3
7.6	3	6.6	2.1	3
4.9	2.5	4.5	1.7	3
7.3	2.9	6.3	1.8	3
6.7	2.5	5.8	1.8	3
7.2	3.6	6.1	2.5	3
6.5	3.2	5.1	2	3
6.4	2.7	5.3	1.9	3
6.8	3	5.5	2.1	3
5.7	2.5	5	2	3
5.8	2.8	5.1	2.4	3
6.4	3.2	5.3	2.3	3
6.5	3	5.5	1.8	3
7.7	3.8	6.7	2.2	3
7.7	2.6	6.9	2.3	3
6	2.2	5	1.5	3
6.9	3.2	5.7	2.3	3
5.6	2.8	4.9	2	3
7.7	2.8	6.7	2	3
6.3	2.7	4.9	1.8	3
6.7	3.3	5.7	2.1	3
];
train_patterns=traindata(:,1:(size(traindata,2)-1));
train_targets=traindata(:,size(traindata,2))';
test_patterns=testdata(:,1:(size(testdata,2)-1));
test_targets=testdata(:,size(testdata,2))';
test_targets_predict = C4_5(train_patterns', train_targets, test_patterns', 5, 10);
%train_patterns'行是feature，列是样本
%train_targets 是1行多列，列是训练样本个数
% test_patterns'行是feature，列是样本
%最后两个参数取值，见C4_5函数
temp_count=0;
for i=1:size(test_targets_predict,2)
    if test_targets(:,i)==test_targets_predict(:,i)
        temp_count=temp_count+1;
    end
end
accuracy=temp_count/size(test_targets,2);
disp(accuracy);

第二部分： C4.5函数，这部分是参照网上资料

function test_targets = C4_5(train_patterns, train_targets, test_patterns, inc_node, Nu)

% Classify using Quinlan's C4.5 algorithm
% Inputs:
% 	training_patterns   - Train patterns 行是特征，列是样本
%	training_targets	- Train targets  1行多列，列是训练样本个数
%   test_patterns       - Test  patterns 行是特征，列是样本
%	inc_node            - Percentage of incorrectly assigned samples at a node
%    inc_node为防止过拟合参数，表示样本数小于一定阈值结束递归，可设置为5-10
%  Nu is to determine whether the variable is discrete or continuous (the value is always set to 10)
%
% Outputs
%	test_targets        - Predicted targets 1行m列（列的长度是测试样本的个数）

%NOTE: In this implementation it is assumed that a pattern vector with fewer than 10 unique values (the parameter Nu)
%is discrete, and will be treated as such. Other vectors will be treated as continuous

[Ni, M]		= size(train_patterns);%输入向量为NI*M的矩阵，其中M表示训练样本个数，Ni为特征维数维数
inc_node    = inc_node*M/100;

%Find which of the input patterns are discrete, and discretisize the corresponding
%dimension on the test patterns
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
    Ub = unique(train_patterns(i,:));
    Nb = length(Ub);
    if (Nb <= Nu),
        %This is a discrete pattern
        discrete_dim(i)	= Nb;
        dist            = abs(ones(Nb ,1)*test_patterns(i,:) - Ub'*ones(1, size(test_patterns,2)));
        [m, in]         = min(dist);
        test_patterns(i,:)  = Ub(in);
    end
end

%Build the tree recursively
%disp('Building tree')
tree            = make_tree(train_patterns, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

%Classify test samples
%disp('Classify test samples using the tree')
test_targets    = use_tree(test_patterns, 1:size(test_patterns,2), tree, discrete_dim, unique(train_targets));

%END

function targets = use_tree(patterns, indices, tree, discrete_dim, Uc)
%Classify recursively using a tree

targets = zeros(1, size(patterns,2));

if (tree.dim == 0)
    %Reached the end of the tree
    targets(indices) = tree.child;
    return
end

%This is not the last level of the tree, so:
%First, find the dimension we are to work on
dim = tree.dim;
dims= 1:size(patterns,1);

%And classify according to it
if (discrete_dim(dim) == 0),
    %Continuous pattern
    in				= indices(find(patterns(dim, indices) <= tree.split_loc));
    targets		= targets + use_tree(patterns(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
    in				= indices(find(patterns(dim, indices) >  tree.split_loc));
    targets		= targets + use_tree(patterns(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
    %Discrete pattern
    Uf				= unique(patterns(dim,:));
    for i = 1:length(Uf),
        if any(Uf(i) == tree.Nf) %Has this sort of data appeared before? If not, do nothing
            in   	= indices(find(patterns(dim, indices) == Uf(i)));
            targets	= targets + use_tree(patterns(dims, :), in, tree.child(find(Uf(i)==tree.Nf)), discrete_dim(dims), Uc);
        end
    end
end

%END use_tree

function tree = make_tree(patterns, targets, inc_node, discrete_dim, maxNbin, base)
%Build a tree recursively

[Ni, L]    					= size(patterns);
Uc         					= unique(targets);
tree.dim					= 0;
%tree.child(1:maxNbin)	= zeros(1,maxNbin);
tree.split_loc				= inf;

if isempty(patterns),
    return
end

%When to stop: If the dimension is one or the number of examples is small
if ((inc_node > L) | (L == 1) | (length(Uc) == 1)), %剩余训练集只剩一个，或太小，小于inc_node，或只剩一类，退出
    H					= hist(targets, length(Uc));  %返回类别数的直方图  
    [m, largest] 	= max(H); %更大的一类，m为大的值，即个数，largest为位置，即类别的位置  
    tree.Nf         = [];
    tree.split_loc  = [];
    tree.child	 	= Uc(largest);%直接返回其中更大的一类作为其类别  
    return
end

%Compute the node's I
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;
end
Inode = -sum(Pnode.*log(Pnode)/log(2));

%For each dimension, compute the gain ratio impurity
%This is done separately for discrete and continuous patterns
delta_Ib    = zeros(1, Ni);
split_loc	= ones(1, Ni)*inf;

for i = 1:Ni,
    data	= patterns(i,:);
    Ud      = unique(data);
    Nbins	= length(Ud);
    if (discrete_dim(i)),
        %This is a discrete pattern
        P	= zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices 	= find((targets == Uc(j)) & (patterns(i,:) == Ud(k)));
                P(j,k) 	= length(indices);
            end
        end
        Pk          = sum(P);
        P           = P/L;
        Pk          = Pk/sum(Pk);
        info        = sum(-P.*log(eps+P)/log(2));
        delta_Ib(i) = (Inode-sum(Pk.*info))/-sum(Pk.*log(eps+Pk)/log(2));
    else
        %This is a continuous pattern
        P	= zeros(length(Uc), 2);

        %Sort the patterns
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);

        %Calculate the information for each possible split
        I	= zeros(1, L-1);
        for j = 1:L-1,
            %for k =1:length(Uc),
            %    P(k,1) = sum(sorted_targets(1:j)        == Uc(k));
            %    P(k,2) = sum(sorted_targets(j+1:end)    == Uc(k));
            %end
            P(:, 1) = hist(sorted_targets(1:j) , Uc);
            P(:, 2) = hist(sorted_targets(j+1:end) , Uc);
            Ps		= sum(P)/L;
            P		= P/L;
            
            Pk      = sum(P);            
            P1      = repmat(Pk, length(Uc), 1);
            P1      = P1 + eps*(P1==0);
            
            info	= sum(-P.*log(eps+P./P1)/log(2));
            I(j)	= Inode - sum(info.*Ps);
        end
        [delta_Ib(i), s] = max(I);
        split_loc(i) = sorted_data(s);
    end
end

%Find the dimension minimizing delta_Ib
[m, dim]    = max(delta_Ib);
dims        = 1:Ni;
tree.dim    = dim;

%Split along the 'dim' dimension
Nf		= unique(patterns(dim,:));
Nbins	= length(Nf);
tree.Nf = Nf;
tree.split_loc      = split_loc(dim);

%If only one value remains for this pattern, one cannot split it.
if (Nbins == 1)
    H				= hist(targets, length(Uc));
    [m, largest] 	= max(H);
    tree.Nf         = [];
    tree.split_loc  = [];
    tree.child	 	= Uc(largest);
    return
end

if (discrete_dim(dim)),
    %Discrete pattern
    for i = 1:Nbins,
        indices         = find(patterns(dim, :) == Nf(i));
        tree.child(i)	= make_tree(patterns(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin, base);
    end
else
    %Continuous pattern
    indices1		   	= find(patterns(dim,:) <= split_loc(dim));
    indices2	   		= find(patterns(dim,:) > split_loc(dim));
    if ~(isempty(indices1) | isempty(indices2))
        tree.child(1)	= make_tree(patterns(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin, base+1);
        tree.child(2)	= make_tree(patterns(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin, base+1);
    else
        H				= hist(targets, length(Uc));
        [m, largest] 	= max(H);
        tree.child	 	= Uc(largest);
        tree.dim                = 0;
    end
end