regiongroup matlab: can anyone explain what the region argument in this MATLAB C4.5 implementation actually is??

This code implements Quinlan's C4.5 decision tree algorithm for classification. The data can optionally be preprocessed with PCA (that step is commented out in the listing below); the tree is then built recursively, and a decision surface is generated from the tree over the region defined by the region argument. While the tree is being built, the attribute to split on at each node is chosen by comparing the information gain ratio of the features. Finally, the tree is used to classify the grid of samples covering the decision region.
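For context, the region argument is only used to define the rectangular grid on which the decision surface D is evaluated: region = [x_min x_max y_min y_max number_of_points]. A minimal usage sketch (the two-class Gaussian toy data and all values below are my own illustration, not from the original post):

% Toy 2-D data: one sample per column, two classes
train_features = [randn(2,50), randn(2,50) + 2];   % 2 x 100 feature matrix
train_targets  = [zeros(1,50), ones(1,50)];        % 1 x 100 class labels

region = [-4 6 -4 6 100];   % [x_min x_max y_min y_max number_of_points_per_axis]

% inc_node = 5 means "stop splitting nodes holding fewer than 5% of the samples"
D = C4_5(train_features, train_targets, 5, region);

% D is a 100 x 100 grid of class labels; display it as the decision surface
imagesc(linspace(region(1), region(2), region(5)), ...
        linspace(region(3), region(4), region(5)), D);
axis xy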

function D = C4_5(train_features, train_targets, inc_node, region)

% Classify using Quinlan's C4.5 algorithm
%
% Inputs:
%   train_features - Training features (one sample per column)
%   train_targets  - Training targets (class labels)
%   inc_node       - Percentage of incorrectly assigned samples at a node.
%                    (My understanding: this is the recursion's stopping
%                    condition, essentially the largest number of samples a
%                    leaf node may contain. If inc_node is too large the
%                    classification accuracy drops; if it is too small the
%                    tree may overfit.)
%   region         - Decision region vector: [-x x -y y number_of_points]
%
% Outputs:
%   D              - Decision surface
%
% NOTE: In this implementation it is assumed that a feature vector with fewer
% than 10 unique values (the parameter Nu) is discrete, and will be treated as
% such. Other feature vectors will be treated as continuous.

[Ni, M]     = size(train_features);   % M = number of samples, Ni = feature dimension
inc_node    = inc_node*M/100;         % convert the percentage into an absolute sample count
Nu          = 10;

% For the decision region
N           = region(5);
mx          = ones(N,1) * linspace(region(1), region(2), N);
my          = linspace(region(3), region(4), N)' * ones(1,N);
flatxy      = [mx(:), my(:)]';
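% mx holds the x-coordinate and my the y-coordinate of every point of an N-by-N
% grid covering the rectangle [region(1),region(2)] x [region(3),region(4)];
% flatxy stacks them into a 2 x N^2 matrix of grid points. These grid points are
% what the finished tree classifies at the end of this function, producing the
% N-by-N decision surface D.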

% Preprocessing (the PCA step is left commented out)
%[f, t, UW, m]   = PCA(train_features, train_targets, Ni, region);
%train_features  = UW * (train_features - m*ones(1,M));
%flatxy          = UW * (flatxy - m*ones(1,N^2));

% Find which of the input features are discrete, and discretize the
% corresponding dimension on the decision region
discrete_dim = zeros(1,Ni);
for i = 1:Ni,
    Nb = length(unique(train_features(i,:)));
    if (Nb <= Nu),
        % This is a discrete feature: remember how many values it takes and
        % bin the corresponding grid coordinates into the same Nb levels
        discrete_dim(i)  = Nb;
        [H, flatxy(i,:)] = high_histogram(flatxy(i,:), Nb);
    end
end

% Build the tree recursively
disp('Building tree')
tree        = make_tree(train_features, train_targets, inc_node, discrete_dim, max(discrete_dim), 0);

% Make the decision region according to the tree
disp('Building decision surface using the tree')
targets     = use_tree(flatxy, 1:N^2, tree, discrete_dim, unique(train_targets));
D           = reshape(targets, N, N);
%END

function targets = use_tree(features, indices, tree, discrete_dim, Uc)
% Classify recursively using a tree

targets = zeros(1, size(features,2));

if (tree.dim == 0)
    % Reached the end of the tree (a leaf): assign the stored class label
    targets(indices) = tree.child;
    return
end

% This is not the last level of the tree, so:
% First, find the dimension we are to work on
dim  = tree.dim;
dims = 1:size(features,1);

% And classify according to it
if (discrete_dim(dim) == 0),
    % Continuous feature: binary split at tree.split_loc
    in      = indices(find(features(dim, indices) <= tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(1), discrete_dim(dims), Uc);
    in      = indices(find(features(dim, indices) >  tree.split_loc));
    targets = targets + use_tree(features(dims, :), in, tree.child(2), discrete_dim(dims), Uc);
else
    % Discrete feature: one child per unique value
    Uf = unique(features(dim,:));
    for i = 1:length(Uf),
        in      = indices(find(features(dim, indices) == Uf(i)));
        targets = targets + use_tree(features(dims, :), in, tree.child(i), discrete_dim(dims), Uc);
    end
end
%END use_tree

function tree = make_tree(features, targets, inc_node, discrete_dim, maxNbin, base)
% Build a tree recursively

[Ni, L]        = size(features);   % L = number of samples, Ni = feature dimension
Uc             = unique(targets);  % Uc holds the distinct class labels
tree.dim       = 0;
%tree.child(1:maxNbin)  = zeros(1,maxNbin);
tree.split_loc = inf;              % inf = positive infinity

if isempty(features),
    return
end

% When to stop: if the dimension is one or the number of examples is small
if ((inc_node > L) || (L == 1) || (length(Uc) == 1)),
    H            = hist(targets, length(Uc));
    [m, largest] = max(H);         % largest indexes the class label that occurs most often (m times)
    tree.child   = Uc(largest);    % a leaf stores the majority class label
    return
end

% Compute the node's entropy I
for i = 1:length(Uc),
    Pnode(i) = length(find(targets == Uc(i))) / L;   % fraction of the samples belonging to each class
end
Inode = -sum(Pnode.*log(Pnode)/log(2));              % entropy of the node, in bits
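% e.g. with two equally frequent classes, Pnode = [0.5 0.5] and Inode = 1 bit;
% with Pnode = [0.9 0.1] the node is purer and Inode is roughly 0.47 bits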

% For each dimension, compute the gain ratio impurity
% This is done separately for discrete and continuous features
delta_Ib  = zeros(1, Ni);
split_loc = ones(1, Ni)*inf;

for i = 1:Ni,
    data  = features(i,:);
    Nbins = length(unique(data));   % number of distinct values taken by attribute i
    if (discrete_dim(i)),
        % This is a discrete feature
        P = zeros(length(Uc), Nbins);
        for j = 1:length(Uc),
            for k = 1:Nbins,
                indices = find((targets == Uc(j)) & (features(i,:) == k));   % I think this should be (features(i,:) == unique(data)(k)), shouldn't it?
                P(j,k)  = length(indices);   % number of samples of class j taking the k-th attribute value
            end
        end
        Pk   = sum(P);              % column sums: number of samples per attribute value
        P    = P/L;
        Pk   = Pk/sum(Pk);
        info = sum(-P.*log(eps+P)/log(2));
        % Gain ratio Gain(A)/SplitInfo(A), where Gain(A) = Inode - sum(Pk.*info) is the
        % information gain of attribute A, and SplitInfo(A) = -sum(Pk.*log(eps+Pk)/log(2))
        % is the information contained in the split on attribute A
        delta_Ib(i) = (Inode - sum(Pk.*info)) / (-sum(Pk.*log(eps+Pk)/log(2)));
    else
        % This is a continuous feature
        P = zeros(length(Uc), 2);
        % Sort the feature values
        [sorted_data, indices] = sort(data);
        sorted_targets = targets(indices);
        % Calculate the information for each possible split
        I = zeros(1, L-1);
        for j = 1:L-1,
            for k = 1:length(Uc),
                P(k,1) = length(find(sorted_targets(1:j)     == Uc(k)));
                P(k,2) = length(find(sorted_targets(j+1:end) == Uc(k)));
            end
            Ps   = sum(P)/L;
            P    = P/L;
            info = sum(-P.*log(eps+P)/log(2));
            I(j) = Inode - sum(info.*Ps);
        end
        [delta_Ib(i), s] = max(I);          % pick the split with the largest information gain
        split_loc(i)     = sorted_data(s);  % the split location for attribute i is the value giving the largest gain
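        % Note: in this continuous branch the attribute is scored by the raw
        % information gain I(j); it is not divided by the split information,
        % so only the discrete branch above actually uses the gain ratio.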

    end
end

% Find the dimension maximizing delta_Ib
[m, dim] = max(delta_Ib);
dims     = 1:Ni;
tree.dim = dim;

% Split along the 'dim' dimension
Nf    = unique(features(dim,:));
Nbins = length(Nf);
if (discrete_dim(dim)),
    % Discrete feature: one child per value of the chosen attribute
    for i = 1:Nbins,
        indices       = find(features(dim, :) == Nf(i));   % samples whose value of attribute dim is Nf(i)
        tree.child(i) = make_tree(features(dims, indices), targets(indices), inc_node, discrete_dim(dims), maxNbin, base);   % recurse
    end
else
    % Continuous feature: binary split at the best threshold
    tree.split_loc = split_loc(dim);   % split value
    indices1       = find(features(dim,:) <= split_loc(dim));
    indices2       = find(features(dim,:) >  split_loc(dim));
    tree.child(1)  = make_tree(features(dims, indices1), targets(indices1), inc_node, discrete_dim(dims), maxNbin, base);   % recurse
    tree.child(2)  = make_tree(features(dims, indices2), targets(indices2), inc_node, discrete_dim(dims), maxNbin, base);   % recurse
end
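For anyone trying to follow the gain-ratio computation in the discrete branch, here is a tiny self-contained numeric check that mirrors the same expressions from the listing (the 8-sample toy data and the attrib variable are my own illustration):

targets = [1 1 1 1 2 2 2 2];   % four samples of class 1, four of class 2, so Inode = 1 bit
attrib  = [1 1 1 2 1 2 2 2];   % a hypothetical discrete attribute taking the values 1 and 2
L       = length(targets);
Inode   = 1;

counts = zeros(2,2);           % rows = classes, columns = attribute values
for j = 1:2,
    for k = 1:2,
        counts(j,k) = length(find((targets == j) & (attrib == k)));
    end
end

P  = counts/L;                 % normalized exactly as P is in the listing
Pk = sum(counts)/L;            % fraction of samples per attribute value: [0.5 0.5]

info       = sum(-P.*log(eps+P)/log(2));    % per-value information terms
gain       = Inode - sum(Pk.*info);         % information gain, roughly 0.094 here
split_info = -sum(Pk.*log(eps+Pk)/log(2));  % split information, 1 bit for a 50/50 split
gain_ratio = gain/split_info                % this is what delta_Ib(i) holds for a discrete attribute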
