function [ result ] = TDW_multiclass( TrainingData_File)
%% One-vs-one extension of the TDW region analysis to multi-class data
% result = TDW_multiclass(TrainingData_File)
%
% Loads TrainingData_File, splits it with the external helper Data, runs
% TDW_Binary on every pair of classes and majority-votes, per sample, the
% region code returned by the pairwise runs (1 = positive region,
% 2 = boundary region, -1 = negative region). The totals per region are then
% turned into a dataset category.
%
% Input
%   TrainingData_File  file name passed to load().
% Output
%   result             'Outlier', 'Overlapping' or 'Inter|Intar'.
%
% After transposition:
%   trainX  one row per sample (feature vectors).
%   trainY  n-by-m label matrix, one row per sample, m = number of classes;
%           trainY(i,j) == 1 marks sample i as a member of class j. The
%           original notes say all other entries of the row are -1.
% NOTE(review): the shapes returned by Data are not visible in this file -
% confirm. With a single label column (m == 1) there are no pairwise results
% to vote on and the voting step fails on indexing.

%% Get the detail of dataset
train_data = load(TrainingData_File);
[Y, X,~,~, ~,~] = Data(train_data);
trainY = Y';
trainX = X';
[n,m] = size(trainY);            % n samples, m classes

% One cell per class. Each cell collects, column by column, the region codes
% its samples received in every pairwise run that involved the class.
% (Sized from m instead of a hard-coded four cells.)
Class = cell(1,m);
Sample_area = zeros(n,m);        % voted region of the i-th sample of class r; 0 = unused slot

%% Get the area of every pair of classes
for r = 1:m
    isR    = (trainY(:,r) == 1);
    flagp  = trainX(isR,:);      % samples of class r
    flagpY = trainY(isR,:);
    np     = size(flagp,1);      % number of class-r samples

    for j = r+1:m
        isJ    = (trainY(:,j) == 1);
        flagn  = trainX(isJ,:);  % samples of class j
        flagnY = trainY(isJ,:);
        nn     = size(flagn,1);  % number of class-j samples

        % Region codes for the two classes stacked as [class r; class j].
        flag = TDW_Binary([flagp;flagn],[flagpY;flagnY]);
        Class{1,r} = [Class{1,r} flag(1:np,1)];
        Class{1,j} = [Class{1,j} flag(np+1:np+nn,1)];
    end

    % All pairs involving class r are finished here (earlier classes added
    % their columns in previous iterations), so the vote can be taken now.
    % mode returns the most frequent code and the smallest one on a tie.
    for i = 1:np
        Sample_area(i,r) = mode(Class{1,r}(i,:));
    end
end

%% Classify datasets using formulas
% Row 1: samples voted into the negative region, row 2: positive region,
% row 3: boundary region. Unused slots of Sample_area are 0 and never match.
Sample_Number = zeros(3,1);
Sample_Number(1,1) = nnz(Sample_area == -1);
Sample_Number(2,1) = nnz(Sample_area == 1);
Sample_Number(3,1) = nnz(Sample_area == 2);

Rho = m/n;                                           % decision threshold; how to choose it is still an open question
Ratio_NP = Sample_Number(1,1)/n;                     % share of negative-region samples
Ratio_BN = Sample_Number(3,1)/Sample_Number(1,1);    % boundary-to-negative ratio (Inf/NaN when there are no negative samples)
Ratio_BP = Sample_Number(3,1)/n;                     % share of boundary-region samples

if Ratio_NP > Rho && Ratio_BN < 1
    result = ['Outlier'];
elseif Ratio_BP > 10*Rho && Ratio_BN > 1
    result = ['Overlapping'];
else
    result = ['Inter|Intar'];
end
disp(['DatasetCategory=',result]);
end
function [DatasetCategory] = TDW_Func(TrainingData_File)
%% Categorise a binary dataset from its TDW region counts
% DatasetCategory = TDW_Func(TrainingData_File)
%
% Loads TrainingData_File, splits it with the external helper Data, and for
% every sample:
%   1. measures the Euclidean distance to every other sample,
%   2. keeps the neighbours closer than
%        Delata = dmin + w*(dmax - dmin),   w = 0.1,
%   3. scores the sample with fx = (same - other)/(other + 1), where "same"
%      and "other" count neighbours of the sample's own / the opposite class,
%   4. assigns it to the positive (fx > alph), boundary (beta <= fx <= alph)
%      or negative (fx < beta) region.
% The three region counts decide the returned category, which is also
% printed together with the counts.
%
% Input
%   TrainingData_File  file name passed to load().
% Output
%   DatasetCategory    'Inter_calss', 'Outlier', 'Overlapping' or 'Intraclass'.
%
% Assumes train_target(1,:) holds labels coded as 1 / -1 and P holds one
% column per sample - TODO confirm against Data, which is not in this file.

%% Set dataset and initialization
train_data = load(TrainingData_File);
[train_target, P,NumberofData,NumberofInputNeurons, ~,~] = Data(train_data);

%% Distance between each sample and the rest of the sample
% Column i holds the distances from sample i to all the others, with the
% self-distance removed (row j is sample j for j < i, sample j+1 otherwise).
Distance = zeros(NumberofData-1,NumberofData);
for i = 1:NumberofData
    sq = zeros(1,NumberofData);
    for d = 1:NumberofInputNeurons
        % Accumulate feature by feature, same order as a scalar loop.
        sq = sq + (P(d,i) - P(d,1:NumberofData)).^2;
    end
    dist = sqrt(sq);
    dist(i) = [];
    Distance(:,i) = dist(:);
end

%% Determining the value of neighborhood
% The radius is taken straight from the real distances of each sample.
% (Going through a zero-padded table of unique values made the minimum 0 for
% samples with repeated distances.)
w = 0.1;                           % range (0,1); controls how many neighbours are kept
Delata = zeros(1,NumberofData);    % one radius per sample
if NumberofData > 1
    dmin = min(Distance,[],1);
    dmax = max(Distance,[],1);
    Delata = dmin + w*(dmax - dmin);
end

%% Determining which area the sample is
alph = 5;
beta = -5/6;                       % partition parameters
NumberofPos = 0;
NumberofBnd = 0;
NumberofNeg = 0;                   % number of samples per region
fx = zeros(NumberofData,1);

% lable is the minority label; ties resolve to 1.
labels = train_target(1,:);
if nnz(labels == -1) < nnz(labels == 1)
    lable = -1;
else
    lable = 1;
end

for i = 1:NumberofData
    % Neighbours strictly inside the radius, mapped back to sample indices.
    idx = find(Distance(:,i) < Delata(1,i));
    idx = idx + (idx >= i);
    neigh = labels(idx);
    N1 = nnz(neigh == lable);      % minority-class neighbours
    N0 = nnz(neigh == -lable);     % majority-class neighbours

    % Score relative to the sample's own class. The comparison is made
    % against lable (as TDW_Binary does) so that N1/N0 always mean
    % "same class"/"other class" regardless of which label is the minority.
    if labels(i) == lable
        fx(i) = (N1-N0)/(N0+1);
    elseif labels(i) == -lable
        fx(i) = (N0-N1)/(N1+1);
    end

    if fx(i) > alph
        NumberofPos = NumberofPos+1;
    elseif fx(i) <= alph && fx(i) >= beta
        NumberofBnd = NumberofBnd+1;
    elseif fx(i) < beta
        NumberofNeg = NumberofNeg+1;
    end
end

%% Dataset category from the region counts
if NumberofNeg<0.1*NumberofData && NumberofPos>0.7*NumberofData
    DatasetCategory=['Inter_calss'];
elseif NumberofNeg>0.1*NumberofData && NumberofBnd<0.2*NumberofData
    DatasetCategory=['Outlier'];
elseif NumberofBnd>0.3*NumberofData && NumberofNeg<0.1*NumberofData
    DatasetCategory=['Overlapping'];
else
    DatasetCategory=['Intraclass'];
end
disp(['NumberofPos=',num2str(NumberofPos)]);
disp(['NumberofBnd=',num2str(NumberofBnd)]);
disp(['NumberofNeg=',num2str(NumberofNeg)]);
disp(['DatasetCategory=',DatasetCategory]);
end
% 2020-08-11
% Added the long-overdue TDW_Binary( X,Y ) code.
function [ result ] = TDW_Binary( X,Y )
%% Assign every sample of a two-class set to the positive, boundary or negative region
% result = TDW_Binary(X, Y)
%
% Inputs
%   X  one row per sample (feature vectors).
%   Y  label matrix, one row per sample. The column in which the FIRST
%      sample carries a 1 is taken as the target column; its entries are
%      used as the labels and are expected to be 1 / -1.
% Output
%   result  column vector, one entry per sample:
%             1  positive region  (fx >= alph)
%             2  boundary region  (beta < fx < alph)
%            -1  negative region  (fx <= beta)
%             0  label was neither of the two expected values
%
% Uses the global K: alph = K, beta = -K/(K+1). When K has not been set
% (empty) the value 5 is used, i.e. alph = 5 and beta = -5/6.
%
% For each sample the neighbours are the samples closer than
%   Delata = dmin + w*(dmax - dmin),   w = 0.05,
% and the score is fx = (same - other)/(other + 1), with "same"/"other"
% counting neighbours of the sample's own / the opposite class.

%% Initialization
global K;
if isempty(K)
    % An empty K would make every threshold comparison empty and leave the
    % whole result at 0.
    Kpar = 5;
else
    Kpar = K;
end

a = find(Y(1,:) == 1, 1);          % target column, taken from the first sample
if isempty(a)
    error('TDW_Binary:noTargetColumn', ...
          'The first row of Y must contain a 1 to identify the target column.');
end
train_target = Y(:,a)';            % labels as a row vector
P = X';                            % one column per sample
[NumberofInputNeurons,NumberofData] = size(P);
PBN = zeros(NumberofData,1);

%% Distance between each sample and the rest of the sample
% Column i holds the distances from sample i to all the others, with the
% self-distance removed (row j is sample j for j < i, sample j+1 otherwise).
Distance = zeros(NumberofData-1,NumberofData);
for i = 1:NumberofData
    sq = zeros(1,NumberofData);
    for d = 1:NumberofInputNeurons
        % Accumulate feature by feature, same order as a scalar loop.
        sq = sq + (P(d,i) - P(d,:)).^2;
    end
    dist = sqrt(sq);
    dist(i) = [];
    Distance(:,i) = dist(:);
end

%% Determining the value of neighborhood
% The radius is taken straight from the real distances of each sample.
% (Going through a zero-padded table of unique values made the minimum 0 for
% samples with repeated distances.)
w = 0.05;                          % range (0.01,0.05); controls how many neighbours are kept
Delata = zeros(1,NumberofData);    % one radius per sample
if NumberofData > 1
    dmin = min(Distance,[],1);
    dmax = max(Distance,[],1);
    Delata = dmin + w*(dmax - dmin);
end

%% Determining which area the sample is
alph = Kpar;                       % threshold between positive and boundary region
beta = -Kpar/(Kpar+1);             % threshold between boundary and negative region

% lable is the minority label; ties resolve to 1.
if nnz(train_target == -1) < nnz(train_target == 1)
    lable = -1;
else
    lable = 1;
end

fx = zeros(NumberofData,1);
for i = 1:NumberofData
    % Neighbours strictly inside the radius, mapped back to sample indices.
    % (A K-nearest-neighbour search could be used here instead of the radius.)
    idx = find(Distance(:,i) < Delata(1,i));
    idx = idx + (idx >= i);
    neigh = train_target(1,idx);
    N1 = nnz(neigh == lable);      % minority-class neighbours
    N0 = nnz(neigh == -lable);     % majority-class neighbours

    if train_target(1,i) == lable
        fx(i) = (N1-N0)/(N0+1);
    elseif train_target(1,i) == -lable
        fx(i) = (N0-N1)/(N1+1);
    else
        continue;                  % unexpected label: leave the region code at 0
    end

    if fx(i) >= alph               % positive region
        PBN(i,1) = 1;
    elseif fx(i) > beta            % boundary region
        PBN(i,1) = 2;
    else                           % negative region (fx <= beta)
        PBN(i,1) = -1;
    end
end
result = PBN;
end