【代码在抽取训练集和测试集部分出错】
收获:
1.0 得知libsvm对于多分类问题默认使用的就是1 vs 1;
2.0 得知分类标签常用+1,-1,但是是任意选择的,对于多分类问题,建议选择1,2,3...
3.0 交叉验证寻找最优的c和gamma效果不错,但是用时过长。
4.0 libsvm的详细用法和参数解析:上一页。
1.采用交叉验证法获得的最大识别率为98.75%,与不使用交叉验证的结果区别还是很大的。
2.1 vs rest 的代码待写…
3.matlab训练代码如下:
% LIBSVM handles multi-class problems with 1-vs-1 by default;
% a 1-vs-rest scheme could be written by hand instead.
clc;
clear all;

% Open the Iris data set.
% BUG FIX: fopen returns -1 on failure, never 2 (2 is stderr's file id),
% so the original check `f_id==2` could never detect an open error.
f_id = fopen('iris.data');
if (f_id==-1)
disp('打开文件失败');
return;
end
% Read every record and split the samples by class.
% Each line looks like "5.1,3.5,1.4,0.2,Iris-setosa": four numeric
% features followed by the class name, whose characters fscanf returns
% as numeric char codes appended after the four floats.
xapp = []; % all feature vectors, one 4x1 column per sample
tagSetData = []; % Iris-setosa samples     (discriminating char 'o')
tagVerData = []; % Iris-versicolor samples (discriminating char 's')
tagVirData = []; % Iris-virginica samples  (discriminating char 'g')
olabel = []; % numeric labels (1) for setosa
slabel = []; % numeric labels (2) for versicolor
glabel = []; % numeric labels (3) for virginica
while ~feof(f_id)
c = fscanf(f_id,'%f,%f,%f,%f,%s',[1,5]); % one record per iteration
% NOTE(review): the size spec [1,5] looks inconsistent with indexing
% c(13) below, since each %s character counts as one output element —
% confirm fscanf actually returns the full record here.
vec = [c(1);c(2);c(3);c(4)]; % the four numeric features
xapp = [xapp,vec];
% c(13) is the 9th character of the class name — the first position at
% which "Iris-setosa" ('o'), "Iris-versicolor" ('s') and
% "Iris-virginica" ('g') differ unambiguously.
if (c(13)==111) % 'o' -> setosa, label 1
tagSetData = [tagSetData,vec];
olabel = [olabel,1];
elseif (c(13)==115) % 's' -> versicolor, label 2
tagVerData = [tagVerData,vec];
slabel = [slabel,2];
elseif (c(13)==103) % 'g' -> virginica, label 3
tagVirData = [tagVirData,vec];
glabel = [glabel,3];
end
end
fclose(f_id);
% Full data set: concatenate the three classes; columns are samples.
all_data = [tagSetData,tagVerData,tagVirData];
all_label = [olabel,slabel,glabel];
% Shuffle the samples, then take the first 80 for training and the
% remaining ones for testing.
x_train = [];
y_train = [];
trainNumber = 80;
[N,M] = size(all_data);
p = randperm(M); % random permutation of the sample (column) indices
for i = 1:trainNumber
x_train = [x_train,all_data(:,p(i))]; % feature column
y_train= [y_train,all_label(p(i))]; % matching label
end
x_train = x_train'; % LIBSVM expects samples as rows
y_train = y_train';
% Build the test set from the remaining permuted indices.
% BUG FIX: the original indexed `all_data(i:p(i))`, a linear index *range*,
% instead of the permuted column `all_data(:,p(i))`, so the test features
% did not correspond to their labels (the error the header note mentions).
x_test = [];
y_test = [];
for i =trainNumber+1:M
x_test = [x_test,all_data(:,p(i))];
y_test = [y_test,all_label(p(i))];
end
x_test = x_test';
y_test = y_test';
% Standardize the features (z-score per column).
% BUG FIX: the test set must be standardized with the TRAINING set's mean
% and std, not with its own statistics — otherwise training and test
% features live on different scales and the learned model does not
% transfer to the test data.
avgX = mean(x_train); % per-feature mean, computed on the training set only
stdX = std(x_train);  % per-feature std,  computed on the training set only
[Tn,Tm] = size(x_train);
for i = 1:Tn % standardize each training sample (row)
x_train(i,:) = (x_train(i,:)-avgX)./stdX;
end
[Tn,Tm] = size(x_test);
for i = 1:Tn % apply the SAME training-set transform to each test sample
x_test(i,:) = (x_test(i,:)-avgX)./stdX;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SVM with Gaussian (RBF) kernel, K(x1,x2) = exp{-||x1-x2||^2/gamma}.
% Stage 1: coarse grid search for C and gamma on an exponential scale,
% as recommended in "A Practical Guide to Support Vector Classification".
CScale = [-5, -3, -1, 1, 3, 5,7,9,11,13,15];
gammaScale = [-15,-13,-11,-9,-7,-5,-3,-1,1,3];
C = 2.^CScale;
gamma = 2.^gammaScale;
% Evaluate every (C, gamma) pair with 5-fold cross-validation and record
% the indices of the best-scoring combination.
maxRecognitionRate = 0;
for ci = 1:length(C)
for gj = 1:length(gamma)
% -t 2: RBF kernel; -c/-g: hyper-parameters; -v 5: 5-fold CV accuracy
cmd=['-t 2 -c ',num2str(C(ci)),' -g ',num2str(gamma(gj)),' -v 5'];
recognitionRate = svmtrain(y_train,x_train,cmd);
if recognitionRate>maxRecognitionRate
maxRecognitionRate = recognitionRate;
maxCIndex = ci;
maxGammaIndex = gj;
end
end
end
% Stage 2: search for the optimal C and gamma on a finer scale around the
% best exponents found by the coarse search. Because CScale/gammaScale are
% increasing, the refined interval spans from the midpoint with the
% previous scale entry to the midpoint with the next one (clamped at the
% array ends), subdivided into n equal steps.
n = 10;
minScale = 0.5*(CScale(max(1,maxCIndex-1)) + CScale(maxCIndex));
maxScale = 0.5*(CScale(min(length(CScale),maxCIndex+1)) +CScale(maxCIndex));
newCScale = [minScale:(maxScale-minScale)/n:maxScale];
minGammaScale = 0.5*(gammaScale(max(1,maxGammaIndex-1))+gammaScale(maxGammaIndex));
maxGammaScale = 0.5*(gammaScale(min(length(gammaScale),maxGammaIndex+1))+gammaScale(maxGammaIndex));
newGammaScale = [minGammaScale:(maxGammaScale-minGammaScale)/n:maxGammaScale];
newC = 2.^newCScale;
newGamma = 2.^newGammaScale;
% NOTE(review): maxC and maxG record the loop *indices* (i, j) of the
% best pair, NOT the C/gamma values themselves — any later use must index
% into newC/newGamma with them.
maxRecognitionRate = 0;
for i = 1:length(newC) % locate the indices of the best hyper-parameters
for j = 1:length(newGamma) % LIBSVM training options:
cmd=['-t 2 -c ',num2str(newC(i)),' -g ',num2str(newGamma(j)),' -v 5'];
recognitionRate = svmtrain(y_train,x_train,cmd); % 5-fold CV accuracy
if recognitionRate>maxRecognitionRate
maxRecognitionRate = recognitionRate;
maxC = i;
maxG = j;
end
end
end
% Train the final model with the best C and gamma found above.
% BUG FIX: maxC/maxG hold the *indices* of the best pair from the fine
% grid search, not the values. The original passed the raw indices to
% -c/-g, discarding the cross-validated search and training with
% essentially arbitrary hyper-parameters.
cmd=['-t 2 -c ',num2str(newC(maxC)),' -g ',num2str(newGamma(maxG))];
model = svmtrain(y_train,x_train,cmd);
% Persist the model and the data splits for later evaluation.
save model.mat model;
save x_train.mat x_train;
save y_train.mat y_train;
save x_test.mat x_test;
save y_test.mat y_test;