1. Spam Email Detection using SVM
使用的是LibSVM库,最初用Matlab编写,之后把训练好的模型集成到之前做的Web Demo里面
基本过程:
(1)打乱样本,分成train,validation,test三部分(下面的Matlab代码只划分了train和test,validation由LibSVM的5折交叉验证`-v 5`代替)
(2)确认kernel type,如果用高斯核,就用validation set确认最佳的C,Gamma
(3)用最佳的参数进行训练
% Spam e-mail detection with an RBF-kernel SVM (LibSVM MATLAB interface).
%
% Steps:
%   1. Shuffle the rows of `data` and split them into a training part
%      (about 10% of the rows) and a testing part (the remainder).
%   2. Grid-search C and gamma with 5-fold cross-validation, first on a
%      coarse log2 grid and then on a finer grid around the coarse optimum.
%   3. Train the final model with the best parameters, evaluate it on the
%      testing part and save the model, predictions and test data to disk.
%
% Columns 1..6 of `data` are used as features and column 7 as the label.
clear all
clc
load data.mat
[N,M] = size(data);
p = randperm(N); % random permutation of the row indices (shuffles the samples)

% N*0.1 is generally not an exact integer (even 0.1*30 is 3.0000000000000004),
% which would produce non-integer subscripts in the test split below, so the
% count is rounded and kept at >= 1.
numberOfSamplesForTraining = max(1, round(N * 0.1));
Training = data(p(1:numberOfSamplesForTraining), :);
Testing = data(p(numberOfSamplesForTraining+1:end), :);
xTraining = Training(:, 1:6);
yTraining = Training(:, 7);
xTesting = Testing(:, 1:6);
yTesting = Testing(:, 7);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% SVM Gaussian kernel
% Search for the C and gamma that maximise the cross-validated recognition
% rate. LibSVM's RBF kernel is K(x1,x2) = exp(-gamma*||x1-x2||^2), i.e. the
% '-g' option MULTIPLIES the squared distance.
% First search on a coarse scale (as recommended in 'A Practical Guide to
% Support Vector Classification').
CScale = [-5, -3, -1, 1, 3, 5, 7, 9, 11, 13, 15];
gammaScale = [-15, -13, -11, -9, -7, -5, -3, -1, 1, 3];
C = 2.^CScale;
gamma = 2.^gammaScale;

% Start from -Inf so that the first grid point is always accepted; with a
% start value of 0 the indices stay undefined if every rate is exactly 0.
maxRecognitionRate = -Inf;
for i = 1:length(C)
    for j = 1:length(gamma)
        % '-t 2' selects the RBF kernel; with '-v 5' svmtrain returns the
        % 5-fold cross-validation accuracy instead of a model.
        cmd = ['-t 2 -c ', num2str(C(i)), ' -g ', num2str(gamma(j)), ' -v 5'];
        recognitionRate = svmtrain(yTraining, xTraining, cmd);
        if recognitionRate > maxRecognitionRate
            maxRecognitionRate = recognitionRate;
            maxCIndex = i;
            maxGammaIndex = j;
            fprintf('coarse search: best rate so far = %g\n', maxRecognitionRate);
        end
    end
end

% Then search on a refined scale: n+1 points between the midpoints to the
% neighbouring coarse grid values (clamped at the ends of the coarse grid).
n = 10;
minCScale = 0.5*(CScale(max(1,maxCIndex-1)) + CScale(maxCIndex));
maxCScale = 0.5*(CScale(min(length(CScale),maxCIndex+1)) + CScale(maxCIndex));
newCScale = minCScale:(maxCScale-minCScale)/n:maxCScale;
minGammaScale = 0.5*(gammaScale(max(1,maxGammaIndex-1)) + gammaScale(maxGammaIndex));
maxGammaScale = 0.5*(gammaScale(min(length(gammaScale),maxGammaIndex+1)) + gammaScale(maxGammaIndex));
newGammaScale = minGammaScale:(maxGammaScale-minGammaScale)/n:maxGammaScale;
newC = 2.^newCScale;
newGamma = 2.^newGammaScale;

maxRecognitionRate = -Inf;
for i = 1:length(newC)
    for j = 1:length(newGamma)
        cmd = ['-t 2 -c ', num2str(newC(i)), ' -g ', num2str(newGamma(j)), ' -v 5'];
        recognitionRate = svmtrain(yTraining, xTraining, cmd);
        if recognitionRate > maxRecognitionRate
            maxRecognitionRate = recognitionRate;
            maxC = newC(i);
            maxGamma = newGamma(j);
            fprintf('refined search: best rate so far = %g\n', maxRecognitionRate);
        end
    end
end

% Train the SVM model with the optimal C and gamma (no '-v', so a model
% structure is returned).
cmd = ['-t 2 -c ', num2str(maxC), ' -g ', num2str(maxGamma)];
model = svmtrain(yTraining, xTraining, cmd);
save model.mat model;

% Test the model on the remaining testing data and obtain the recognition rate.
load model.mat;
[yPred, accuracy, decisionValues] = svmpredict(yTesting, xTesting, model);
save yPred.mat yPred;
save decisionValues.mat decisionValues;
save xTesting.mat xTesting;
save yTesting.mat yTesting;
在Java里面写的时候,先把模型训练好,保存起来,收到邮件就先load model,再对其predict,把结果一并储存在DataBase中
2. 兵王问题 using SVM && NN
SVM就不说了,类似之前的,用NN的话也是用Matlab里面的神经网络toolbox中的newpr,用newff效果不好
% Pattern-recognition network built with newpr (it performs a lot better
% than newff here).
% The network is trained on the first half of the columns of xapp/yapp and
% then scored on all columns, so the printed ratio includes the training half.
[nRows, nCols] = size(xapp);
halfCols = floor(nCols * 0.5);
trainCols = 1:halfCols;
X = xapp(:, trainCols);
y = yapp(:, trainCols);

% Ten hidden layers with 20 units each.
hiddenSizes = repmat(20, 1, 10);
net = newpr(X, y, hiddenSizes);
net.trainParam.epochs = 1000;
net.trainParam.lr = 0.1;
net.trainParam.goal = 0.00004;
net = train(net, X, y);

% Predicted / true class = row index of the largest entry in each column.
scores = sim(net, xapp);
[unusedMax, predictedClass] = max(scores);
[unusedMax, trueClass] = max(yapp);

% Fraction of columns whose predicted class equals the true class
% (left without a semicolon so the value is displayed).
ratio = sum(trueClass == predictedClass) / length(trueClass)
而且从结果来看:SVM只需要少量的数据样本就可以达到比较高的accuracy
3. Recommend System using collaborative filtering
直接用的之前Coursera上ML课程的代码,把电影的feature和用户偏好(喜欢什么类型电影)的feature放到同一个objective function(目标函数)里面一起求极值
因为最后rating要求是1-5的整数,所以算出double矩阵后四舍五入,这时发现train accuracy竟然只有50%,后来允许有1的误差就达到了95%,感觉准确率还行
4. 图片压缩 using K-Means
implement by myself in Java & 集成到之前的Web Demo
遇到的一个坑爹问题就是数组的引用,直接把center引用指向原始二维数组的话,当累加二维数组时,center也相当于累加了,所以最后就出现了灰度值大于255的情况
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.Arrays;
import java.util.Random;
import javax.imageio.ImageIO;
/*
* k cluster
* convert to double
*/
public class Cluster {
public static double[][] image;
public static double[][] center;
public static int[] assign;
public static double[][] distance;
static int h, w;
static int K = 16;
public static void main(String[] args) throws Exception {
run("bird_small.png");
for(double[] c : center)
System.out.println(Arrays.toString(c));
}
public static void run(String file) throws Exception {
readImg(file);
initial();
runKmeans();
}
private static void readImg(String file) throws Exception {
File f = new File(file);
BufferedImage img = ImageIO.read(f);
w = img.getWidth();
h = img.getHeight();
// System.out.println(img.getType());
image = new double[w*h][3];
for(int i=0; i<w; i++)
for(int j=0; j<h; j++) {
int rgb = img.getRGB(i, j);
String s = Integer.toBinaryString(rgb);
int id = i*h+j;
image[id][0] = Integer.valueOf(s.substring(24, 32), 2)/255.0;
image[id][1] = Integer.valueOf(s.substring(16, 24), 2)/255.0;
image[id][2] = Integer.valueOf(s.substring(8, 16), 2)/255.0;
}
}
private static void initial() {
center = new double[K][3];
assign = new int[image.length];
distance = new double[image.length][K];
Random random = new Random();
for(int i=0; i<K; i++)
center[i] = image[random.nextInt(image.length)];
}
private static void runKmeans() {
// based on max iterations
for(int i=0; i<10; i++) {
System.out.println(i+1 + " iteration ... ");
findColsest();
reCalCenter();
}
}
private static void findColsest() {
// compute distance to center
for(int i=0; i<distance.length; i++) {
for(int j=0; j<distance[0].length; j++) {
double d = (image[i][0]-center[j][0])*(image[i][0]-center[j][0])
+ (image[i][1]-center[j][1])*(image[i][1]-center[j][1])
+ (image[i][2]-center[j][2])*(image[i][2]-center[j][2]);
distance[i][j] = d;
}
}
for(int i=0; i<assign.length; i++) {
double min = distance[i][0];
int minIdx = 0;
for(int j=1; j<center.length; j++) {
if(distance[i][j] < min) {
min = distance[i][j];
minIdx = j;
}
}
assign[i] = minIdx;
}
}
private static void reCalCenter() {
for(int i=0; i<K; i++) {
center[i][0]=0;center[i][1]=0;center[i][2]=0;
int cnt = 0;
for(int j=0; j<assign.length; j++) {
if(assign[j] == i) {
center[i][0]+=image[j][0];
center[i][1]+=image[j][1];
center[i][2]+=image[j][2];
cnt++;
}
}
if(cnt == 0) {
Random random = new Random();
center[i] = image[random.nextInt(image.length)];
} else {
center[i][0] /= cnt;
center[i][1] /= cnt;
center[i][2] /= cnt;
}
}
}
}
5. MNIST using NN && CNN
用的是TensorFlow,就是官网的例程,Deep Learning已经碾压了其他的方法
Coursera上也有用NN实现的版本