java代码-KNN分类器练习
1.下载使用weka jar包。
测试weka并掌握对数据集表格的使用。
package week1;
import java.io.FileReader;
import weka.core.Instances;
import weka.core.Instance;
import weka.core.Attribute;
/**
 * Demonstrates basic access to a Weka {@code Instances} table: printing the
 * whole table, a single instance, a single attribute, individual cell values
 * (in Weka's internal numeric representation) and the class attribute.
 */
public class WekaDataTest {
    /** ARFF file that is read when no path is given on the command line. */
    private static final String DEFAULT_ARFF_PATH = "C:/Users/huoyifeng/Desktop/java jar/weather-original.arff";

    /**
     * The only testing method.
     *
     * @param args
     *            Optional. If present, {@code args[0]} is the path of the ARFF
     *            file to read instead of the default one.
     */
    public static void main(String args[]) {
        String tempFilename = args.length > 0 ? args[0] : DEFAULT_ARFF_PATH;
        Instances tempData = null;
        // try-with-resources closes the reader even if parsing the file fails.
        try (FileReader fileReader = new FileReader(tempFilename)) {
            tempData = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: \r\n" + ee);
            // A non-zero status lets the caller see that the run failed.
            System.exit(1);
        } // Of try

        // Step 1. Show the data.
        System.out.println("\r\n********* Part 1 *********");
        System.out.println("The data table is:\r\n" + tempData);

        // Step 2. Show one instance.
        System.out.println("\r\n********* Part 2 *********");
        System.out.println("The 3rd instance is: \r\n" + tempData.instance(2));

        // Step 3. Show one attribute.
        System.out.println("\r\n********* Part 3 *********");
        System.out.println("The 2nd attribute is: \r\n" + tempData.attribute(1));
        System.out.println("Its number of values is: \r\n" + tempData.attribute(1).numValues());
        System.out.println("The 3rd attribute is: \r\n" + tempData.attribute(2));
        System.out.println("Its number of values is: \r\n" + tempData.attribute(2).numValues());

        // Step 4. Take out one value from the data table.
        // value(j) returns the internal double representation of the cell.
        System.out.println("\r\n********* Part 4 *********");
        System.out.println("The 1st attribute value of the 1st instance is: " + tempData.instance(0).value(0));
        System.out.println("The 3rd attribute value of the 1st instance is: " + tempData.instance(0).value(2));
        System.out.println("The 5th attribute value of the 1st instance is: " + tempData.instance(0).value(4));

        // Step 5. Set the class attribute and show.
        System.out.println("\r\n********* Part 5 *********");
        tempData.setClassIndex(0);
        System.out.println("If we use the 1st attribute as the class, it is: \r\n" + tempData.classAttribute());
        tempData.setClassIndex(4);
        System.out.println("If we use the 5th attribute as the class, it is: \r\n" + tempData.classAttribute());
        System.out.println("The class value of the 1st instance is: " + tempData.instance(0).classValue());
    }// Of main
}// Of class WekaDataTest
测试结果:
第一步:展示数据集;
第二步:instance展示某一行,例如展示第三行数据。
第三步:Attribute管理某一属性,例如第二列的全体属性。
第四步:获得第 i 行第 j 列的数据,用的是内部表示形式。
2. Day51-Day52 KNN基本操作
Day51代码:
package week1;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
import weka.core.*;
/**
 * kNN classification on a Weka dataset whose last attribute is the class.
 * <p>
 * Typical use: construct from an ARFF file, call
 * {@link #splitTrainingTesting(double)}, then {@link #predict()}, and finally
 * {@link #getAccuracy()}.
 *
 * @author hyf
 * Date: 2022/3/13
 */
public class Day51 {
    /**
     * Manhattan distance: the sum of the absolute per-attribute differences.
     */
    public static final int MANHATTAN = 0;

    /**
     * Euclidean distance. See {@link #distance(int, int)} for the exact value
     * that is computed.
     */
    public static final int EUCLIDEAN = 1;

    /**
     * The distance measure in use, {@link #MANHATTAN} or {@link #EUCLIDEAN}.
     */
    public int distanceMeasure = EUCLIDEAN;

    /**
     * The shared random source used to shuffle the data.
     */
    public static final Random random = new Random();

    /**
     * The number of neighbors that take part in the vote.
     */
    int numNeighbors = 7;

    /**
     * The whole dataset.
     */
    Instances dataset;

    /**
     * The training set, represented by indices into the dataset.
     */
    int[] trainingSet;

    /**
     * The testing set, represented by indices into the dataset.
     */
    int[] testingSet;

    /**
     * The predictions, one per entry of the testing set.
     */
    int[] predictions;

    /**
     *********************
     * The first constructor. Reads the dataset and uses its last attribute as
     * the decision class. If the file cannot be read, an error is printed and
     * the JVM exits with a non-zero status.
     *
     * @param paraFilename
     *            The arff filename.
     *********************
     */
    public Day51(String paraFilename) {
        // try-with-resources closes the reader even if parsing fails.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
            // The last attribute is the decision class.
            dataset.setClassIndex(dataset.numAttributes() - 1);
        } catch (Exception ee) {
            System.out.println("Error occurred while trying to read \'" + paraFilename
                    + "\' in Day51 constructor.\r\n" + ee);
            // Non-zero status: the program did not complete successfully.
            System.exit(1);
        } // Of try
    }// Of the first constructor

    /**
     *********************
     * Get random indices for data randomization. A Fisher-Yates shuffle is
     * used so that every permutation is equally likely.
     *
     * @param paraLength
     *            The length of the sequence.
     * @return An array of indices, e.g., {4, 3, 1, 5, 0, 2} with length 6.
     *********************
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];

        // Step 1. Initialize with the identity permutation.
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        } // Of for i

        // Step 2. Shuffle: swap position i with a random position in [0, i].
        for (int i = paraLength - 1; i > 0; i--) {
            int tempOther = random.nextInt(i + 1);
            int tempValue = resultIndices[i];
            resultIndices[i] = resultIndices[tempOther];
            resultIndices[tempOther] = tempValue;
        } // Of for i

        return resultIndices;
    }// Of getRandomIndices

    /**
     *********************
     * Split the data into training and testing parts. The first
     * {@code (int) (size * paraTrainingFraction)} shuffled indices form the
     * training set and the remaining ones form the testing set.
     *
     * @param paraTrainingFraction
     *            The fraction of the training set, in [0, 1].
     * @throws IllegalArgumentException
     *             If the fraction is not in [0, 1].
     *********************
     */
    public void splitTrainingTesting(double paraTrainingFraction) {
        // Written this way so that NaN is rejected as well.
        if (!(paraTrainingFraction >= 0 && paraTrainingFraction <= 1)) {
            throw new IllegalArgumentException(
                    "The training fraction must be in [0, 1], but is " + paraTrainingFraction);
        } // Of if

        int tempSize = dataset.numInstances();
        int[] tempIndices = getRandomIndices(tempSize);
        int tempTrainingSize = (int) (tempSize * paraTrainingFraction);

        trainingSet = Arrays.copyOfRange(tempIndices, 0, tempTrainingSize);
        testingSet = Arrays.copyOfRange(tempIndices, tempTrainingSize, tempSize);
    }// Of splitTrainingTesting

    /**
     *********************
     * Predict for the whole testing set. The results are stored in predictions.
     *
     * @see #predictions
     *********************
     */
    public void predict() {
        predictions = new int[testingSet.length];
        for (int i = 0; i < predictions.length; i++) {
            predictions[i] = predict(testingSet[i]);
        } // Of for i
    }// Of predict

    /**
     *********************
     * Predict for given instance.
     *
     * @param paraIndex
     *            The index of the instance in the dataset.
     * @return The prediction, as a class index.
     *********************
     */
    public int predict(int paraIndex) {
        int[] tempNeighbors = computeNearests(paraIndex);
        return simpleVoting(tempNeighbors);
    }// Of predict

    /**
     *********************
     * The distance between two instances, computed over all attributes except
     * the last one. For {@link #EUCLIDEAN} the sum of squared differences is
     * returned without taking the square root; this does not change which
     * neighbors are nearest. For an unsupported measure a message is printed
     * and 0 is returned.
     *
     * @param paraI
     *            The index of the first instance.
     * @param paraJ
     *            The index of the second instance.
     * @return The distance.
     *********************
     */
    public double distance(int paraI, int paraJ) {
        double resultDistance = 0;
        double tempDifference;
        // Look both instances up once instead of once per attribute.
        Instance tempFirst = dataset.instance(paraI);
        Instance tempSecond = dataset.instance(paraJ);
        int tempNumConditions = dataset.numAttributes() - 1;

        switch (distanceMeasure) {
        case MANHATTAN:
            for (int i = 0; i < tempNumConditions; i++) {
                tempDifference = tempFirst.value(i) - tempSecond.value(i);
                resultDistance += Math.abs(tempDifference);
            } // Of for i
            break;
        case EUCLIDEAN:
            for (int i = 0; i < tempNumConditions; i++) {
                tempDifference = tempFirst.value(i) - tempSecond.value(i);
                resultDistance += tempDifference * tempDifference;
            } // Of for i
            break;
        default:
            System.out.println("Unsupported distance measure: " + distanceMeasure);
        }// Of switch

        return resultDistance;
    }// Of distance

    /**
     *********************
     * Get the accuracy of the classifier on the testing set. The result is
     * NaN when the testing set is empty.
     *
     * @return The accuracy.
     * @throws IllegalStateException
     *             If no predictions have been made yet.
     *********************
     */
    public double getAccuracy() {
        if (predictions == null) {
            throw new IllegalStateException("predict() must be called before getAccuracy().");
        } // Of if

        // A double divides an int gets another double.
        double tempCorrect = 0;
        for (int i = 0; i < predictions.length; i++) {
            if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
                tempCorrect++;
            } // Of if
        } // Of for i

        return tempCorrect / testingSet.length;
    }// Of getAccuracy

    /**
     ************************************
     * Compute the nearest neighbors among the training set. One neighbor is
     * selected in each scan of the training set; ties are resolved in favor of
     * the instance that comes first in the training set. At most
     * {@code trainingSet.length} neighbors are returned, so the result may be
     * shorter than {@code numNeighbors}.
     *
     * @param paraCurrent
     *            current instance. We are comparing it with all others.
     * @return the indices of the nearest instances, nearest first.
     ************************************
     */
    public int[] computeNearests(int paraCurrent) {
        // There cannot be more distinct neighbors than training instances.
        int tempK = Math.max(0, Math.min(numNeighbors, trainingSet.length));
        int[] resultNearests = new int[tempK];
        boolean[] tempSelected = new boolean[trainingSet.length];
        double tempDistance;
        double tempMinimalDistance;
        int tempMinimalIndex = 0;

        // Select the nearest tempK indices.
        for (int i = 0; i < tempK; i++) {
            tempMinimalDistance = Double.MAX_VALUE;

            for (int j = 0; j < trainingSet.length; j++) {
                if (tempSelected[j]) {
                    continue;
                } // Of if

                tempDistance = distance(paraCurrent, trainingSet[j]);
                if (tempDistance < tempMinimalDistance) {
                    tempMinimalDistance = tempDistance;
                    tempMinimalIndex = j;
                } // Of if
            } // Of for j

            resultNearests[i] = trainingSet[tempMinimalIndex];
            tempSelected[tempMinimalIndex] = true;
        } // Of for i

        System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
        return resultNearests;
    }// Of computeNearests

    /**
     ************************************
     * Voting using the instances. Each neighbor votes for its own class; the
     * class with the most votes wins, and on a tie the smallest class index
     * wins.
     *
     * @param paraNeighbors
     *            The indices of the neighbors.
     * @return The predicted label.
     ************************************
     */
    public int simpleVoting(int[] paraNeighbors) {
        int[] tempVotes = new int[dataset.numClasses()];
        for (int i = 0; i < paraNeighbors.length; i++) {
            tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
        } // Of for i

        int tempMaximalVotingIndex = 0;
        int tempMaximalVoting = 0;
        for (int i = 0; i < tempVotes.length; i++) {
            if (tempVotes[i] > tempMaximalVoting) {
                tempMaximalVoting = tempVotes[i];
                tempMaximalVotingIndex = i;
            } // Of if
        } // Of for i

        return tempMaximalVotingIndex;
    }// Of simpleVoting

    /**
     *********************
     * The entrance of the program.
     *
     * @param args
     *            Optional. If present, {@code args[0]} is the arff filename to
     *            use instead of the default one.
     *********************
     */
    public static void main(String args[]) {
        String tempFilename = args.length > 0 ? args[0] : "C:/Users/huoyifeng/Desktop/java jar/iris.arff";
        Day51 tempClassifier = new Day51(tempFilename);
        tempClassifier.splitTrainingTesting(0.8);
        tempClassifier.predict();
        System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
    }// Of main
}// Of class Day51
1.两种距离度量
2.数据随机分割方式.
3.求邻居
4.求最短(投票)
Day52:
重新实现 computeNearests, 仅需要扫描一遍训练集, 即可获得 $k$ 个邻居. 提示: 将现有代码与插入排序思想相结合.
/**
 * Compute the nearest neighbors of an instance with a single scan of the
 * training set. A buffer of at most k candidates is kept sorted by distance
 * (insertion-sort style); every training instance is either inserted at its
 * place in the buffer or discarded when it is not closer than the current
 * k-th candidate. Ties are resolved in favor of the instance that comes
 * first in the training set.
 *
 * @param paraCurrent
 *            The index of the current instance in the dataset.
 * @return The dataset indices of the nearest training instances, nearest
 *         first. Its length is numNeighbors, or the size of the training
 *         set if that is smaller.
 */
public int[] computeNearests(int paraCurrent) {
    // There cannot be more neighbors than training instances.
    int tempK = Math.max(0, Math.min(numNeighbors, trainingSet.length));
    int[] resultNearests = new int[tempK];
    // tempNearestDistances[p] is the distance of resultNearests[p];
    // separate primitive arrays avoid sharing rows between positions.
    double[] tempNearestDistances = new double[tempK];
    // The number of buffer positions that are filled so far.
    int tempCount = 0;

    for (int i = 0; i < trainingSet.length; i++) {
        double tempDistance = distance(paraCurrent, trainingSet[i]);

        int tempPosition;
        if (tempCount < tempK) {
            // The buffer is not full yet: append, then sift towards the front.
            tempPosition = tempCount;
            tempCount++;
        } else if (tempK > 0 && tempDistance < tempNearestDistances[tempK - 1]) {
            // Closer than the current worst candidate: replace it.
            tempPosition = tempK - 1;
        } else {
            continue;
        }

        // Shift farther candidates one position back to make room.
        while (tempPosition > 0 && tempDistance < tempNearestDistances[tempPosition - 1]) {
            tempNearestDistances[tempPosition] = tempNearestDistances[tempPosition - 1];
            resultNearests[tempPosition] = resultNearests[tempPosition - 1];
            tempPosition--;
        }
        tempNearestDistances[tempPosition] = tempDistance;
        resultNearests[tempPosition] = trainingSet[i];
    }

    System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
    return resultNearests;
}
增加 setDistanceMeasure() 方法.
/**
 * Select the distance measure. Any value other than 0 or 1 leaves the
 * current measure unchanged and prints a message.
 *
 * @param paraType
 *            0 selects MANHATTAN, 1 selects EUCLIDEAN.
 */
public void setDistanceMeasure(int paraType) {
    switch (paraType) {
    case 0:
        distanceMeasure = MANHATTAN;
        break;
    case 1:
        distanceMeasure = EUCLIDEAN;
        break;
    default:
        System.out.println("Wrong Distance Measure!!!");
    }
}
/**
 * Runs kNN on the iris data with an explicitly chosen distance measure.
 *
 * @param args
 *            Not used.
 */
public static void main(String[] args) {
    // The extension is ".arff"; the misspelled ".arfff" made the file unreadable.
    KnnClassification tempClassifier = new KnnClassification("C:/Users/huoyifeng/Desktop/java jar/iris.arff");
    // 1 selects the Euclidean measure.
    tempClassifier.setDistanceMeasure(1);
    tempClassifier.splitTrainingTesting(0.8);
    tempClassifier.predict();
    System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
}
增加 setNumNeighbors() 方法.
/**
 * Set the number of neighbors. The value is rejected, with a message and
 * without changing the current setting, when it is smaller than 1 or bigger
 * than the number of instances in the dataset.
 *
 * @param paraNumNeighbors
 *            The number of neighbors to use.
 */
public void setNumNeighbors(int paraNumNeighbors) {
    // Zero or negative k would leave nothing to vote on.
    if (paraNumNeighbors < 1) {
        System.out.println("The number of neighbors must be at least 1!!!");
        return;
    }
    if (paraNumNeighbors > dataset.numInstances()) {
        System.out.println("The number of neighbors is bigger than the number of dataset!!!");
        return;
    }
    numNeighbors = paraNumNeighbors;
}
/**
 * Runs kNN on the iris data with distance measure 1 and 8 neighbors, using
 * 80% of the data for training, and prints the resulting accuracy.
 *
 * @param args
 *            Not used.
 */
public static void main(String[] args) {
    final String irisFile = "C:/Users/huoyifeng/Desktop/java jar/iris.arff";

    KnnClassification knn = new KnnClassification(irisFile);
    knn.setDistanceMeasure(1);
    knn.setNumNeighbors(8);

    knn.splitTrainingTesting(0.8);
    knn.predict();

    String report = "The accuracy of the classifier is: " + knn.getAccuracy();
    System.out.println(report);
}
Glocal 代码+Glocal论文理解+Matlab中manopt的工具箱的理解
Glocal 基础模型:
$Y\in \mathbb{R}^{l\times n}$ 是一个 $l\times n$ 的实际标签矩阵,它的秩是 $k<l$,将它分解为两个矩阵 $U\in \mathbb{R}^{l\times k}$ 和 $V\in \mathbb{R}^{k\times n}$。
$W\in \mathbb{R}^{d\times k}$ 是一个 $d\times k$ 的矩阵。
$U$ 反映原始标签如何与潜在标签相关联。
$V$ 表示潜在标签。
$\|\Pi_{\Omega}(Y-UV)\|$ 是需要最小化的重建误差。
$R(U,V,W)$ 是正则化项,$\lambda$、$\lambda_2$ 是折衷参数。虽然问题(2)中使用的是平方损失,但它可以被任何可微损失函数代替。
$\Omega$ 是包含 $Y$ 中观察到的标签的索引(即 $Y$ 中非零元素的索引)的集合。
Glocal基本模型:对标签矩阵进行低秩分解,得到潜在标签,并学习从特征空间到潜在标签的映射。因此,我们可以得到更紧凑、更抽象的潜在标签表示,它是稠密、实值和低维的。学习从特征空间到潜在标签空间的映射也比学习到原始标签空间的映射容易得多(原始标签空间是稀疏的,二值的以及更高的维度)。此外,它还直接为丢失标签的恢复提供了解决方案。
example代码:
% Example driver: sets up the search path, runs run_arts() and prints the
% recovery and prediction results under one header per experiment.
clc;

% Each folder is added with its own addpath call so that the resulting
% order of the search path is the same as when adding them one by one.
addpath('clf');
addpath('lib');
addpath(genpath('lib/manopt'));
addpath('evl');

[in_result, out_result] = run_arts();

% One header per entry of the result arrays, in the order they are returned.
headers = {'============== 70% label missing: ================', ...
           '============== 30% label missing: ================'};
for idx = 1:numel(headers)
    disp(headers{idx});
    disp('Recovery Result:');
    disp(in_result(idx));
    disp('Prediction Result:');
    disp(out_result(idx));
end
run_arts()函数
function [in_result, out_result] = run_arts()
% RUN_ARTS  Train and evaluate on the data stored in 'dt/Arts_sp.mat'.
%   Runs one training/evaluation round per value of the outer loop variable
%   and collects the evaluation structs returned by evalt.
%
%   in_result  - one row per round: evaluation of the scores U*W'*Xtrn
%                against Ytrn (the training data).
%   out_result - one row per round: evaluation of the scores U*W'*Xtst
%                against Ytst (the test data), with the training time
%                added in the field 'time'.

% Parameters and data are loaded from .mat files.
param = importdata('arts_param.mat');
data = importdata('dt/Arts_sp.mat');
% Solver options: iteration limit, gradient-norm tolerance and a custom
% stopping criterion (see mystopfun).
% NOTE(review): presumably these are passed on to the Manopt solver inside
% MLCTrain - confirm there.
param.tooloptions.maxiter = 30;
param.tooloptions.gradnorm = 1e-3;
param.tooloptions.stopfun = @mystopfun;
out_result = [];
in_result = [];
% 3:4:8 yields j = 3 and j = 7, so genObv is called with 0.3 and then 0.7.
for j=3:4:8
% Reset the global random stream to seed 1 before every round, so each
% round starts from the same random state.
s = RandStream.create('mt19937ar','seed',1);
RandStream.setGlobalStream(s);
% Only the first row of data.train / data.test is used.
for kk = 1:1
Xtrn = data.train{kk,1};
Ytrn = data.train{kk,2};
Xtst = data.test{kk,1};
Ytst = data.test{kk,2};
% NOTE(review): J is presumably a mask of observed labels and 0.1*j the
% observed (or missing) ratio - confirm against genObv.
[J] = genObv( Ytrn, 0.1*j);
% Time the training call only.
tic;
[V,U,W,SP,Beta] = MLCTrain(J,Ytrn, Xtrn, Ytst,Xtst,param);
tm = toc;
% mean works along the first dimension, giving one value per column; the
% columns whose mean is exactly -1 are removed from both Ytst and Xtst.
% NOTE(review): presumably columns are instances and -1 marks a negative
% label, i.e. instances without any positive label are dropped - confirm.
zz = mean(Ytst);
Ytst(:,zz==-1) = [];
Xtst(:,zz==-1) = [];
% Scores for the test data.
tstv = (U*W'*Xtst);
% The third argument is half the range of the scores.
% NOTE(review): presumably used by evalt as a decision threshold - confirm.
ret = evalt(tstv,Ytst, (max(tstv(:))-min(tstv(:)))/2);
ret.time = tm;
out_result = [out_result;ret];
% Same filtering and evaluation for the training data.
zz = mean(Ytrn);
Ytrn(:,zz==-1) = [];
Xtrn(:,zz==-1) = [];
tstv2 = U*W'*Xtrn;
ret = evalt(tstv2,Ytrn, (max(tstv2(:))-min(tstv2(:)))/2);
% No 'time' field is added to the training-data result.
in_result = [in_result;ret];
end
end
end
function stopnow = mystopfun(problem, x, info, last)
% MYSTOPFUN  Stopping criterion based on the recent change of the cost.
%   Returns 0 while fewer than 5 entries of info are available. Otherwise
%   returns true exactly when each of the three consecutive cost changes
%   between info(last-4) and info(last-1) is smaller than 1e-5 in absolute
%   value. The arguments problem and x are not used.
    if last < 5
        stopnow = 0;
        return;
    end
    % Costs of the four entries preceding 'last', oldest first.
    recentCosts = [info(last-4:last-1).cost];
    % diff gives the three consecutive changes; all of them must be small.
    stopnow = all(abs(diff(recentCosts)) < 1e-5);
end
评价指标:
排名损失(Rkl):负面标签排名高于正面标签的分数。
ROC 曲线下的平均面积 (Auc):正实例排名高于负实例的比例,在所有标签上取平均。
覆盖率(Cvg):计算需要多少步才能将预测的标签排名向下移动以覆盖实例的所有正标签。
平均精度 (Ap):这是排名高于特定正标签的正标签的平均分数。
效果:70%缺失标签
============== 70% label missing: ================
Recovery Result:
Coverage: 4.3240
AveragePrecision: 0.6594
RankingLoss: 0.1060
Top1: 0.5850
Top3: 0.3347
Top5: 0.2431
AvgAuc: 0.8969
Prediction Result:
Coverage: 5.8345
AveragePrecision: 0.5607
RankingLoss: 0.1589
Top1: 0.4590
Top3: 0.2755
Top5: 0.2087
AvgAuc: 0.8434
time: 5.9429
效果:30%缺失标签
============== 30% label missing: ================
Recovery Result:
Coverage: 3.1427
AveragePrecision: 0.7378
RankingLoss: 0.0709
Top1: 0.6667
Top3: 0.3861
Top5: 0.2702
AvgAuc: 0.9319
Prediction Result:
Coverage: 5.9285
AveragePrecision: 0.5958
RankingLoss: 0.1562
Top1: 0.5175
Top3: 0.2953
Top5: 0.2144
AvgAuc: 0.8460
time: 3.9977
Manopt工具箱用于流形与矩阵优化;
流形上的优化问题是解决非线性优化问题的一种有效方法。
利用Manopt,可以很容易地处理应用中自然产生的各种约束和对称性问题,如正交性、低秩性、正定性和群组作用下的不变性等。
这些工具也非常适合向量和矩阵的无约束优化。
另外,IvyYin给出了更细致的说明:
Manopt 是一个工具包,它在内部处理了大部分微分几何,从而简化了最先进的黎曼优化算法的使用,用于解决非线性优化问题。它是一个用于流形上优化的工具包;流形优化是非线性优化中一个快速发展的分支,其重点是利用搜索空间的光滑几何结构来设计高效的数值算法。流形优化非常适合处理秩约束和正交性约束。这类结构化约束在机器学习应用中普遍存在,包括低秩矩阵补全、传感器网络定位、独立分量分析、度量学习、降维等。
manopt安装成功显示:
function basicexample
% BASICEXAMPLE  Minimal Manopt run used to check the installation.
%   Minimizes -x'*A*x over the unit sphere for a random symmetric matrix A
%   with the trust-regions solver, then plots the gradient norm against the
%   iteration number.

    % Verify that Manopt was indeed added to the Matlab path.
    if isempty(which('spherefactory'))
        error(['You should first add Manopt to the Matlab path.\n' ...
               'Please run importmanopt first.']);
    end

    % Generate the problem data: a random symmetric n-by-n matrix.
    n = 1000;
    A = randn(n);
    A = .5*(A+A');

    % Create the problem structure.
    manifold = spherefactory(n);
    problem.M = manifold;

    % Define the problem cost function and its gradient.
    % egrad2rgrad converts the Euclidean gradient -2*A*x into the
    % Riemannian gradient on the manifold.
    problem.cost = @(x) -x'*(A*x);
    problem.grad = @(x) manifold.egrad2rgrad(x, -2*A*x);

    % Numerically check gradient consistency.
    checkgradient(problem);

    % Solve.
    % The trust-regions algorithm requires the Hessian. Since we do not
    % provide it, it will go for a standard approximation of it. The first
    % instruction tells Manopt not to issue a warning when this happens.
    warning('off', 'manopt:getHessian:approx');
    [x xcost info] = trustregions(problem); %#ok<ASGLU>

    % Display some statistics.
    figure;
    semilogy([info.iter], [info.gradnorm], '.-');
    xlabel('Iteration #');
    ylabel('Gradient norm');
    title('Convergence of the trust-regions algorithm on the sphere');

end
1.生成问题数据
2.创建问题
3.定义问题成本函数及其梯度。
4.检查梯度一致性
5.显示一些统计数据,生成图像: