做文本聚类分析时,采用了 PCA 等方法降维,效果都不好,于是决定采用有监督的学习算法 LDA。在网上找到了一份代码,但看不懂它是如何降维的,于是自己动手改写,代码如下:
package lda;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.math3.linear.EigenDecomposition;
import org.apache.commons.math3.linear.LUDecomposition;
import org.apache.commons.math3.linear.MatrixUtils;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;
import Jama.Matrix;
public class LDA
{
// Instance and static state of the LDA class. Several identifiers are Indonesian;
// most of these fields are populated outside the visible part of the file, so the
// notes on their contents are assumptions to be confirmed.

// "Group mean" (rata tengah) - presumably one row of per-feature means per group; TODO confirm.
private double[][] groupRataTengah;
// "Global covariance" - presumably the pooled covariance matrix; TODO confirm.
private double[][] kovarianGlobal;
// "Probability" - presumably one prior probability per group; TODO confirm.
private double[] probabilitas;
// Distinct group labels, in the order they are first encountered in the label array
// passed to the data constructor.
private ArrayList<Integer> groupList = new ArrayList<Integer>();
// "Result" (hasil). Static, so shared by every LDA instance; usage not visible here.
static int hasil;
// Static values shared by every LDA instance; their meaning is not visible here.
static double f1, f2, f3;
// Raw (untyped) map; key and value types cannot be determined from this declaration.
private HashMap _map = new HashMap();
// Fixed-size array of two vectors - presumably the two leading eigenvectors used
// for projection; TODO confirm.
private RealVector[] _top2vec = new RealVector[2];
/**
 * Creates an LDA instance without any training data.
 * The body is empty, so every field keeps the value given by its declaration.
 */
public LDA()
{
}
/**
*
* @param d 聚类结果数组
* @param g 聚类的类别标识,和前面的d关系一致
* @param p
*/
public LDA(double[][] d, int[] g, boolean p)
{
// memeriksa apakah data dan kelompok array mempunyai ukuran yang sama
if (d.length != g.length)
return;
double[][] data = new double[d.length][d[0].length];// panjang data(i)
// dan fitur(j)
for (int i = 0; i < d.length; i++)
{
for (int j = 0; j < d[i].length; j++)
{
data[i][j] = d[i][j];
}
}
int[] group = new int[g.length];
for (int j = 0; j < g.length; j++)
{
group[j] = g[j];
}
double[] rataTengah;
double[][][] kovarian;
// memisahkan berdasarkan grup atau kelas
for (int i = 0; i < group.length; i++)
{
if (!groupList.contains(group[i]))
{
groupList.add(group[i]);
}
}
// membagi data ke dalam subset
ArrayList<double[]>[] subset = new ArrayList[groupList.size()];
for (int i = 0; i < subset.length; i++)
{
subset[i] = new ArrayList<double[]>();
for