NMF把一个矩阵分解为两个矩阵的乘积,可以用来解决很多问题,例如:用户聚类、item聚类、预测(补全)用户对item的评分、个性化推荐等问题。NMF的过程可以转化为最小化损失函数(即误差函数)的过程,其实整个问题也就是一个最优化的问题。详细实现过程如下:(其中,输入矩阵很多时候会比较稀疏,即很多元素都是缺失项,故数据存储采用的是libsvm的格式,这个类在此忽略)
- package NMF_danji;
- import java.io.File;
- import java.util.ArrayList;
- /**
- * @author 玉心sober: http://weibo.com/karensober
- * @date 2013-05-19
- *
- * */
- public class NMF {
- private Dataset dataset = null;
- private int M = -1; // 行数
- private int V = -1; // 列数
- private int K = -1; // 隐含主题数
- double[][] P;
- double[][] Q;
- public NMF(String datafileName, int topics) {
- File datafile = new File(datafileName);
- if (datafile.exists()) {
- if ((this.dataset = new Dataset(datafile)) == null) {
- System.out.println(datafileName + " is null");
- }
- this.M = this.dataset.size();
- this.V = this.dataset.getFeatureNum();
- this.K = topics;
- } else {
- System.out.println(datafileName + " doesn't exist");
- }
- }
- public void initPQ() {
- P = new double[this.M][this.K];
- Q = new double[this.K][this.V];
- for (int k = 0; k < K; k++) {
- for (int i = 0; i < M; i++) {
- P[i][k] = Math.random();
- }
- for (int j = 0; j < V; j++) {
- Q[k][j] = Math.random();
- }
- }
- }
- // 随机梯度下降,更新参数
- public void updatePQ(double alpha, double beta) {
- for (int i = 0; i < M; i++) {
- ArrayList<Feature> Ri = this.dataset.getDataAt(i).getAllFeature();
- for (Feature Rij : Ri) {
- // eij=Rij.weight-PQ for updating P and Q
- double PQ = 0;
- for (int k = 0; k < K; k++) {
- PQ += P[i][k] * Q[k][Rij.dim];
- }
- double eij = Rij.weight - PQ;
- // update Pik and Qkj
- for (int k = 0; k < K; k++) {
- double oldPik = P[i][k];
- P[i][k] += alpha
- * (2 * eij * Q[k][Rij.dim] - beta * P[i][k]);
- Q[k][Rij.dim] += alpha
- * (2 * eij * oldPik - beta * Q[k][Rij.dim]);
- }
- }
- }
- }
- // 每步迭代后计算SSE
- public double getSSE(double beta) {
- double sse = 0;
- for (int i = 0; i < M; i++) {
- ArrayList<Feature> Ri = this.dataset.getDataAt(i).getAllFeature();
- for (Feature Rij : Ri) {
- double PQ = 0;
- for (int k = 0; k < K; k++) {
- PQ += P[i][k] * Q[k][Rij.dim];
- }
- sse += Math.pow((Rij.weight - PQ), 2);
- }
- }
- for (int i = 0; i < M; i++) {
- for (int k = 0; k < K; k++) {
- sse += ((beta / 2) * (Math.pow(P[i][k], 2)));
- }
- }
- for (int i = 0; i < V; i++) {
- for (int k = 0; k < K; k++) {
- sse += ((beta / 2) * (Math.pow(Q[k][i], 2)));
- }
- }
- return sse;
- }
- // 采用随机梯度下降方法迭代求解参数,即求解最终分解后的矩阵
- public boolean doNMF(int iters, double alpha, double beta) {
- for (int step = 0; step < iters; step++) {
- updatePQ(alpha, beta);
- double sse = getSSE(beta);
- if (step % 100 == 0)
- System.out.println("step " + step + " SSE = " + sse);
- }
- return true;
- }
- public void printMatrix() {
- System.out.println("===========原始矩阵==============");
- for (int i = 0; i < this.dataset.size(); i++) {
- for (Feature feature : this.dataset.getDataAt(i).getAllFeature()) {
- System.out.print(feature.dim + ":" + feature.weight + " ");
- }
- System.out.println();
- }
- }
- public void printFacMatrxi() {
- System.out.println("===========分解矩阵==============");
- for (int i = 0; i < P.length; i++) {
- for (int j = 0; j < Q[0].length; j++) {
- double cell = 0;
- for (int k = 0; k < K; k++) {
- cell += P[i][k] * Q[k][j];
- }
- System.out.print(baoliu(cell, 3) + " ");
- }
- System.out.println();
- }
- }
- // 为double类型变量保留有效数字
- public static double baoliu(double d, int n) {
- double p = Math.pow(10, n);
- return Math.round(d * p) / p;
- }
- public static void main(String[] args) {
- double alpha = 0.002;
- double beta = 0.02;
- NMF nmf = new NMF("D:\\myEclipse\\graphModel\\data\\nmfinput.txt", 10);
- nmf.initPQ();
- nmf.doNMF(3000, alpha, beta);
- // 输出原始矩阵
- nmf.printMatrix();
- // 输出分解后矩阵
- nmf.printFacMatrxi();
- }
- }
package NMF_danji;
import java.io.File;
import java.util.ArrayList;
/**
* @author 玉心sober: http://weibo.com/karensober
* @date 2013-05-19
*
* */
public class NMF {
private Dataset dataset = null;
private int M = -1; // 行数
private int V = -1; // 列数
private int K = -1; // 隐含主题数
double[][] P;
double[][] Q;
public NMF(String datafileName, int topics) {
File datafile = new File(datafileName);
if (datafile.exists()) {
if ((this.dataset = new Dataset(datafile)) == null) {
System.out.println(datafileName + " is null");
}
this.M = this.dataset.size();
this.V = this.dataset.getFeatureNum();
this.K = topics;
} else {
System.out.println(datafileName + " doesn't exist");
}
}
public void initPQ() {
P = new double[this.M][this.K];
Q = new double[this.K][this.V];
for (int k = 0; k < K; k++) {
for (int i = 0; i < M; i++) {
P[i][k] = Math.random();
}
for (int j = 0; j < V; j++) {
Q[k][j] = Math.random();
}
}
}
// 随机梯度下降,更新参数
public void updatePQ(double alpha, double beta) {
for (int i = 0; i < M; i++) {
ArrayList<Feature> Ri = this.dataset.getDataAt(i).getAllFeature();
for (Feature Rij : Ri) {
// eij=Rij.weight-PQ for updating P and Q
double PQ = 0;
for (int k = 0; k < K; k++) {
PQ += P[i][k] * Q[k][Rij.dim];
}
double eij = Rij.weight - PQ;
// update Pik and Qkj
for (int k = 0; k < K; k++) {
double oldPik = P[i][k];
P[i][k] += alpha
* (2 * eij * Q[k][Rij.dim] - beta * P[i][k]);
Q[k][Rij.dim] += alpha
* (2 * eij * oldPik - beta * Q[k][Rij.dim]);
}
}
}
}
// 每步迭代后计算SSE
public double getSSE(double beta) {
double sse = 0;
for (int i = 0; i < M; i++) {
ArrayList<Feature> Ri = this.dataset.getDataAt(i).getAllFeature();
for (Feature Rij : Ri) {
double PQ = 0;
for (int k = 0; k < K; k++) {
PQ += P[i][k] * Q[k][Rij.dim];
}
sse += Math.pow((Rij.weight - PQ), 2);
}
}
for (int i = 0; i < M; i++) {
for (int k = 0; k < K; k++) {
sse += ((beta / 2) * (Math.pow(P[i][k], 2)));
}
}
for (int i = 0; i < V; i++) {
for (int k = 0; k < K; k++) {
sse += ((beta / 2) * (Math.pow(Q[k][i], 2)));
}
}
return sse;
}
// 采用随机梯度下降方法迭代求解参数,即求解最终分解后的矩阵
public boolean doNMF(int iters, double alpha, double beta) {
for (int step = 0; step < iters; step++) {
updatePQ(alpha, beta);
double sse = getSSE(beta);
if (step % 100 == 0)
System.out.println("step " + step + " SSE = " + sse);
}
return true;
}
public void printMatrix() {
System.out.println("===========原始矩阵==============");
for (int i = 0; i < this.dataset.size(); i++) {
for (Feature feature : this.dataset.getDataAt(i).getAllFeature()) {
System.out.print(feature.dim + ":" + feature.weight + " ");
}
System.out.println();
}
}
public void printFacMatrxi() {
System.out.println("===========分解矩阵==============");
for (int i = 0; i < P.length; i++) {
for (int j = 0; j < Q[0].length; j++) {
double cell = 0;
for (int k = 0; k < K; k++) {
cell += P[i][k] * Q[k][j];
}
System.out.print(baoliu(cell, 3) + " ");
}
System.out.println();
}
}
// 为double类型变量保留有效数字
public static double baoliu(double d, int n) {
double p = Math.pow(10, n);
return Math.round(d * p) / p;
}
public static void main(String[] args) {
double alpha = 0.002;
double beta = 0.02;
NMF nmf = new NMF("D:\\myEclipse\\graphModel\\data\\nmfinput.txt", 10);
nmf.initPQ();
nmf.doNMF(3000, alpha, beta);
// 输出原始矩阵
nmf.printMatrix();
// 输出分解后矩阵
nmf.printFacMatrxi();
}
}
结果:
...
step 2900 SSE = 0.5878774074369989
===========原始矩阵==============
0:9.0 1:2.0 2:1.0 3:1.0 4:1.0
0:8.0 1:3.0 2:2.0 3:1.0
0:3.0 3:1.0 4:2.0 5:8.0
1:1.0 3:2.0 4:4.0 5:7.0
0:2.0 1:1.0 2:1.0 4:1.0 5:3.0
===========分解矩阵==============
8.959 2.007 1.007 0.996 1.007 6.293
7.981 2.972 1.989 1.005 2.046 7.076
3.01 1.601 1.773 1.003 2.005 7.968
4.821 1.009 2.209 1.984 3.968 6.988
2.0 0.991 0.984 0.51 1.0 2.994