M-distance推荐
算法介绍
数据说明
前几项数据如下:
0,0,5
0,1,3
0,2,4
0,3,3
0,4,3
0,5,5
0,6,4
0,7,1
0,8,5
0,9,3
每行的格式为“用户编号,电影编号,评分”,例如0,0,5表示0号用户给0号电影的评分为5。
预测目标
利用数据集中的信息,预测用户对暂未观看过的电影的评分情况,从而对用户推荐电影。
算法原理
近似于KNN,都是得到若干个“邻居”,计算“邻居”的评分平均值得到自己的评分,区别在于获得“邻居”的策略不同。
这里有两种邻居,一种是电影邻居,即找出一些平均得分与目标电影相近的电影,将用户对这些电影的评分的平均值作为预测结果;另一种就是用户邻居。本博客以电影邻居实现为例。
假设这里要预测用户i对电影j的评分k,那么电影邻居z需要满足如下条件:
- 用户i同样观看过这部电影z并作出评分。
- 这部电影z的平均得分,与电影j的平均得分相近。
找到满足这些条件的电影邻居后,将用户i对这些电影邻居的评分取平均值,作为预测结果k;如果一个邻居也没有,则使用默认评分3分。
leave-one-out测试方法设计
使用留一法对算法进行测试,即对于数据量为N的数据而言,每次选取一条数据作为测试集,其他N-1条数据作为训练集,这个过程重复N次。
算法流程
变量准备
这里的变量都已经在注释上重新作了说明。
// Default rating predicted when a rating has no neighbors
public static final double DEFAULT_RATING = 3.0;
// Number of users
private int numUsers;
// Number of items (movies)
private int numItems;
// Number of ratings
private int numRatings;
// Leave-one-out predictions, one entry per rating
private double[] predictions;
// Ratings stored as compressed triples, one row per rating: (user, item, rating)
private int[][] compressedRatingMatrix;
// Number of items rated by each user
private int[] userDegrees;
// Average rating given by each user over all items the user rated
private double[] userAverageRatings;
// Number of times each item was rated
private int[] itemDegrees;
// Average rating received by each item
private double[] itemAverageRatings;
// Index of each user's first row in compressedRatingMatrix
private int[] userStartingIndices;
// Number of ratings for which no neighbor was found
private int numNonNeighbors;
// Radius: maximum difference between average ratings for two items to be neighbors
private double radius;
读取数据
在构造函数中读取数据,并计算每个用户看过的电影数、每个电影被评分的次数、每个用户给出的评分总分、每个电影获得的评分总分、每个用户的评分平均分、每个电影的评分平均分等。
/**
 *************************
 * Reads a rating file and computes, for every user and every item, the number
 * of ratings and the average rating.
 *
 * Each non-blank line must hold three integers "user,item,rating". Ratings of
 * one user must be stored contiguously and users must appear in ascending id
 * order, because each user's range of rows is derived from the per-user
 * rating counts.
 *
 * @param paraFilename
 *            path of the rating file
 * @param paraNumUsers
 *            number of users
 * @param paraNumItems
 *            number of items
 * @param paraNumRatings
 *            maximum number of ratings to read; if the file holds fewer
 *            ratings, only the ratings actually read are used
 * @throws java.io.FileNotFoundException
 *             if the file does not exist
 * @throws IllegalArgumentException
 *             if the file holds more than paraNumRatings ratings
 * @throws Exception
 *             on any other I/O failure
 *************************
 */
public MDistance(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
    numItems = paraNumItems;
    numUsers = paraNumUsers;
    userDegrees = new int[numUsers];
    userStartingIndices = new int[numUsers + 1];
    userAverageRatings = new double[numUsers];
    itemDegrees = new int[numItems];
    itemAverageRatings = new double[numItems];
    compressedRatingMatrix = new int[paraNumRatings][3];

    System.out.println("Reading " + paraFilename);
    File tempFile = new File(paraFilename);
    if (!tempFile.exists()) {
        // Report the problem to the caller instead of terminating the JVM.
        throw new java.io.FileNotFoundException("File " + paraFilename + " does not exist.");
    } // Of if

    // Read all triples; try-with-resources closes the reader even when parsing fails.
    int tempIndex = 0;
    try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
        String tempString;
        while ((tempString = tempBufReader.readLine()) != null) {
            if (tempString.trim().isEmpty()) {
                continue; // Tolerate blank lines, e.g. at the end of the file.
            } // Of if
            if (tempIndex >= paraNumRatings) {
                throw new IllegalArgumentException(
                        "File " + paraFilename + " holds more than " + paraNumRatings + " ratings.");
            } // Of if

            // Each line has three values: user, item, rating.
            String[] tempStrArray = tempString.split(",");
            int[] tempTriple = compressedRatingMatrix[tempIndex];
            for (int k = 0; k < 3; k++) {
                tempTriple[k] = Integer.parseInt(tempStrArray[k].trim());
            } // Of for k
            userDegrees[tempTriple[0]]++;
            itemDegrees[tempTriple[1]]++;
            tempIndex++;
        } // Of while
    } // Of try

    // Only count the ratings actually read, so no all-zero phantom rows are evaluated.
    numRatings = tempIndex;
    predictions = new double[numRatings];

    // Prefix sums of the degrees give every user's range of rows. Unlike recording the
    // first occurrence, this stays correct when some user has no ratings at all.
    userStartingIndices[0] = 0;
    for (int i = 0; i < numUsers; i++) {
        userStartingIndices[i + 1] = userStartingIndices[i] + userDegrees[i];
    } // Of for i

    // Total score given by each user and received by each item.
    double[] tempUserTotalScore = new double[numUsers];
    double[] tempItemTotalScore = new double[numItems];
    for (int i = 0; i < numRatings; i++) {
        tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
        tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
    } // Of for i

    // Average rating per user (NaN for a user without ratings, since 0.0 / 0 is NaN).
    for (int i = 0; i < numUsers; i++) {
        userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
    } // Of for i

    // Average rating per item (NaN for an item without ratings).
    for (int i = 0; i < numItems; i++) {
        itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
    } // Of for i
}// Of the first constructor
leave-one-out方法测试
对N条评分分别进行留一法测试,并将预测结果保存至predictions数组中。
/**
 * Leave-one-out prediction. Every rating is predicted from the other ratings
 * of the same user, and the results are stored in predictions.
 */
public void leaveOneOutPrediction() {
    System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);
    numNonNeighbors = 0;

    for (int ratingIndex = 0; ratingIndex < numRatings; ratingIndex++) {
        int[] target = compressedRatingMatrix[ratingIndex];
        int user = target[0];
        int item = target[1];
        int rating = target[2];

        // Average rating of the target item with the current rating removed.
        // For an item with a single rating this is 0.0 / 0 = NaN, so the radius
        // test below fails for every candidate and the default rating is used.
        double leftOutAverage = (itemAverageRatings[item] * itemDegrees[item] - rating)
                / (itemDegrees[item] - 1);

        // Scan only the items rated by this user; a neighbor is another item whose
        // average rating lies within the radius of the target item's average.
        double neighborRatingSum = 0;
        int neighborCount = 0;
        int rangeEnd = userStartingIndices[user + 1];
        for (int row = userStartingIndices[user]; row < rangeEnd; row++) {
            int candidate = compressedRatingMatrix[row][1];
            if (candidate != item && Math.abs(leftOutAverage - itemAverageRatings[candidate]) < radius) {
                neighborRatingSum += compressedRatingMatrix[row][2];
                neighborCount++;
            }
        }

        // Predict the mean of the user's ratings of the neighbors, or the default.
        if (neighborCount == 0) {
            predictions[ratingIndex] = DEFAULT_RATING;
            numNonNeighbors++;
        } else {
            predictions[ratingIndex] = neighborRatingSum / neighborCount;
        }
    }
}// Of leaveOneOutPrediction
- 首先开启循环,遍历所有的评分(留一法)。
- 在该情况下,保存此时的用户信息、电影信息。
- 对该用户看过的其他电影进行遍历,将每部电影的平均分与目标电影(去掉当前评分后)的平均分比较,差值小于半径的电影即为“邻居”。
- 将该用户对这些邻居的评分取平均值,得到该情况下的预测结果,并将结果存入predictions中。
使用MAE(平均绝对误差)、RMSE(均方根误差,代码中的方法名写作computeRSME)两种误差指标进行分析
/**
 *************************
 * Computes the mean absolute error (MAE) between the leave-one-out
 * predictions and the actual ratings.
 *
 * @return the sum of absolute deviations divided by the number of predictions
 * @author Fan Min
 *************************
 */
public double computeMAE() throws Exception {
    final int count = predictions.length;
    double absoluteErrorSum = 0;
    int index = 0;
    while (index < count) {
        double deviation = predictions[index] - compressedRatingMatrix[index][2];
        absoluteErrorSum += Math.abs(deviation);
        index++;
    }
    return absoluteErrorSum / count;
}// Of computeMAE
/**
 *************************
 * Computes the root mean square error between the leave-one-out predictions
 * and the actual ratings.
 *
 * @return the square root of the mean squared deviation
 * @author Fan Min
 *************************
 */
public double computeRSME() throws Exception {
    final int count = predictions.length;
    double squaredErrorSum = 0;
    int index = 0;
    while (index < count) {
        double deviation = predictions[index] - compressedRatingMatrix[index][2];
        squaredErrorSum += deviation * deviation;
        index++;
    }
    double meanSquaredError = squaredErrorSum / count;
    return Math.sqrt(meanSquaredError);
}// Of computeRSME
多次设置不同的误差半径,比较分析预测结果
// Try the radii 0.2, 0.3, 0.4 and 0.5. An integer counter is used because repeatedly
// adding 0.1 to a double accumulates rounding error (0.2 + 0.1 is 0.30000000000000004),
// which distorts the radius and makes the number of iterations depend on rounding.
for (int tempStep = 2; tempStep < 6; tempStep++) {
    double tempRadius = tempStep / 10.0;
    tempRecommender.setRadius(tempRadius);
    tempRecommender.leaveOneOutPrediction();
    double tempMAE = tempRecommender.computeMAE();
    double tempRSME = tempRecommender.computeRSME();
    System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
            + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
} // Of for tempStep
详细注释
package knn_nb;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
/**
 * M-distance based recommender evaluated with leave-one-out.
 *
 * A rating of a user for an item is predicted as the mean of the same user's
 * ratings of "neighbor" items, i.e. other items rated by that user whose
 * average rating lies within a radius of the target item's average rating.
 */
public class MDistance {
    /** Rating predicted when a rating has no neighbors. */
    public static final double DEFAULT_RATING = 3.0;

    /** Number of users. */
    private int numUsers;

    /** Number of items (movies). */
    private int numItems;

    /** Number of ratings actually read. */
    private int numRatings;

    /** Leave-one-out predictions, one entry per rating. */
    private double[] predictions;

    /** Ratings stored as compressed triples, one row per rating: (user, item, rating). */
    private int[][] compressedRatingMatrix;

    /** Number of items rated by each user. */
    private int[] userDegrees;

    /** Average rating given by each user. */
    private double[] userAverageRatings;

    /** Number of times each item was rated. */
    private int[] itemDegrees;

    /** Average rating received by each item. */
    private double[] itemAverageRatings;

    /** Rows of user u are userStartingIndices[u] (inclusive) to userStartingIndices[u + 1] (exclusive). */
    private int[] userStartingIndices;

    /** Number of ratings for which no neighbor was found in the last prediction run. */
    private int numNonNeighbors;

    /** Maximum difference between average ratings for two items to be neighbors. */
    private double radius;

    /**
     *************************
     * Reads a rating file and computes, for every user and every item, the
     * number of ratings and the average rating.
     *
     * Each non-blank line must hold three integers "user,item,rating". Ratings
     * of one user must be stored contiguously and users must appear in
     * ascending id order, because each user's range of rows is derived from
     * the per-user rating counts.
     *
     * @param paraFilename
     *            path of the rating file
     * @param paraNumUsers
     *            number of users
     * @param paraNumItems
     *            number of items
     * @param paraNumRatings
     *            maximum number of ratings to read; if the file holds fewer
     *            ratings, only the ratings actually read are used
     * @throws java.io.FileNotFoundException
     *             if the file does not exist
     * @throws IllegalArgumentException
     *             if the file holds more than paraNumRatings ratings
     * @throws Exception
     *             on any other I/O failure
     *************************
     */
    public MDistance(String paraFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        itemAverageRatings = new double[numItems];
        compressedRatingMatrix = new int[paraNumRatings][3];

        System.out.println("Reading " + paraFilename);
        File tempFile = new File(paraFilename);
        if (!tempFile.exists()) {
            // Report the problem to the caller instead of terminating the JVM.
            throw new java.io.FileNotFoundException("File " + paraFilename + " does not exist.");
        } // Of if

        // Read all triples; try-with-resources closes the reader even when parsing fails.
        int tempIndex = 0;
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            while ((tempString = tempBufReader.readLine()) != null) {
                if (tempString.trim().isEmpty()) {
                    continue; // Tolerate blank lines, e.g. at the end of the file.
                } // Of if
                if (tempIndex >= paraNumRatings) {
                    throw new IllegalArgumentException(
                            "File " + paraFilename + " holds more than " + paraNumRatings + " ratings.");
                } // Of if

                // Each line has three values: user, item, rating.
                String[] tempStrArray = tempString.split(",");
                int[] tempTriple = compressedRatingMatrix[tempIndex];
                for (int k = 0; k < 3; k++) {
                    tempTriple[k] = Integer.parseInt(tempStrArray[k].trim());
                } // Of for k
                userDegrees[tempTriple[0]]++;
                itemDegrees[tempTriple[1]]++;
                tempIndex++;
            } // Of while
        } // Of try

        // Only count the ratings actually read, so no all-zero phantom rows are evaluated.
        numRatings = tempIndex;
        predictions = new double[numRatings];

        // Prefix sums of the degrees give every user's range of rows. Unlike recording
        // the first occurrence, this stays correct when some user has no ratings at all.
        userStartingIndices[0] = 0;
        for (int i = 0; i < numUsers; i++) {
            userStartingIndices[i + 1] = userStartingIndices[i] + userDegrees[i];
        } // Of for i

        // Total score given by each user and received by each item.
        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        for (int i = 0; i < numRatings; i++) {
            tempUserTotalScore[compressedRatingMatrix[i][0]] += compressedRatingMatrix[i][2];
            tempItemTotalScore[compressedRatingMatrix[i][1]] += compressedRatingMatrix[i][2];
        } // Of for i

        // Average rating per user (NaN for a user without ratings, since 0.0 / 0 is NaN).
        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        } // Of for i

        // Average rating per item (NaN for an item without ratings).
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        } // Of for i
    }// Of the first constructor

    /**
     *************************
     * Sets the radius used to decide whether two items are neighbors.
     *
     * @param paraRadius
     *            The given radius. A value that is not positive is replaced
     *            by 0.1.
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        } // Of if
    }// Of setRadius

    /**
     *************************
     * Leave-one-out prediction. Every rating is predicted from the other
     * ratings of the same user; the predicted values are stored in
     * predictions and the number of ratings without neighbors in
     * numNonNeighbors.
     *************************
     */
    public void leaveOneOutPrediction() {
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);
        numNonNeighbors = 0;

        for (int i = 0; i < numRatings; i++) {
            int tempUser = compressedRatingMatrix[i][0];
            int tempItem = compressedRatingMatrix[i][1];
            int tempRating = compressedRatingMatrix[i][2];

            int tempNeighbors = 0;
            double tempTotal = 0;

            // An item with a single rating has no average once that rating is left out,
            // so it cannot have neighbors and the default rating below applies.
            if (itemDegrees[tempItem] > 1) {
                // Step 1. Recompute the average rating of the current item without
                // the rating being predicted.
                double tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem]
                        - tempRating) / (itemDegrees[tempItem] - 1);

                // Step 2. Collect the neighbors among the items rated by this user
                // (not among all items) and sum the user's ratings of them.
                for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                    int tempComparedItem = compressedRatingMatrix[j][1];
                    if (tempItem == tempComparedItem) {
                        continue; // Ignore itself.
                    } // Of if

                    if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                        tempTotal += compressedRatingMatrix[j][2];
                        tempNeighbors++;
                    } // Of if
                } // Of for j
            } // Of if

            // Step 3. Predict as the average of the user's ratings of the neighbors.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            } // Of if
        } // Of for i
    }// Of leaveOneOutPrediction

    /**
     *************************
     * Computes the mean absolute error (MAE) between the leave-one-out
     * predictions and the actual ratings.
     *
     * @return the sum of absolute deviations divided by the number of
     *         predictions
     * @author Fan Min
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        return tempTotalError / predictions.length;
    }// Of computeMAE

    /**
     *************************
     * Computes the root mean square error between the leave-one-out
     * predictions and the actual ratings.
     *
     * @return the square root of the mean squared deviation
     * @author Fan Min
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            double tempDeviation = predictions[i] - compressedRatingMatrix[i][2];
            tempTotalError += tempDeviation * tempDeviation;
        } // Of for i

        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }// Of computeRSME

    /**
     *************************
     * The entrance of the program.
     *
     * @param args
     *            Optional. args[0], when given, is the path of the rating
     *            file; otherwise the built-in default path is used.
     *************************
     */
    public static void main(String[] args) {
        String tempFilename = "C:\\Users\\hp\\Desktop\\deepLearning\\src\\main\\java\\resources\\movielens-943u1682m.txt";
        if (args != null && args.length > 0) {
            tempFilename = args[0];
        } // Of if

        try {
            MDistance tempRecommender = new MDistance(tempFilename, 943, 1682, 100000);

            // Try the radii 0.2, 0.3, 0.4 and 0.5. An integer counter is used because
            // repeatedly adding 0.1 to a double accumulates rounding error
            // (0.2 + 0.1 is 0.30000000000000004).
            for (int tempStep = 2; tempStep < 6; tempStep++) {
                double tempRadius = tempStep / 10.0;
                tempRecommender.setRadius(tempRadius);
                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();
                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            } // Of for tempStep
        } catch (Exception ee) {
            System.out.println(ee);
        } // Of try
    }// Of main
}// Of class MDistance