MapReduce Program Implementation
- Mean.java: this MapReduce job computes the average rating of every movie.
- Regular.java: this MapReduce job mean-centers the data, i.e. it subtracts the corresponding movie's average rating from the rating in each record.
- ExtractTestData.java: this MapReduce job extracts 119 records from the dataset to serve as the test set.
- MovieSimilar.java: this MapReduce job computes the similarity between a given movie and every other movie, using the cosine of their rating vectors as the similarity measure (see the formulas after this list).
- MostSimilarMovies.java: this MapReduce job selects the 20 records with the highest movie similarity.
- PredictMovieRating.java: this MapReduce job predicts a user's rating of a movie from those 20 highest-similarity records.
- CalculateDifference.java: this MapReduce job computes the difference between the actual and the predicted rating.
- CalculateRMSE.java: this MapReduce job computes the root-mean-square error (RMSE) of the predictions (see the formulas after this list).
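For reference, the two quantities these jobs compute are standard. Since Regular.java mean-centers the ratings first, the cosine used in MovieSimilar.java amounts to the adjusted cosine similarity, and CalculateRMSE.java computes the usual RMSE over the $N = 119$ test records:

$$\mathrm{sim}(i,j)=\frac{\sum_{u} r_{u,i}\,r_{u,j}}{\sqrt{\sum_{u} r_{u,i}^{2}}\,\sqrt{\sum_{u} r_{u,j}^{2}}},\qquad \mathrm{RMSE}=\sqrt{\frac{1}{N}\sum_{k=1}^{N}\bigl(\hat{r}_{k}-r_{k}\bigr)^{2}}$$

where $r_{u,i}$ is user $u$'s mean-centered rating of movie $i$, and $\hat{r}_{k}$ and $r_{k}$ are the predicted and actual ratings of the $k$-th test record.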
1)Mean.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * This MapReduce job computes the average rating of every movie.
 */
public class Mean {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//Skip the header line
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set(dataset[2]);
//key:movieId,value:rating
context.write(mkey, mvalue);
}
}
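//Note: FirstReducer cannot double as a combiner here, because averaging is
//not associative; a combiner would have to emit (sum, count) pairs instead.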
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
Text mean=new Text();
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
float rate = 0.0f;
int i=0;
for(Text text : values){
try{
//Accumulate all ratings for this movie
rate += Float.parseFloat(text.toString());
}catch(NumberFormatException e){
//Skip a rating that fails to parse instead of abandoning the whole movie
continue;
}
//Count how many ratings this movie has
i++;
}
mean.set((rate/i)+"");
//key:movieId,value:average rating of the movie
context.write(key,mean);
}
}
}
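The listings in this section contain only the Mapper and Reducer classes. As a reference, a minimal driver for Mean.java could look like the sketch below; the class name MeanDriver and the use of args[0]/args[1] for the input and output paths are assumptions, not part of the original code:

package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
 * Hypothetical driver for the Mean job (a sketch, not the original code).
 */
public class MeanDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "movie mean rating");
        job.setJarByClass(Mean.class);
        job.setMapperClass(Mean.FirstMapper.class);
        job.setReducerClass(Mean.FirstReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //args[0]: ratings CSV file, args[1]: output directory (assumed)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The other jobs can be wired up the same way, swapping in their Mapper and Reducer classes and paths.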
2)Regular.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/*
 * This MapReduce job mean-centers the data: it subtracts the corresponding
 * movie's average rating from the rating in every record.
 */
public class Regular {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
Text movieId = new Text();
Text ratingMean = new Text();
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
InputSplit inputSplit = context.getInputSplit();
//Get the name of the input file currently being read
String strname = ((FileSplit) inputSplit).getPath().getName();
//If the file name is part-r-00000, the input being read is the Mean job's movie-average file
if("part-r-00000".equals(strname)){
StringTokenizer token = new StringTokenizer(value.toString());
//value: movieId and the movie's average rating, separated by a tab
if(token.hasMoreElements()){
movieId.set(token.nextToken());
if(token.hasMoreElements()){
ratingMean.set("#"+token.nextToken());
}
}
//key:movieId,value:#rating (the average, tagged with '#')
context.write(movieId, (ratingMean));
//If the file name is not part-r-00000, the input being read is the raw ratings dataset
}else{
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
//Skip the header line
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set((dataset[0]+","+dataset[2]));
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}
}
}
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
float mean = 0.0f;
float rating = 0.0f;
//For each key (movieId), the Iterable holds every userId,rating pair for that movie plus the movie's average rating (tagged with '#')
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
List<String> list = new ArrayList<String>();
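//Note: buffering every (userId, rating) pair can use a lot of memory for
//popular movies; the '#' tag plus this buffer stands in for a secondary sort
//that would guarantee the average arrives first in the Iterable.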
for(Text text : values){
//The movie's average rating is the value that starts with '#'
if(text.toString().startsWith("#")){
mean = Float.parseFloat(text.toString().substring(1));
continue;
}
//Buffer every userId,rating pair belonging to this movieId
list.add(text.toString());
}
//For each buffered element (userId,rating), compute the mean-centered rating as rating - mean
for(String str : list){
rating = Float.parseFloat(str.substring(str.indexOf(",")+1))-mean;
str = ","+str.substring(0, str.indexOf(","))+","+rating;
//key:movieId,value:,userId,rating (mean-centered; the leading comma makes later splitting easier)
context.write(key, new Text(str));
}
}
}
}
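Regular.java implements a reduce-side join: the raw ratings file and the output of the Mean job are both fed to the same mapper, which tells them apart by file name. A minimal sketch of the corresponding job wiring, reusing the MeanDriver pattern above and showing only the lines that differ (all paths are assumed, not from the original code):

//Inside a hypothetical RegularDriver, after the usual Job setup for Regular:
//both inputs go to the same FirstMapper, which branches on the file name.
FileInputFormat.addInputPath(job, new Path("ratings.csv")); //raw dataset (assumed path)
FileInputFormat.addInputPath(job, new Path("mean/part-r-00000")); //Mean job output (assumed path)
FileOutputFormat.setOutputPath(job, new Path("regular")); //mean-centered output (assumed path)

Note that dispatching on the exact name part-r-00000 breaks if the Mean job ever runs with more than one reducer; checking a path prefix, or using org.apache.hadoop.mapreduce.lib.input.MultipleInputs with two mapper classes, would be more robust.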
3)ExtractTestData.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * This MapReduce job extracts 119 records from the dataset to serve as the test set.
 */
public class ExtractTestData {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
static int count = 1;
static int location = 1;
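//Note: these static counters are per map task (per JVM), so the 119-record
//sample is only deterministic when the input is processed by a single map task.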
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//Skip the header line
if("movieId".equals(dataset[1]))
return;
//If this record's userId equals location and at most 119 records have been taken, emit the record to the reducer
if(dataset[0].trim().equals(location+"")&&count<=119){
mkey.set(dataset[1]);
mvalue.set(dataset[0]+","+dataset[2]);
//Advance location so successive test records come from different users (userIds 1, 2, 4, 7, 11, ...)
location = location+count;
//Increment the record count
count++;
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}
}
}
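//Note: the schedule above samples one record each from userIds 1, 2, 4, 7, 11, ...
//(location grows by the current count after every hit), stopping after 119 records.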
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
//Emit every element (userId,rating) in the Iterable
for(Text text: values){
text.set(","+text.toString());
//key:movieId,value:,userId,rating (the leading comma makes later splitting easier)
context.write(key,text);
}
}
}
}
/*
//This alternative reducer draws a more evenly distributed test set, at the cost of more computation
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
Text mkey = new Text();
Text mvalue = new Text();
int count = 1;
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {