MapReduce Program Implementation
- Mean.java: this MapReduce job computes the average rating of every movie.
- Regular.java: this MapReduce job mean-centers the data, i.e. it subtracts the corresponding movie's average rating from the rating in each record.
- ExtractTestData.java: this MapReduce job extracts 119 records from the dataset to serve as the test set.
- MovieSimilar.java: this MapReduce job computes the similarity between a given movie and every other movie, using the cosine of their rating vectors as the similarity measure (see the formulas after this list).
- MostSimilarMovies.java: this MapReduce job selects the 20 records with the highest movie similarity.
- PredictMovieRating.java: this MapReduce job predicts a user's rating of a movie from those 20 highest-similarity records.
- CalculateDifference.java: this MapReduce job computes the difference between the actual and the predicted rating.
- CalculateRMSE.java: this MapReduce job computes the root-mean-square error (RMSE) of the predictions (see the formulas after this list).
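For reference, the two quantities these jobs compute are standard. Since Regular.java mean-centers the ratings first, the cosine used in MovieSimilar.java amounts to the adjusted cosine similarity, and CalculateRMSE.java computes the usual RMSE over the $N = 119$ test records:

$$\mathrm{sim}(i,j)=\frac{\sum_{u} r_{u,i}\,r_{u,j}}{\sqrt{\sum_{u} r_{u,i}^{2}}\,\sqrt{\sum_{u} r_{u,j}^{2}}},\qquad \mathrm{RMSE}=\sqrt{\frac{1}{N}\sum_{k=1}^{N}\bigl(\hat{r}_{k}-r_{k}\bigr)^{2}}$$

where $r_{u,i}$ is user $u$'s mean-centered rating of movie $i$, and $\hat{r}_{k}$ and $r_{k}$ are the predicted and actual ratings of the $k$-th test record.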
1)Mean.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * This MapReduce job computes the average rating of every movie.
 */
public class Mean {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//Skip the header line
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set(dataset[2]);
//key:movieId,value:rating
context.write(mkey, mvalue);
}
}
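//Note: FirstReducer cannot double as a combiner here, because averaging is
//not associative; a combiner would have to emit (sum, count) pairs instead.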
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
Text mean=new Text();
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
float rate = 0.0f;
int i=0;
for(Text text : values){
try{
//Accumulate all ratings for this movie
rate += Float.parseFloat(text.toString());
}catch(NumberFormatException e){
//Skip a rating that fails to parse instead of abandoning the whole movie
continue;
}
//Count how many ratings this movie has
i++;
}
mean.set((rate/i)+"");
//key:movieId,value:average rating of the movie
context.write(key,mean);
}
}
}
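The listings in this section contain only the Mapper and Reducer classes. As a reference, a minimal driver for Mean.java could look like the sketch below; the class name MeanDriver and the use of args[0]/args[1] for the input and output paths are assumptions, not part of the original code:

package mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/*
 * Hypothetical driver for the Mean job (a sketch, not the original code).
 */
public class MeanDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "movie mean rating");
        job.setJarByClass(Mean.class);
        job.setMapperClass(Mean.FirstMapper.class);
        job.setReducerClass(Mean.FirstReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        //args[0]: ratings CSV file, args[1]: output directory (assumed)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The other jobs can be wired up the same way, swapping in their Mapper and Reducer classes and paths.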
2)Regular.java
package mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/*
 * This MapReduce job mean-centers the data: it subtracts the corresponding
 * movie's average rating from the rating in every record.
 */
public class Regular {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
Text movieId = new Text();
Text ratingMean = new Text();
Text mkey = new Text();
Text mvalue = new Text();
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
InputSplit inputSplit = context.getInputSplit();
//Get the name of the input file currently being read
String strname = ((FileSplit) inputSplit).getPath().getName();
//If the file name is part-r-00000, the input being read is the Mean job's movie-average file
if("part-r-00000".equals(strname)){
StringTokenizer token = new StringTokenizer(value.toString());
//value: movieId and the movie's average rating, separated by a tab
if(token.hasMoreElements()){
movieId.set(token.nextToken());
if(token.hasMoreElements()){
ratingMean.set("#"+token.nextToken());
}
}
//key:movieId,value:#rating (the average, tagged with '#')
context.write(movieId, (ratingMean));
//If the file name is not part-r-00000, the input being read is the raw ratings dataset
}else{
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
//Skip the header line
if("movieId".equals(dataset[1]))
return;
mkey.set(dataset[1]);
mvalue.set((dataset[0]+","+dataset[2]));
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}
}
}
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
float mean = 0.0f;
float rating = 0.0f;
//For each key (movieId), the Iterable holds every userId,rating pair for that movie plus the movie's average rating (tagged with '#')
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
List<String> list = new ArrayList<String>();
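//Note: buffering every (userId, rating) pair can use a lot of memory for
//popular movies; the '#' tag plus this buffer stands in for a secondary sort
//that would guarantee the average arrives first in the Iterable.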
for(Text text : values){
//The movie's average rating is the value that starts with '#'
if(text.toString().startsWith("#")){
mean = Float.parseFloat(text.toString().substring(1));
continue;
}
//Buffer every userId,rating pair belonging to this movieId
list.add(text.toString());
}
//For each buffered element (userId,rating), compute the mean-centered rating as rating - mean
for(String str : list){
rating = Float.parseFloat(str.substring(str.indexOf(",")+1))-mean;
str = ","+str.substring(0, str.indexOf(","))+","+rating;
//key:movieId,value:,userId,rating (mean-centered; the leading comma makes later splitting easier)
context.write(key, new Text(str));
}
}
}
}
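Regular.java implements a reduce-side join: the raw ratings file and the output of the Mean job are both fed to the same mapper, which tells them apart by file name. A minimal sketch of the corresponding job wiring, reusing the MeanDriver pattern above and showing only the lines that differ (all paths are assumed, not from the original code):

//Inside a hypothetical RegularDriver, after the usual Job setup for Regular:
//both inputs go to the same FirstMapper, which branches on the file name.
FileInputFormat.addInputPath(job, new Path("ratings.csv")); //raw dataset (assumed path)
FileInputFormat.addInputPath(job, new Path("mean/part-r-00000")); //Mean job output (assumed path)
FileOutputFormat.setOutputPath(job, new Path("regular")); //mean-centered output (assumed path)

Note that dispatching on the exact name part-r-00000 breaks if the Mean job ever runs with more than one reducer; checking a path prefix, or using org.apache.hadoop.mapreduce.lib.input.MultipleInputs with two mapper classes, would be more robust.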
3)ExtractTestData.java
package mr;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
/*
 * This MapReduce job extracts 119 records from the dataset to serve as the test set.
 */
public class ExtractTestData {
public static class FirstMapper extends Mapper<Object, Text, Text, Text> {
String [] dataset = new String[4];
static int count = 1;
static int location = 1;
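//Note: these static counters are per map task (per JVM), so the 119-record
//sample is only deterministic when the input is processed by a single map task.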
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
//value:1,2,3.5,1112486027
dataset = value.toString().split(",");
Text mkey = new Text();
Text mvalue = new Text();
//Skip the header line
if("movieId".equals(dataset[1]))
return;
//If this record's userId equals location and at most 119 records have been taken, emit the record to the reducer
if(dataset[0].trim().equals(location+"")&&count<=119){
mkey.set(dataset[1]);
mvalue.set(dataset[0]+","+dataset[2]);
//Advance location so successive test records come from different users (userIds 1, 2, 4, 7, 11, ...)
location = location+count;
//Increment the record count
count++;
//key:movieId,value:userId,rating
context.write(mkey, mvalue);
}
}
}
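//Note: the schedule above samples one record each from userIds 1, 2, 4, 7, 11, ...
//(location grows by the current count after every hit), stopping after 119 records.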
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {
//Emit every element (userId,rating) in the Iterable
for(Text text: values){
text.set(","+text.toString());
//key:movieId,value:,userId,rating (the leading comma makes later splitting easier)
context.write(key,text);
}
}
}
}
/*
//This alternative reducer draws a more evenly distributed test set, at the cost of more computation
public static class FirstReducer extends Reducer<Text, Text, Text, Text> {
Text mkey = new Text();
Text mvalue = new Text();
int count = 1;
public void reduce(Text key, Iterable<Text> values,Context context)
throws IOException, InterruptedException {