MapReduce之基于内容的电影推荐(完)
接上一篇博文MapReduce之基于内容的电影推荐(二) ,在MapReduce阶段2中,已经获取到了相关参数,接下来做的便是通过计算关联度来计算两个电影的相似度。
MapReduce阶段3
阶段3通过整理阶段2处理好的数据，计算两个向量之间的关联度。
mapper阶段任务
mapper将MapReduce阶段2的数据进行整理,输出类型如下所示:
两个不同电影以及他们的向量
键 | 值 |
---|---|
<movie1,movie2> | <1,10,2,20,2,1,4> |
<movie1,movie3> | <1,10,3,30,3,1,9> |
<movie2,movie3> | <2,20,3,30,6,4,9> |
mapper阶段编码
还是由于作业链之间无法传递自定义类型,所以通过字符串转换成自定义类型
package com.deng.MovieRecommend;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class correlationMapper extends Mapper<LongWritable, Text, Tuple2, Tuple7> {
    public Tuple2<String, String> reduceKey;
    public Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer> reduceValue;

    /**
     * Re-parses one line of stage-2 text output back into the custom tuple
     * types (chained jobs can only exchange text, not custom types).
     *
     * Expected input line format (tab-separated):
     *   Tuple2(movie1,movie2)\tTuple7(v1,v2,v3,v4,v5,v6,v7)
     */
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] inform = value.toString().split("\t");
        reduceKey = toTuple2(inform[0]);
        reduceValue = toTuple7(inform[1]);
        // Propagate IOException/InterruptedException to the framework instead of
        // swallowing them with printStackTrace, which hid task failures and
        // dropped the thread's interrupt status.
        context.write(reduceKey, reduceValue);
    }

    /**
     * Converts a string such as "Tuple2(movie1,movie2)" back into a Tuple2:
     * strips the 7-character "Tuple2(" prefix and the trailing ')'.
     */
    public Tuple2 toTuple2(String s) {
        String[] line = s.split(",");
        String movie1 = line[0].substring(7);
        String movie2 = line[1].substring(0, line[1].length() - 1);
        return new Tuple2(movie1, movie2);
    }

    /**
     * Converts a string such as "Tuple7(1,10,2,20,2,1,4)" back into a Tuple7:
     * strips the 7-character "Tuple7(" prefix and the trailing ')', then
     * parses the seven integer components.
     */
    public Tuple7 toTuple7(String s) {
        String[] line = s.split(",");
        Integer _1 = Integer.parseInt(line[0].substring(7));
        Integer _2 = Integer.parseInt(line[1]);
        Integer _3 = Integer.parseInt(line[2]);
        Integer _4 = Integer.parseInt(line[3]);
        Integer _5 = Integer.parseInt(line[4]);
        Integer _6 = Integer.parseInt(line[5]);
        Integer _7 = Integer.parseInt(line[6].substring(0, line[6].length() - 1));
        return new Tuple7(_1, _2, _3, _4, _5, _6, _7);
    }
}
reducer阶段任务
依据两个不同电影的向量来计算皮尔逊积矩相关系数、杰卡德相似系数和余弦相似系数，进而判断两个电影的相似度。
reducer阶段编码
package com.deng.MovieRecommend;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class correlationReducer extends Reducer<Tuple2<String, String>,
        Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer>,
        Tuple2, Tuple3> {

    /**
     * For one movie pair, aggregates the per-user vector components emitted by
     * stage 2 and computes three similarity measures: Pearson product-moment
     * correlation, Jaccard similarity and cosine similarity.
     *
     * Each Tuple7 value holds, for one user who rated both movies:
     *   (rating1, numRatersOfMovie1, rating2, numRatersOfMovie2,
     *    rating1*rating2, rating1^2, rating2^2)
     */
    public void reduce(Tuple2<String, String> key,
                       Iterable<Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer>> values,
                       Context context) throws IOException, InterruptedException {
        // Accumulators are LOCAL so each key starts from zero. The original used
        // instance fields that were never reset, leaking totals from previously
        // reduced keys into every later pair's coefficients.
        int groupSize = 0;        // number of users who rated both movies
        int dotProduct = 0;       // sum of rating1 * rating2
        int rating1Sum = 0;
        int rating2Sum = 0;
        int rating1NormSq = 0;    // sum of rating1^2
        int rating2NormSq = 0;    // sum of rating2^2
        int maxNumRaters1 = 0;    // total raters of movie1 (max over duplicates)
        int maxNumRaters2 = 0;    // total raters of movie2 (max over duplicates)

        for (Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer> t : values) {
            // The original never incremented groupSize, leaving it 0 and zeroing
            // the Pearson size term and the Jaccard numerator.
            groupSize++;
            // Getter results are routed through String.valueOf/parseInt, matching
            // the original: the text round-trip between jobs does not guarantee
            // Integer instances — TODO confirm against Tuple7's declaration.
            dotProduct += Integer.parseInt(String.valueOf(t.get_5()));
            rating1Sum += Integer.parseInt(String.valueOf(t.get_1()));
            rating2Sum += Integer.parseInt(String.valueOf(t.get_3()));
            rating1NormSq += Integer.parseInt(String.valueOf(t.get_6()));
            rating2NormSq += Integer.parseInt(String.valueOf(t.get_7()));
            maxNumRaters1 = Math.max(maxNumRaters1, Integer.parseInt(String.valueOf(t.get_2())));
            maxNumRaters2 = Math.max(maxNumRaters2, Integer.parseInt(String.valueOf(t.get_4())));
        }

        // Compute and emit exactly ONCE per key, after the full aggregation.
        // The original computed and wrote a partial result inside the loop,
        // emitting one progressively-accumulated record per input value.
        double pearson = calculatePearsonCorrelation(groupSize, dotProduct,
                rating1Sum, rating2Sum, rating1NormSq, rating2NormSq);
        double jaccard = calculateJaccardCorrelation(groupSize, maxNumRaters1, maxNumRaters2);
        double cosine = calculateCosineCorrelation(dotProduct,
                Math.sqrt(rating1NormSq), Math.sqrt(rating2NormSq));
        context.write(key, new Tuple3<Double, Double, Double>(pearson, jaccard, cosine));
    }

    /**
     * Pearson product-moment correlation coefficient:
     *   (n*dot - sumA*sumB) / (sqrt(n*normSqA - sumA^2) * sqrt(n*normSqB - sumB^2))
     * Returns NaN when either vector has zero variance (denominator 0).
     */
    public double calculatePearsonCorrelation(double size, double dotProduct,
                                              double rating1Sum, double rating2Sum,
                                              double rating1NormSq, double rating2NormSq) {
        double numerator = size * dotProduct - rating1Sum * rating2Sum;
        double denominator = Math.sqrt(size * rating1NormSq - rating1Sum * rating1Sum)
                * Math.sqrt(size * rating2NormSq - rating2Sum * rating2Sum);
        return numerator / denominator;
    }

    /** Cosine similarity: dot(A,B) / (|A| * |B|). */
    public double calculateCosineCorrelation(double dotProduct, double rating1Norm, double rating2Norm) {
        return dotProduct / (rating1Norm * rating2Norm);
    }

    /**
     * Jaccard similarity: |A ∩ B| / |A ∪ B|.
     * Fixed: the union is totalA + totalB - inCommon; the original added
     * totalA twice, ignoring movie B's rater count entirely.
     */
    public double calculateJaccardCorrelation(double inCommon, double totalA, double totalB) {
        double union = totalA + totalB - inCommon;
        return inCommon / union;
    }
}
完整的驱动代码如下：
package com.deng.MovieRecommend;
import com.deng.FileUtil;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MovieRecommendDriver {
    /**
     * Chains the three MapReduce jobs of the recommender:
     *   1. count raters per movie                -> output
     *   2. build per-movie-pair vectors          -> output2
     *   3. compute the similarity coefficients   -> output3
     * Each stage only starts if the previous one completed successfully;
     * the process exit code reflects the final job's outcome.
     */
    public static void main(String[] args) throws Exception {
        // Clear previous results so FileOutputFormat does not refuse to run.
        FileUtil.deleteDirs("output");
        FileUtil.deleteDirs("output2");
        FileUtil.deleteDirs("output3");
        Configuration conf = new Configuration();
        String[] otherArgs = new String[]{"input/movieRecommend.txt", "output"};

        // Job.getInstance replaces the deprecated Job(Configuration, String) constructor.
        Job findNumberOfRatingJob = Job.getInstance(conf, "MovieRecommendDriver");
        findNumberOfRatingJob.setJarByClass(MovieRecommendDriver.class);
        findNumberOfRatingJob.setMapperClass(findNumberOfRatersMapper.class);
        findNumberOfRatingJob.setReducerClass(findNumberOfRatersReducer.class);
        findNumberOfRatingJob.setOutputKeyClass(Text.class);
        findNumberOfRatingJob.setOutputValueClass(Tuple2.class);
        FileInputFormat.addInputPath(findNumberOfRatingJob, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(findNumberOfRatingJob, new Path(otherArgs[1]));
        // waitForCompletion already returns a boolean; no need for the original
        // (b ? 0 : 1) == 0 round-trip.
        if (findNumberOfRatingJob.waitForCompletion(true)) {
            Job vectorJob = Job.getInstance(conf, "MovieRecommendDriver");
            vectorJob.setJarByClass(MovieRecommendDriver.class);
            vectorJob.setMapperClass(VectorMapper.class);
            vectorJob.setReducerClass(VectorReducer.class);
            vectorJob.setOutputKeyClass(Text.class);
            vectorJob.setOutputValueClass(Tuple3.class);
            FileInputFormat.setInputPaths(vectorJob, new Path("output/part-r-00000"));
            FileOutputFormat.setOutputPath(vectorJob, new Path("output2"));
            if (vectorJob.waitForCompletion(true)) {
                Job correlationJob = Job.getInstance(conf, "MovieRecommendDriver");
                correlationJob.setJarByClass(MovieRecommendDriver.class);
                correlationJob.setMapperClass(correlationMapper.class);
                correlationJob.setReducerClass(correlationReducer.class);
                correlationJob.setOutputKeyClass(Tuple2.class);
                correlationJob.setOutputValueClass(Tuple7.class);
                FileInputFormat.setInputPaths(correlationJob, new Path("output2/part-r-00000"));
                FileOutputFormat.setOutputPath(correlationJob, new Path("output3"));
                System.exit(correlationJob.waitForCompletion(true) ? 0 : 1);
            }
        }
    }
}