package cn.spark.study.core;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import scala.Tuple3;
import scala.Tuple7;
public class MovieRecommendationsWithJoin {
/**
 * Driver: reads "user movie rating" records, pairs up every two movies rated
 * by the same user through a self-join, and prints Pearson, cosine and
 * Jaccard similarity for each movie pair.
 *
 * Intermediate RDDs are collected to the driver and printed with a
 * "debugN" prefix, so this is only suitable for small inputs.
 *
 * @param args args[0] is the path of the input text file; each line is split
 *             on a single space into user, movie and an integer rating
 */
public static void main(String[] args){
    // Validate the arguments before any Spark resource is created, so a bad
    // invocation does not exit with a live context.
    if(args.length < 1){
        System.out.println("err");
        System.exit(1);
    }
    SparkConf conf = new SparkConf().setAppName("MovieRecommendationsWithJoin");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    try{
        JavaRDD<String> records = jsc.textFile(args[0],1);
        List<String> rawLines = records.collect();
        for(String line : rawLines){
            System.out.println("debug4" + line);
        }

        /*
         * Input : a line of the form "user movie rating".
         * Output: a pair with key = movie and value = Tuple2<user, rating>.
         */
        JavaPairRDD<String,Tuple2<String,Integer>> moviesRDD = records.mapToPair(
                new PairFunction<String,String,Tuple2<String,Integer>>(){
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Tuple2<String, Integer>> call(String s) throws Exception {
                String[] m = s.split(" ");
                // Print the fields themselves; println on the array would only
                // print its identity hash.
                System.out.println(java.util.Arrays.toString(m));
                String user = m[0];
                String movie = m[1];
                Integer rating = Integer.valueOf(m[2]);
                Tuple2<String,Integer> value = new Tuple2<String,Integer>(user,rating);
                return new Tuple2<String, Tuple2<String, Integer>>(movie,value);
            }
        });
        List<Tuple2<String, Tuple2<String, Integer>>> movieEntries = moviesRDD.collect();
        for(Tuple2<String, Tuple2<String, Integer>> entry : movieEntries){
            System.out.println("debug3 key = " + entry._1 + " value " + entry._2);
        }

        /*
         * Group by key (the movie).
         */
        JavaPairRDD<String,Iterable<Tuple2<String,Integer>>> groupMovie = moviesRDD.groupByKey();
        List<Tuple2<String, Iterable<Tuple2<String, Integer>>>> groupedEntries = groupMovie.collect();
        for(Tuple2<String, Iterable<Tuple2<String, Integer>>> entry : groupedEntries){
            System.out.println("debug2 key = " + entry._1 + " value " + entry._2);
        }

        /*
         * Output: key = user, value = Tuple3<movie, rating, numberOfRaters>,
         * where numberOfRaters is the number of ratings the movie received.
         */
        JavaPairRDD<String,Tuple3<String,Integer,Integer>> userRDD = groupMovie.flatMapToPair(
                new PairFlatMapFunction<Tuple2<String,Iterable<Tuple2<String,Integer>>>,String,Tuple3<String,Integer,Integer>>(){
            private static final long serialVersionUID = 1L;
            @Override
            public Iterable<Tuple2<String, Tuple3<String, Integer, Integer>>> call(
                    Tuple2<String, Iterable<Tuple2<String, Integer>>> m) throws Exception {
                String movie = m._1;
                // First pass: buffer the ratings so the total count is known
                // before any output tuple is built.
                List<Tuple2<String,Integer>> ratings = new ArrayList<Tuple2<String,Integer>>();
                for(Tuple2<String, Integer> userRating : m._2){
                    ratings.add(userRating);
                }
                Integer numberOfRaters = ratings.size();
                // Second pass: re-key every rating by its user.
                List<Tuple2<String,Tuple3<String,Integer,Integer>>> results =
                        new ArrayList<Tuple2<String,Tuple3<String,Integer,Integer>>>();
                for(Tuple2<String,Integer> userRating : ratings){
                    String user = userRating._1;
                    Integer rating = userRating._2;
                    Tuple3<String,Integer,Integer> t3 =
                            new Tuple3<String,Integer,Integer>(movie,rating,numberOfRaters);
                    results.add(new Tuple2<String,Tuple3<String,Integer,Integer>>(user,t3));
                }
                return results;
            }
        });
        List<Tuple2<String, Tuple3<String, Integer, Integer>>> userEntries = userRDD.collect();
        for(Tuple2<String, Tuple3<String, Integer, Integer>> entry : userEntries){
            System.out.println("debug1 key = " + entry._1 + " value " + entry._2);
        }

        /*
         * Self-join on user: yields every ordered pair of movies rated by the
         * same user (including a movie paired with itself).
         */
        JavaPairRDD<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>> joinRDD =
                userRDD.join(userRDD);
        List<Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>> joinedEntries =
                joinRDD.collect();
        for(Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>> entry : joinedEntries){
            System.out.println("debug key = " + entry._1 + " value " + entry._2);
        }

        /*
         * Filter: keep only pairs whose first movie name sorts strictly before
         * the second, which drops (a,a) and keeps one of (a,b)/(b,a).
         */
        JavaPairRDD<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>> filteredRDD = joinRDD.filter(
                new Function<Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>,Boolean>(){
            private static final long serialVersionUID = 1L;
            @Override
            public Boolean call(
                    Tuple2<String, Tuple2<Tuple3<String, Integer, Integer>, Tuple3<String, Integer, Integer>>> s)
                    throws Exception {
                String movieName1 = s._2._1._1();
                String movieName2 = s._2._2._1();
                return movieName1.compareTo(movieName2) < 0;
            }
        });

        /*
         * Re-key by (movie1, movie2) and carry the per-user values needed by
         * the correlation formulas: rating1, numberOfRaters1, rating2,
         * numberOfRaters2, rating1*rating2, rating1^2, rating2^2.
         */
        JavaPairRDD<Tuple2<String,String>, Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>> moviePairs = filteredRDD.mapToPair(
                new PairFunction<Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>,
                        Tuple2<String,String>,
                        Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>(){
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<Tuple2<String, String>, Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer>> call(
                    Tuple2<String, Tuple2<Tuple3<String, Integer, Integer>, Tuple3<String, Integer, Integer>>> s)
                    throws Exception {
                Tuple3<String,Integer,Integer> movie1 = s._2._1;
                Tuple3<String,Integer,Integer> movie2 = s._2._2;
                Tuple2<String, String> m1m2key = new Tuple2<String, String>(movie1._1(),movie2._1());
                int ratingProduct = movie1._2() * movie2._2(); // product of the two ratings
                int rating1Squared = movie1._2() * movie1._2();
                int rating2Squared = movie2._2() * movie2._2();
                Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer> t7 =
                        new Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>(
                                movie1._2(), // rating of movie1
                                movie1._3(), // number of raters of movie1
                                movie2._2(), // rating of movie2
                                movie2._3(), // number of raters of movie2
                                ratingProduct,
                                rating1Squared,
                                rating2Squared);
                return new Tuple2<Tuple2<String, String>, Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer>>(
                        m1m2key,t7);
            }
        });

        /*
         * Group by key (the movie pair).
         */
        JavaPairRDD<Tuple2<String,String>, Iterable<Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>> groupM1m2RDD =
                moviePairs.groupByKey();

        /*
         * Compute the correlations for every movie pair.
         */
        JavaPairRDD<Tuple2<String,String>, Tuple3<Double,Double,Double>> corr = groupM1m2RDD.mapValues(
                new Function<Iterable<Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>,Tuple3<Double,Double,Double>>(){
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple3<Double, Double, Double> call(
                    Iterable<Tuple7<Integer, Integer, Integer, Integer, Integer, Integer, Integer>> s)
                    throws Exception {
                return calculateCorrelations(s);
            }
        });

        System.out.println("=== Movie Correlations ===");
        List<Tuple2<Tuple2<String,String>, Tuple3<Double,Double,Double>>> correlations = corr.collect();
        for(Tuple2<Tuple2<String,String>, Tuple3<Double,Double,Double>> entry : correlations){
            System.out.println("last key = " + entry._1 + " value + " + entry._2);
        }
    } finally {
        // Always release the Spark context, including when a job fails.
        jsc.stop();
    }
}
/**
 * Aggregates the per-user tuples of one movie pair and derives three
 * similarity measures from the totals.
 *
 * Each Tuple7 is read as: _1 = rating1, _2 = number of raters of movie 1,
 * _3 = rating2, _4 = number of raters of movie 2, _5 = rating1 * rating2,
 * _6 = rating1 squared, _7 = rating2 squared.
 *
 * @param values one tuple per user who rated both movies of the pair
 * @return Tuple3 of (pearson, cosine, jaccard)
 */
static Tuple3<Double,Double,Double> calculateCorrelations(
        Iterable<Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>> values){
    int groupSize = 0;        // number of common raters = length of both rating vectors
    int dotProduct = 0;       // sum of rating1 * rating2
    int rating1Sum = 0;
    int rating2Sum = 0;
    int rating1NormSq = 0;    // sum of rating1 squared
    int rating2NormSq = 0;    // sum of rating2 squared
    int maxNumOfRaters1 = 0;
    int maxNumOfRaters2 = 0;
    for(Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer> t7 : values){
        groupSize++;
        dotProduct += t7._5();
        rating1Sum += t7._1();
        rating2Sum += t7._3();
        rating1NormSq += t7._6();
        rating2NormSq += t7._7();
        // Track the largest rater count seen for each movie.
        maxNumOfRaters1 = Math.max(maxNumOfRaters1, t7._2());
        maxNumOfRaters2 = Math.max(maxNumOfRaters2, t7._4());
    }
    double pearson = calculatePearsonCorrelation(
            groupSize,dotProduct,rating1Sum,rating2Sum,rating1NormSq,rating2NormSq);
    // The cosine numerator is the dot product of the two rating vectors,
    // not the number of common raters.
    double cosine = calculateCosineCorrelation(
            dotProduct,Math.sqrt(rating1NormSq),Math.sqrt(rating2NormSq));
    double jaccard = calculateJaccardCorrelation(groupSize,maxNumOfRaters1,maxNumOfRaters2);
    return new Tuple3<Double,Double,Double>(pearson,cosine,jaccard);
}
/**
 * Pearson correlation of two movies' rating vectors, computed from
 * pre-aggregated sums:
 * (n*Sxy - Sx*Sy) / (sqrt(n*Sxx - Sx*Sx) * sqrt(n*Syy - Sy*Sy)).
 *
 * There is no guard against a zero denominator; 0/0 yields NaN.
 *
 * @param groupSize     n, the number of rating pairs
 * @param dotProduct    Sxy, the sum of rating1 * rating2
 * @param rationg1Sum   Sx, the sum of the first movie's ratings
 * @param rationg2Sum   Sy, the sum of the second movie's ratings
 * @param rating1NormSq Sxx, the sum of the first movie's squared ratings
 * @param rating2NormSq Syy, the sum of the second movie's squared ratings
 * @return the correlation coefficient
 */
static double calculatePearsonCorrelation(double groupSize,double dotProduct,double rationg1Sum,
        double rationg2Sum,double rating1NormSq,double rating2NormSq){
    final double covarianceTerm = groupSize * dotProduct - rationg1Sum * rationg2Sum;
    final double spread1 = groupSize * rating1NormSq - rationg1Sum * rationg1Sum;
    final double spread2 = groupSize * rating2NormSq - rationg2Sum * rationg2Sum;
    final double scale = Math.sqrt(spread1) * Math.sqrt(spread2);
    return covarianceTerm / scale;
}
/**
 * Cosine of the angle between two vectors A and B.
 *
 * @param dotProduct   dot product of A and B
 * @param rationg1Norm Euclidean norm of A
 * @param rationg2Norm Euclidean norm of B
 * @return dotProduct divided by the product of the two norms
 */
static double calculateCosineCorrelation(double dotProduct,double rationg1Norm,double rationg2Norm){
    final double normProduct = rationg1Norm * rationg2Norm;
    return dotProduct / normProduct;
}
/**
 * Jaccard similarity of two sets A and B: |A ∩ B| / |A ∪ B|.
 *
 * @param inCommon size of the intersection of A and B
 * @param totalA   size of A
 * @param totalB   size of B
 * @return the intersection size divided by the union size
 */
static double calculateJaccardCorrelation(double inCommon,double totalA,double totalB){
    double union = totalA + totalB - inCommon;
    // Jaccard is a ratio of intersection to union, not a difference.
    return inCommon / union;
}
}
测试数据:
User1 Movie1 3
User1 Movie2 4
User1 Movie3 3
User2 Movie1 2
User2 Movie2 5
User2 Movie3 3
User2 Movie5 5
User3 Movie1 2
User3 Movie2 3
User3 Movie3 2
User4 Movie1 5
User4 Movie2 3
User4 Movie3 3
User4 Movie4 2
User4 Movie5 3
本例中,user 指看电影的用户,movie 指电影 id,每行最后的数字指该用户对该电影的评分。
脚本:
/usr/local/spark1.5/bin/spark-submit \
--class cn.spark.study.core.MovieRecommendationsWithJoin \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-text/java/movies/zmovies.jar hdfs://spark01:9000/movies.txt
运行结果:
last key = (Movie2,Movie3) value + (0.5222329678670935,0.09353047470899113,0.0)
last key = (Movie1,Movie3) value + (0.47140452079103173,0.11085479909473865,0.0)
last key = (Movie3,Movie4) value + (NaN,0.16666666666666666,-3.0)
last key = (Movie3,Movie5) value + (NaN,0.08084520834544433,-2.0)
last key = (Movie4,Movie5) value + (NaN,0.16666666666666666,-1.0)
last key = (Movie1,Movie2) value + (-0.492365963917331,0.08035434036903077,0.0)
last key = (Movie2,Movie5) value + (1.0,0.058823529411764705,-2.0)
last key = (Movie1,Movie5) value + (-1.0,0.0636929755298482,-2.0)
last key = (Movie1,Movie4) value + (NaN,0.1,-3.0)
last key = (Movie2,Movie4) value + (NaN,0.16666666666666666,-3.0)