package cn.spark.study.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import scala.Tuple2;
public class SparkFriendRecommendation {
public static void main(String[] args){
SparkConf conf = new SparkConf().setAppName("SparkFriendRecommendation");
JavaSparkContext jsc = new JavaSparkContext(conf);
if(args.length < 1){
System.out.println("err");
System.exit(1);
}
JavaRDD<String> records = jsc.textFile(args[0],1);
JavaPairRDD<Long,Tuple2<Long,Long>> pairs = records.flatMapToPair(new PairFlatMapFunction<String,Long,Tuple2<Long,Long>>(){
private static final long serialVersionUID = 1L;
@Override
public Iterable<Tuple2<Long, Tuple2<Long, Long>>> call(String s) throws Exception {
String[] tokens = s.split(" ");
Long person = Long.parseLong(tokens[0]);
String friedsAsString = tokens[1];
String[] friendsTokenized = friedsAsString.split(",");
List<Long> friends = new ArrayList<Long>();
List<Tuple2<Long,Tuple2<Long,Long>>> mapperOutput = new ArrayList<Tuple2<Long,Tuple2<Long,Long>>>();
for (String f : friendsTokenized){
Long fl = Long.parseLong(f);
friends.add(fl);
Tuple2<Long,Long> v1 = new Tuple2<Long,Long>(fl,-1L);
Tuple2<Long,Tuple2<Long,Long>> v = new Tuple2<Long,Tuple2<Long,Long>>(person,v1);
mapperOutput.add(v);
}
for(int i = 0;i < friends.size();i++){
for(int j = i+1; j < friends.size();j++){
Tuple2<Long,Long> pf1 = new Tuple2<Long,Long>(friends.get(j),person);
mapperOutput.add(new Tuple2<Long,Tuple2<Long,Long>>(friends.get(i),pf1));
Tuple2<Long,Long> pf2 = new Tuple2<Long,Long>(friends.get(i),person);
mapperOutput.add(new Tuple2<Long,Tuple2<Long,Long>>(friends.get(j),pf2));
}
}
return mapperOutput;
}
});
//debug2
List<Tuple2<Long, Tuple2<Long, Long>>> debug2 = pairs.collect();
for(Tuple2<Long, Tuple2<Long, Long>> r : debug2){
System.out.println("debug2 key = " + r._1 + "\t value = " + r._2);
}
JavaPairRDD<Long, Iterable<Tuple2<Long, Long>>> grouped = pairs.groupByKey();
//debug3
List<Tuple2<Long, Iterable<Tuple2<Long, Long>>>> debug3 = grouped.collect();
for(Tuple2<Long, Iterable<Tuple2<Long, Long>>> r : debug3){
System.out.println("debug3 key = " + r._1 + "\t value = " + r._2);
}
JavaPairRDD<Long,String> rec = grouped.mapValues(new Function<Iterable<Tuple2<Long,Long>>,String>(){
private static final long serialVersionUID = 1L;
@Override
public String call(Iterable<Tuple2<Long, Long>> s) throws Exception {
final Map<Long,List<Long>> mutualFriends = new HashMap<Long,List<Long>>();
for(Tuple2<Long,Long> t2 : s){
final Long toUser = t2._1;
final Long mutualFriend = t2._2;
final boolean alreadyFriend = (mutualFriend == -1);
if(mutualFriends.containsKey(toUser)){
if(alreadyFriend){
mutualFriends.put(toUser, null);
}
else if(mutualFriends.get(toUser) != null){
mutualFriends.get(toUser).add(mutualFriend);
}
}
else{
if(alreadyFriend){
mutualFriends.put(toUser, null);
}
else {
List<Long> list1 = new ArrayList<Long>(Arrays.asList(mutualFriend));
mutualFriends.put(toUser, list1);
}
}
}
// mutualFriends.forEach((Key,Value)->{
// for(Long l : Value){
// System.out.println("Map<Long,List<Long>>" + l);
// }
// });
return buildRecommendations(mutualFriends);
}
});
//debug4
List<Tuple2<Long, String>> debug4 = rec.collect();
for(Tuple2<Long, String> r : debug4){
System.out.println("last key = " + r._1 + "\t value = " + r._2);
}
jsc.close();
}
static String buildRecommendations(Map<Long,List<Long>> mutualFriends){
StringBuilder r = new StringBuilder();
for(Map.Entry<Long, List<Long>> entry : mutualFriends.entrySet()){
if(entry.getValue() == null){
continue;
}
r.append(entry.getKey());
r.append("(");
r.append(entry.getValue().size());
r.append(":");
r.append(entry.getValue());
r.append("),");
}
return r.toString();
}
}
测试数据:
1 2,3,4,5,6,7,8
2 1,3,4,5,7
3 1,2
4 1,2,6
5 1,2
6 1,4
7 1,2
8 1
本例中,1的好友分别为2,3,4,5,6,7,8,其余各行以此类推。
脚本:
/usr/local/spark1.5/bin/spark-submit \
--class cn.spark.study.core.SparkFriendRecommendation \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-text/java/SparkFriendRecommendation/SparkFriendRecommendation.jar hdfs://spark01:9000/tSparkFriendRecommendation.txt
运行结果:
debug2 key = 1 value = (2,-1)
debug2 key = 1 value = (3,-1)
debug2 key = 1 value = (4,-1)
debug2 key = 1 value = (5,-1)
debug2 key = 1 value = (6,-1)
debug2 key = 1 value = (7,-1)
debug2 key = 1 value = (8,-1)
debug2 key = 2 value = (3,1)
debug2 key = 3 value = (2,1)
debug2 key = 2 value = (4,1)
debug2 key = 4 value = (2,1)
debug2 key = 2 value = (5,1)
debug2 key = 5 value = (2,1)
debug2 key = 2 value = (6,1)
debug2 key = 6 value = (2,1)
debug2 key = 2 value = (7,1)
debug2 key = 7 value = (2,1)
debug2 key = 2 value = (8,1)
debug2 key = 8 value = (2,1)
debug2 key = 3 value = (4,1)
debug2 key = 4 value = (3,1)
debug2 key = 3 value = (5,1)
debug2 key = 5 value = (3,1)
debug2 key = 3 value = (6,1)
debug2 key = 6 value = (3,1)
debug2 key = 3 value = (7,1)
debug2 key = 7 value = (3,1)
debug2 key = 3 value = (8,1)
debug2 key = 8 value = (3,1)
debug2 key = 4 value = (5,1)
debug2 key = 5 value = (4,1)
debug2 key = 4 value = (6,1)
debug2 key = 6 value = (4,1)
debug2 key = 4 value = (7,1)
debug2 key = 7 value = (4,1)
debug2 key = 4 value = (8,1)
debug2 key = 8 value = (4,1)
debug2 key = 5 value = (6,1)
debug2 key = 6 value = (5,1)
debug2 key = 5 value = (7,1)
debug2 key = 7 value = (5,1)
debug2 key = 5 value = (8,1)
debug2 key = 8 value = (5,1)
debug2 key = 6 value = (7,1)
debug2 key = 7 value = (6,1)
debug2 key = 6 value = (8,1)
debug2 key = 8 value = (6,1)
debug2 key = 7 value = (8,1)
debug2 key = 8 value = (7,1)
debug2 key = 2 value = (1,-1)
debug2 key = 2 value = (3,-1)
debug2 key = 2 value = (4,-1)
debug2 key = 2 value = (5,-1)
debug2 key = 2 value = (7,-1)
debug2 key = 1 value = (3,2)
debug2 key = 3 value = (1,2)
debug2 key = 1 value = (4,2)
debug2 key = 4 value = (1,2)
debug2 key = 1 value = (5,2)
debug2 key = 5 value = (1,2)
debug2 key = 1 value = (7,2)
debug2 key = 7 value = (1,2)
debug2 key = 3 value = (4,2)
debug2 key = 4 value = (3,2)
debug2 key = 3 value = (5,2)
debug2 key = 5 value = (3,2)
debug2 key = 3 value = (7,2)
debug2 key = 7 value = (3,2)
debug2 key = 4 value = (5,2)
debug2 key = 5 value = (4,2)
debug2 key = 4 value = (7,2)
debug2 key = 7 value = (4,2)
debug2 key = 5 value = (7,2)
debug2 key = 7 value = (5,2)
debug2 key = 3 value = (1,-1)
debug2 key = 3 value = (2,-1)
debug2 key = 1 value = (2,3)
debug2 key = 2 value = (1,3)
debug2 key = 4 value = (1,-1)
debug2 key = 4 value = (2,-1)
debug2 key = 4 value = (6,-1)
debug2 key = 1 value = (2,4)
debug2 key = 2 value = (1,4)
debug2 key = 1 value = (6,4)
debug2 key = 6 value = (1,4)
debug2 key = 2 value = (6,4)
debug2 key = 6 value = (2,4)
debug2 key = 5 value = (1,-1)
debug2 key = 5 value = (2,-1)
debug2 key = 1 value = (2,5)
debug2 key = 2 value = (1,5)
debug2 key = 6 value = (1,-1)
debug2 key = 6 value = (4,-1)
debug2 key = 1 value = (4,6)
debug2 key = 4 value = (1,6)
debug2 key = 7 value = (1,-1)
debug2 key = 7 value = (2,-1)
debug2 key = 1 value = (2,7)
debug2 key = 2 value = (1,7)
debug2 key = 8 value = (1,-1)
debug3 key = 4 value = [(2,1), (3,1), (5,1), (6,1), (7,1), (8,1), (1,2), (3,2), (5,2), (7,2), (1,-1), (2,-1), (6,-1), (1,6)]
debug3 key = 1 value = [(2,-1), (3,-1), (4,-1), (5,-1), (6,-1), (7,-1), (8,-1), (3,2), (4,2), (5,2), (7,2), (2,3), (2,4), (6,4), (2,5), (4,6), (2,7)]
debug3 key = 6 value = [(2,1), (3,1), (4,1), (5,1), (7,1), (8,1), (1,4), (2,4), (1,-1), (4,-1)]
debug3 key = 3 value = [(2,1), (4,1), (5,1), (6,1), (7,1), (8,1), (1,2), (4,2), (5,2), (7,2), (1,-1), (2,-1)]
debug3 key = 7 value = [(2,1), (3,1), (4,1), (5,1), (6,1), (8,1), (1,2), (3,2), (4,2), (5,2), (1,-1), (2,-1)]
debug3 key = 8 value = [(2,1), (3,1), (4,1), (5,1), (6,1), (7,1), (1,-1)]
debug3 key = 5 value = [(2,1), (3,1), (4,1), (6,1), (7,1), (8,1), (1,2), (3,2), (4,2), (7,2), (1,-1), (2,-1)]
debug3 key = 2 value = [(3,1), (4,1), (5,1), (6,1), (7,1), (8,1), (1,-1), (3,-1), (4,-1), (5,-1), (7,-1), (1,3), (1,4), (6,4), (1,5), (1,7)]
last key = 4 value = 3(2:[1, 2]),5(2:[1, 2]),7(2:[1, 2]),8(1:[1]),
last key = 1 value =
last key = 6 value = 2(2:[1, 4]),3(1:[1]),5(1:[1]),7(1:[1]),8(1:[1]),
last key = 3 value = 4(2:[1, 2]),5(2:[1, 2]),6(1:[1]),7(2:[1, 2]),8(1:[1]),
last key = 7 value = 3(2:[1, 2]),4(2:[1, 2]),5(2:[1, 2]),6(1:[1]),8(1:[1]),
last key = 8 value = 2(1:[1]),3(1:[1]),4(1:[1]),5(1:[1]),6(1:[1]),7(1:[1]),
last key = 5 value = 3(2:[1, 2]),4(2:[1, 2]),6(1:[1]),7(2:[1, 2]),8(1:[1]),
last key = 2 value = 6(2:[1, 4]),8(1:[1]),
本例中,给4推荐的好友之一为3,二者共同好友的个数为2,分别是1和2。