package cn.spark.study.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import com.google.common.collect.Sets;
import scala.Tuple2;
/**
 * Spark driver that computes the friends two users have in common.
 *
 * <p>Each input line has the form {@code person,friend1 friend2 ...}. For every
 * (person, friend) edge the job emits the numerically sorted pair as key and the
 * person's complete friend list as value. Combining all lists that share a key
 * yields the common friends of that pair. Two equivalent strategies are run and
 * their results printed: {@code groupByKey} followed by counting, and
 * {@code reduceByKey} with set intersection.
 */
public class FindCommonFriends {

    /**
     * Entry point of the job.
     *
     * @param args {@code args[0]} is the path of the input text file
     */
    public static void main(String[] args) {
        // Validate the arguments first so a bad invocation never creates a context.
        if (args.length < 1) {
            System.err.println("Usage: FindCommonFriends <input-path>");
            System.exit(1);
        }
        SparkConf conf = new SparkConf().setAppName("FindCommonFriends");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            run(jsc, args[0]);
        } finally {
            // Always stop the context, even when one of the jobs fails.
            jsc.stop();
        }
    }

    /**
     * Runs both common-friends computations on the given input and prints the
     * intermediate and final results to standard output.
     *
     * @param jsc       the context used to read the input
     * @param inputPath path of the text file to read
     */
    private static void run(JavaSparkContext jsc, String inputPath) {
        JavaRDD<String> records = jsc.textFile(inputPath);

        // "pairs" feeds three separate jobs below; cache it so the input is
        // read and parsed only once.
        JavaPairRDD<Tuple2<Long, Long>, Iterable<Long>> pairs = records.flatMapToPair(
            new PairFlatMapFunction<String, Tuple2<Long, Long>, Iterable<Long>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<Tuple2<Tuple2<Long, Long>, Iterable<Long>>> call(String s) throws Exception {
                    return parseRecord(s);
                }
            }).cache();

        // debug1: every emitted (pair, friend list) tuple
        for (Tuple2<Tuple2<Long, Long>, Iterable<Long>> r : pairs.collect()) {
            System.out.println("debug1 key = " + r._1 + "\t value = " + r._2);
        }

        // Strategy 1: group all friend lists per pair, then keep the ids present in every list.
        JavaPairRDD<Tuple2<Long, Long>, Iterable<Iterable<Long>>> grouped = pairs.groupByKey();

        // debug2: the grouped friend lists per pair
        for (Tuple2<Tuple2<Long, Long>, Iterable<Iterable<Long>>> r : grouped.collect()) {
            System.out.println("debug2 key = " + r._1 + "\t value = " + r._2);
        }

        JavaPairRDD<Tuple2<Long, Long>, Iterable<Long>> commonFriends = grouped.mapValues(
            new Function<Iterable<Iterable<Long>>, Iterable<Long>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<Long> call(Iterable<Iterable<Long>> s) throws Exception {
                    return friendsInEveryList(s);
                }
            });

        // debug3: result of strategy 1
        for (Tuple2<Tuple2<Long, Long>, Iterable<Long>> r : commonFriends.collect()) {
            System.out.println("debug3 commonFriends key = " + r._1 + "\t value = " + r._2);
        }

        System.out.println("===================================Two reduceByKey================================================");

        // Strategy 2: intersect the friend lists pairwise while reducing.
        JavaPairRDD<Tuple2<Long, Long>, Iterable<Long>> commonFriendsReduce = pairs.reduceByKey(
            new Function2<Iterable<Long>, Iterable<Long>, Iterable<Long>>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Iterable<Long> call(Iterable<Long> a, Iterable<Long> b) throws Exception {
                    return intersect(a, b);
                }
            });

        Map<Tuple2<Long, Long>, Iterable<Long>> commonFriendsMap = commonFriendsReduce.collectAsMap();
        for (Entry<Tuple2<Long, Long>, Iterable<Long>> r : commonFriendsMap.entrySet()) {
            System.out.println("Two commonFriendsReduce key = " + r.getKey() + "\t value = " + r.getValue());
        }
    }

    /**
     * Turns one input line into (sorted pair, friend list) tuples.
     *
     * <p>A line without a friend list (blank line, no comma, or nothing after the
     * comma) yields no tuples. A person with exactly one friend yields a single
     * tuple whose value is an empty list. Friend ids may be separated by any run
     * of whitespace.
     *
     * @param line one line of the form {@code person,friend1 friend2 ...}
     * @return the tuples for this line, possibly empty
     * @throws NumberFormatException if the person id or a friend id is not a valid long
     */
    private static List<Tuple2<Tuple2<Long, Long>, Iterable<Long>>> parseRecord(String line) {
        List<Tuple2<Tuple2<Long, Long>, Iterable<Long>>> result =
            new ArrayList<Tuple2<Tuple2<Long, Long>, Iterable<Long>>>();
        if (line == null) {
            return result;
        }
        String[] tokens = line.split(",");
        if (tokens.length < 2) {
            // Blank line or a person without any listed friends: nothing to pair up.
            return result;
        }
        String friendsAsString = tokens[1].trim();
        if (friendsAsString.isEmpty()) {
            return result;
        }
        long person = Long.parseLong(tokens[0].trim());
        String[] friendsTokenized = friendsAsString.split("\\s+");

        if (friendsTokenized.length == 1) {
            // Only one friend: the pair still gets a key, but this person
            // contributes no candidate common friends.
            Tuple2<Long, Long> key = buildSortedTuple(person, Long.parseLong(friendsTokenized[0]));
            result.add(new Tuple2<Tuple2<Long, Long>, Iterable<Long>>(key, new ArrayList<Long>()));
            return result;
        }

        List<Long> friends = new ArrayList<Long>();
        for (String token : friendsTokenized) {
            friends.add(Long.parseLong(token));
        }
        for (Long friend : friends) {
            Tuple2<Long, Long> key = buildSortedTuple(person, friend);
            result.add(new Tuple2<Tuple2<Long, Long>, Iterable<Long>>(key, friends));
        }
        return result;
    }

    /**
     * Returns the ids that occur in every one of the given friend lists.
     *
     * <p>Each list is de-duplicated before counting, so an id repeated inside a
     * single list cannot be mistaken for one shared by several lists. When only
     * one list is given, all of its distinct ids are returned.
     *
     * @param friendLists the friend lists collected for one pair
     * @return the ids contained in all lists, in no particular order
     */
    private static List<Long> friendsInEveryList(Iterable<Iterable<Long>> friendLists) {
        Map<Long, Integer> countCommon = new HashMap<Long, Integer>();
        int size = 0;
        for (Iterable<Long> friends : friendLists) {
            size++;
            Set<Long> distinct = new HashSet<Long>(iterableToList(friends));
            for (Long f : distinct) {
                Integer count = countCommon.get(f);
                countCommon.put(f, count == null ? 1 : count + 1);
            }
        }
        List<Long> finalCommonFriends = new ArrayList<Long>();
        for (Map.Entry<Long, Integer> entry : countCommon.entrySet()) {
            // An id seen once per list is present in all of them.
            if (entry.getValue().intValue() == size) {
                finalCommonFriends.add(entry.getKey());
            }
        }
        return finalCommonFriends;
    }

    /**
     * Returns the ids contained in both arguments.
     *
     * @param a first collection of ids
     * @param b second collection of ids
     * @return a new set holding the elements of {@code b} that also occur in {@code a}
     */
    private static Set<Long> intersect(Iterable<Long> a, Iterable<Long> b) {
        Set<Long> x = Sets.newHashSet(a);
        Set<Long> intersection = new HashSet<Long>();
        for (Long item : b) {
            if (x.contains(item)) {
                intersection.add(item);
            }
        }
        return intersection;
    }

    /**
     * Builds the key for a pair of users with the smaller id first, so that
     * (a, b) and (b, a) map to the same key.
     *
     * @param p1 id of one user
     * @param p2 id of the other user
     * @return the tuple {@code (min, max)} of the two ids
     */
    static Tuple2<Long, Long> buildSortedTuple(Long p1, Long p2) {
        if (p1 < p2) {
            return new Tuple2<Long, Long>(p1, p2);
        }
        else {
            return new Tuple2<Long, Long>(p2, p1);
        }
    }

    /**
     * Copies the elements of an iterable into a new list, preserving iteration order.
     *
     * @param r the elements to copy
     * @return a new mutable list; never {@code null}
     */
    static List<Long> iterableToList(Iterable<Long> r) {
        List<Long> l = new ArrayList<Long>();
        for (Long e : r) {
            l.add(e);
        }
        return l;
    }
}
Test data:
100,200 300 400 500
200,100 300 400
300,100 200 400 500
400,100 200 300
500,100 300
600,100
Run script:
/usr/local/spark1.5/bin/spark-submit \
--class cn.spark.study.core.FindCommonFriends \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-text/java/findCommonFriends/FindCommonFriends.jar hdfs://spark01:9000/commonFriends.txt
Output:
debug1 key = (100,200) value = [200, 300, 400, 500]
debug1 key = (100,300) value = [200, 300, 400, 500]
debug1 key = (100,400) value = [200, 300, 400, 500]
debug1 key = (100,500) value = [200, 300, 400, 500]
debug1 key = (100,200) value = [100, 300, 400]
debug1 key = (200,300) value = [100, 300, 400]
debug1 key = (200,400) value = [100, 300, 400]
debug1 key = (100,300) value = [100, 200, 400, 500]
debug1 key = (200,300) value = [100, 200, 400, 500]
debug1 key = (300,400) value = [100, 200, 400, 500]
debug1 key = (300,500) value = [100, 200, 400, 500]
debug1 key = (100,400) value = [100, 200, 300]
debug1 key = (200,400) value = [100, 200, 300]
debug1 key = (300,400) value = [100, 200, 300]
debug1 key = (100,500) value = [100, 300]
debug1 key = (300,500) value = [100, 300]
debug1 key = (100,600) value = []
debug2 key = (300,400) value = [[100, 200, 400, 500], [100, 200, 300]]
debug2 key = (100,200) value = [[200, 300, 400, 500], [100, 300, 400]]
debug2 key = (300,500) value = [[100, 200, 400, 500], [100, 300]]
debug2 key = (100,500) value = [[200, 300, 400, 500], [100, 300]]
debug2 key = (200,300) value = [[100, 300, 400], [100, 200, 400, 500]]
debug2 key = (100,600) value = [[]]
debug2 key = (100,300) value = [[200, 300, 400, 500], [100, 200, 400, 500]]
debug2 key = (200,400) value = [[100, 300, 400], [100, 200, 300]]
debug2 key = (100,400) value = [[200, 300, 400, 500], [100, 200, 300]]
debug3 commonFriends key = (300,400) value = [100, 200]
debug3 commonFriends key = (100,200) value = [400, 300]
debug3 commonFriends key = (300,500) value = [100]
debug3 commonFriends key = (100,500) value = [300]
debug3 commonFriends key = (200,300) value = [100, 400]
debug3 commonFriends key = (100,600) value = []
debug3 commonFriends key = (100,300) value = [200, 500, 400]
debug3 commonFriends key = (200,400) value = [100, 300]
debug3 commonFriends key = (100,400) value = [200, 300]
===================================Two reduceByKey================================================
Two commonFriendsReduce key = (300,400) value = [100, 200]
Two commonFriendsReduce key = (200,400) value = [100, 300]
Two commonFriendsReduce key = (100,600) value = []
Two commonFriendsReduce key = (200,300) value = [100, 400]
Two commonFriendsReduce key = (100,500) value = [300]
Two commonFriendsReduce key = (300,500) value = [100]
Two commonFriendsReduce key = (100,300) value = [200, 500, 400]
Two commonFriendsReduce key = (100,200) value = [400, 300]
Two commonFriendsReduce key = (100,400) value = [200, 300]