上篇用 MapReduce 实现了查找二度好友，但代码偏多而且复杂。如果改用 Spark 来实现，可以大大减少代码量。以下是笔者的实现代码：
package bigdata.testspark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * Finds "second-degree friends": pairs of people who are not direct friends
 * but share at least one common friend, and prints how many friends each
 * such pair has in common.
 *
 * Input is a text file with one friendship per line, written as two names
 * separated by a tab. Friendships are treated as symmetric, so every
 * second-degree pair is reported once in each direction.
 */
object FindDegree2Friends {
  /** Default input path, used when no path is given on the command line. */
  var files = "/home/typery/temp/hadoop_test/deg2friends/src_data/total_firends.txt"

  /**
   * Runs the job on a local Spark master and prints one line per
   * second-degree pair: `(a,b) has N same friends!`.
   *
   * @param args optional; when present, `args(0)` is used as the input path
   *             instead of [[files]].
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("FIND_DEGREE2_FRIENDS")
    val sc = new SparkContext(conf)
    try {
      sc.setLogLevel("WARN")
      val inputPath = args.headOption.getOrElse(files)

      // Every friendship in both directions, de-duplicated. Lines with fewer
      // than two tab-separated fields (e.g. blank lines) are skipped instead of
      // failing the job with an ArrayIndexOutOfBoundsException.
      // Cached because it feeds the direct-friend set and both sides of the join.
      val friendPairs = sc.textFile(inputPath)
        .map(_.split("\t"))
        .flatMap {
          case Array(a, b, _*) => Seq((a, b), (b, a))
          case _               => Seq.empty[(String, String)]
        }
        .distinct()
        .cache()

      // Direct friendships keyed by the pair itself, so they can be subtracted below.
      val directFriends = friendPairs.map(pair => (pair, 1))

      // Self-join on the shared friend: each (friend, (a, b)) row means a and b
      // both know `friend`. Because friendPairs is distinct, a given (a, b)
      // appears exactly once per common friend.
      val candidatePairs = friendPairs.join(friendPairs)
        .values
        .filter { case (left, right) => left != right }
        .map(pair => (pair, 1))

      // Drop pairs that are already direct friends, then count the remaining
      // occurrences per pair, i.e. the number of common friends.
      // Output order is not deterministic.
      candidatePairs
        .subtractByKey(directFriends)
        .reduceByKey(_ + _)
        .foreach { case (pair, common) => println(s"$pair has $common same friends!") }
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}
执行的计算结果如下（每一对二度好友会按两个方向各输出一次，数字表示两人的共同好友数；输出顺序不固定）：
(Tom,HanMeimei) has 1 same friends!
(HanMeimei,Jim) has 1 same friends!
(Tom,Tim) has 1 same friends!
(Tim,Jim) has 1 same friends!
(LiLei,Lucy) has 1 same friends!
(Jim,Tom) has 2 same friends!
(Tim,Kate) has 2 same friends!
(Lucy,Kate) has 2 same friends!
(Kate,LiLei) has 1 same friends!
(Lucy,Tim) has 2 same friends!
(Jim,Lucy) has 1 same friends!
(Jim,HanMeimei) has 1 same friends!
(HanMeimei,Tom) has 1 same friends!
(Tim,Lucy) has 2 same friends!
(Lucy,LiLei) has 1 same friends!
(Kate,Tim) has 2 same friends!
(Tom,Jim) has 2 same friends!
(Lucy,Tom) has 1 same friends!
(Jim,Tim) has 1 same friends!
(Lucy,Jim) has 1 same friends!
(LiLei,Lily) has 1 same friends!
(Kate,Lucy) has 2 same friends!
(Tom,Lucy) has 1 same friends!
(Lily,HanMeimei) has 3 same friends!
(LiLei,Kate) has 1 same friends!
(HanMeimei,Lily) has 3 same friends!
(Lily,LiLei) has 1 same friends!
(Tim,Tom) has 1 same friends!