Spark Cartesian Product Implementation Approaches

Four ways to compute the Cartesian product of an RDD with itself in Spark, with notes on their trade-offs:

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setAppName("tst_cartesian").setMaster("local")
val sc = new SparkContext(conf)
//val pairs = sc.parallelize(Array(("a", Vectors.dense(1)), ("b", Vectors.dense(2)), ("c", Vectors.dense(3))))
val pairs = sc.parallelize(List("a", "b", "c"))

// 1. Built-in operator: acceptable performance, and the simplest to use.
val cartesian_rdd = pairs.cartesian(pairs)
println("--------cartesian--------")
cartesian_rdd.foreach(println(_))

// 2. Custom implementation: mediocre performance, but a cleaner result set:
//    only the upper triangle is produced, i.e. each unordered pair appears once.
def combs(rdd: RDD[String]): RDD[(String, String)] = {
  val count = rdd.count // count once; the original recomputed rdd.count in every branch
  if (count < 2) {
    sc.makeRDD[(String, String)](Seq.empty)
  } else if (count == 2) {
    val values = rdd.collect
    sc.makeRDD[(String, String)](Seq((values(0), values(1))))
  } else {
    val elem = rdd.take(1)                       // head element
    val elemRdd = sc.makeRDD(elem)
    val subtracted = rdd.subtract(elemRdd)       // everything except the head
    val comb = subtracted.map(e => (elem(0), e)) // pair the head with each remaining element
    comb.union(combs(subtracted))                // recurse on the rest
  }
}
val cartesian_rdd2 = combs(pairs)
println("--------combs--------")
cartesian_rdd2.foreach(println(_))

// 3. Join-based implementation: performance to be determined.
//    Map every element to one shared key; values under the same key then
//    join into all Cartesian combinations. (A sketch follows this block.)

// 4. Collect the RDD to the driver and build the pairs with two nested loops.
//    This consumes driver memory, and its performance is uncontrollable.
val drivers = pairs.collect()
val empnos = scala.collection.mutable.ArrayBuffer[(String, String)]()
for (driver_one <- drivers) {
  for (driver_two <- drivers) {
    val pair_empno = (driver_one, driver_two)
    empnos += pair_empno
  }
}
println("--------drive--------")
empnos.foreach(println(_))
```
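Approach 3 above exists only as a comment, so here is a minimal sketch of the join-based idea, reusing `pairs` and `sc` from the listing above. The constant dummy key `0` and the names `keyed` and `cartesian_rdd3` are illustrative choices of mine, not from the original:

```scala
// 3. (sketch) Join-based Cartesian product: give every element the same
//    key, then self-join; each value meets every value, including itself.
val keyed = pairs.map(e => (0, e))            // (0,"a"), (0,"b"), (0,"c")
val cartesian_rdd3 = keyed.join(keyed).values // (x, y) for every ordered pair
println("--------join--------")
cartesian_rdd3.foreach(println(_))
```

Note that with a single shared key, all the pairs are materialized inside one join partition, so this variant does not actually spread the quadratic work across the cluster; that may be why the original leaves its performance as an open question.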
Conclusion: do not compute Cartesian products in Spark lightly! Whichever implementation you pick, an input of n elements expands to on the order of n² output pairs, so the cost grows quadratically with the data.