import java.io.PrintWriter
import grizzled.slf4j.Logger
import org.apache.spark.graphx._
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.sql.SparkSession
import org.redblue.demo.TestGraph.toGexf

// Algorithm: for every vertex, compute its distance (in edges) from the farthest root vertex.
//
// Key background
// RDDs are lazily evaluated: RDD operations only look as if they run immediately.
// The RDD API has two kinds of functions: transformations and actions.
// Transformations are deferred: calling one only queues it, nothing is executed yet.
// When an action is called, Spark walks back from that action through the queued
// transformations up to the original source (usually the data-source read), then runs
// the transformations in order down to the action and produces the result.
/*
Transformations listed in the official documentation:
map
filter
flatMap
mapPartitions
mapPartitionsWithIndex
sample
union
intersection
distinct
groupByKey
reduceByKey
aggregateByKey
sortByKey
join
cogroup
cartesian
pipe
coalesce
repartition
repartitionAndSortWithinPartitions

Actions listed in the official documentation:
reduce
collect
count
first
take
takeSample
takeOrdered
saveAsTextFile
saveAsSequenceFile
saveAsObjectFile
countByKey
foreach
*/

/**
 * Demo of `Graph.aggregateMessages`: repeatedly pushes `srcAttr + 1` along every edge and
 * keeps the maximum at each destination until the vertex values stop changing.
 */
object TestAggregateMessages {
  val log: Logger = Logger(this.getClass)

  /**
   * Builds a small five-vertex graph on a local Spark session, runs the propagation and
   * prints the resulting `(vertexId, distance)` pairs.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .master("local[*]")
      .appName("TestGraph")
      .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
      .config("spark.hadoop.validateOutputSpecs", false)
      .getOrCreate()

    // Stop the session even when the job fails, so the local Spark context is not leaked.
    try {
      val sc = sparkSession.sparkContext

      val vertices = sc.makeRDD(Array(
        (1L, "Ann"),
        (2L, "Bill"),
        (3L, "Charles"),
        (4L, "Diane"),
        (5L, "Went to gym this morning")))
      val edges = sc.makeRDD(Array(
        Edge(1L, 2L, "is-friends-with"),
        Edge(2L, 3L, "is-friends-with"),
        Edge(3L, 4L, "is-friends-with"),
        Edge(4L, 5L, "Likes-status"),
        Edge(3L, 5L, "Worse-status")))
      val graph = Graph(vertices, edges)

      // Send a message along each edge: the source's current value plus one.
      def sendMsg(ec: EdgeContext[Int, String, Int]): Unit = {
        ec.sendToDst(ec.srcAttr + 1)
        // Explicit tuple: prints exactly what the auto-tupled multi-argument println printed.
        println(("sendMsg: ", /*ec.srcId, ec.srcAttr,*/ ec.dstId, ec.dstAttr, ec.srcAttr + 1))
      }

      // Merge two messages arriving at the same vertex: keep the larger distance.
      def mergeMsg(a: Int, b: Int): Int = {
        println(("mergeMsg: ", a, b))
        math.max(a, b)
      }

      // When the action runs, every edge sends a message to its destination vertex.
      // The destination merges the received messages and keeps the maximum distance.
      // The recursion stops once the graph no longer changes.
      // Vertex 1 receives no message, so its distance stays 0.
      // Vertex 2 always receives 1 (vertex 1's distance 0, plus 1).
      // Vertex 3 first receives 1 (vertex 2's distance was 0, plus 1) and then 2
      // (vertex 2's distance is now 1, plus 1).
      // Vertices 4 and 5 are computed the same way, until the graph stops changing:
      // (1,0) (2,1) (3,2) (4,3) (5,4)
      //
      // NOTE(review): termination relies on the values eventually stabilising; on a graph
      // with a cycle the values would keep growing and this would not stop.
      @scala.annotation.tailrec
      def propagateEdge(g: Graph[Int, String]): Graph[Int, String] = {
        val vertices = g.aggregateMessages[Int](sendMsg, mergeMsg)
        println("g.vertices && g.edges")
        g.vertices.collect().foreach(println)
        g.edges.collect().foreach(println)

        // Vertices that received no message are absent from `vertices`; give them 0
        // explicitly instead of relying on the implicit null-derived default.
        val g2 = Graph(vertices, g.edges, defaultVertexAttr = 0)
        //g2.vertices.collect().foreach(println)
        //g2.edges.collect().foreach(println)

        // Total increase over all vertices; fold (unlike reduce) tolerates an empty join.
        val check = g2.vertices.join(g.vertices)
          .map(x => x._2._1 - x._2._2)
          .fold(0)(_ + _)
        //g2.vertices.join(g.vertices).collect().foreach(println)

        if (check > 0) {
          println("propagateEdge(g2)")
          propagateEdge(g2)
        } else g
      }

      // Initialise every vertex attribute to 0.
      println("init")
      val initialGraph = graph.mapVertices((_, _) => 0)
      //initialGraph.vertices.collect().foreach(println)
      //initialGraph.edges.collect().foreach(println)

      println("propagateEdge")
      val g = propagateEdge(initialGraph)

      println("println")
      g.vertices.collect().foreach(println)
    } finally {
      sparkSession.stop()
    }
  }
}
spark-TestAggregateMessages-聚合消息
最新推荐文章于 2020-09-03 17:39:10 发布