这里是一些aggregateMessages的基本应用和代码的详细解释,还有一个老师布置的章节作业(在他的代码基础上对算法进行优化)
目录
aggregateMessages聚合+ Join计算GraphX中每个节点的入度;
先写一个简单代码解释一下aggregateMessages聚合的作用吧;
这是 IntelliJ IDEA 中aggregateMessages的方法签名:注意一下参数的传入和结果的形式就可以很好地应用了。
def aggregateMessages[A](
sendMsg : scala.Function1[org.apache.spark.graphx.EdgeContext[VD, ED, A], scala.Unit],
mergeMsg : scala.Function2[A, A, A],
tripletFields : org.apache.spark.graphx.TripletFields = { /* compiled code */ }
)
(implicit evidence$11 : scala.reflect.ClassTag[A]) : org.apache.spark.graphx.VertexRDD[A] = { /* compiled code */ }
aggregateMessages聚合+ Join计算GraphX中每个节点的入度;
package aggregate
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo1 {
  def main(args: Array[String]): Unit = {
    // Set up a local Spark environment.
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Build the graph: five named vertices and five directed edges.
    val myVertices = sc.parallelize(Array((1L, "张三"), (2L, "李四"), (3L, "王五"), (4L, "钱六"),
      (5L, "领导")))
    val myEdges = sc.makeRDD(Array( Edge(1L,2L,"朋友"),
      Edge(2L,3L,"朋友") , Edge(3L,4L,"朋友"),
      Edge(4L,5L,"上下级"),Edge(3L,5L,"上下级")
    ))
    val myGraph = Graph(myVertices,myEdges)
    // myGraph.inDegrees
    // The in-degree of a vertex is the number of edges pointing AT it, so each
    // edge must send its "1" to the DESTINATION vertex. The original code used
    // sendToSrc(1), which computes the out-degree instead — fixed here to match
    // the stated goal (and the built-in myGraph.inDegrees above).
    val vertices: VertexRDD[Int] = myGraph.aggregateMessages[Int](_.sendToDst(1), _ + _)
    vertices.collect.foreach(println(_))
    println("***********************************")
    // Join the per-vertex counts back with the vertex names: (id, (count, name)).
    vertices.join(myGraph.vertices).collect.foreach(println(_))
    println("***********************************")
    // Swap (count, name) -> (name, count) for friendlier output.
    vertices.join(myGraph.vertices).map((_._2.swap)).collect.foreach( println(_) )
  }
}
package com.lightfall.graphx.Apps
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
object MapGraphX4 {
  def main(args: Array[String]): Unit = {
    // Spark context running locally on all available cores.
    val sparkConf = new SparkConf().setAppName("base").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")

    // Vertex RDD: (vertex id, age).
    val userVertices: RDD[(VertexId, Double)] = sc.parallelize(Array(
      (1L, 20.0), (2L, 27.0), (3L, 65.0),
      (4L, 42.0), (5L, 55.0), (6L, 30.0)
    ))

    // Edge RDD: directed relationships carrying an integer weight.
    val relationshipEdges: RDD[Edge[Int]] = sc.parallelize(Array(
      Edge(2L, 1L, 7), Edge(2L, 4L, 2),
      Edge(3L, 2L, 4), Edge(3L, 6L, 3),
      Edge(4L, 1L, 1), Edge(5L, 2L, 2),
      Edge(5L, 3L, 8), Edge(5L, 6L, 3)
    ))

    // Default attribute used for any vertex that appears only in the edge list.
    val defaultUserAge = 0.0
    val graph = Graph(userVertices, relationshipEdges, defaultUserAge)

    // Example: gather follower statistics.
    // sendMsg: when dst's age exceeds src's age, src counts as a follower,
    // so ship (1, src's age) to the destination vertex.
    def shipFollowerAge(ctx: EdgeContext[Double, Int, (Int, Double)]): Unit =
      if (ctx.dstAttr > ctx.srcAttr) ctx.sendToDst((1, ctx.srcAttr))

    // Per vertex: (number of followers, sum of their ages).
    val olderFollowers: VertexRDD[(Int, Double)] =
      graph.aggregateMessages[(Int, Double)](
        shipFollowerAge,
        (left, right) => (left._1 + right._1, left._2 + right._2)
      )

    // Mean follower age per vertex (the vertex id is dropped, as before).
    // Safe division: a vertex only appears here if it received >= 1 message.
    val averageFollowerAge = olderFollowers.map { case (_, (count, ageSum)) => ageSum / count }
    averageFollowerAge.collect.foreach(println)
    sc.stop()
  }
}
复杂度比较高的Join方法:
package aggregate
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId, VertexRDD}
/**
 * Graph aggregation example: compute each vertex's distance (in hops)
 * from the root vertex.
 * g2.vertices.join(g.vertices) pairs the new distances with the old ones;
 * when sum(new - old) == 0 nothing changed and the recursion terminates.
 */
object test {
  // Each edge proposes "my source's distance + 1" to its destination vertex.
  def sendMsg(ec: EdgeContext[Int, String, Int]): Unit =
    ec.sendToDst(ec.srcAttr + 1)

  // A vertex keeps the largest distance proposed to it.
  def mergeMsg(a: Int, b: Int): Int = math.max(a, b)

  def sumEdgeCount(g: Graph[Int, String]): Graph[Int, String] = {
    val updated: VertexRDD[Int] = g.aggregateMessages[Int](sendMsg, mergeMsg)
    val next = Graph(updated, g.edges)
    updated.collect.foreach(println(_))
    println()
    next.vertices.join(g.vertices).collect.foreach(println(_))
    // Expensive part: a full join of the old and new vertex sets every round,
    // just to decide whether any distance still changed.
    val delta = next.vertices.join(g.vertices).map(pair => pair._2._1 - pair._2._2).reduce(_ + _)
    if (delta > 0)
      sumEdgeCount(next)
    else
      g
  }

  def main(args: Array[String]): Unit = {
    // Set up a local Spark environment.
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Build the sample graph.
    val myVertices = sc.parallelize(
      Array(
        (1L, "张三"),
        (2L, "李四"),
        (3L, "王五"),
        (4L, "钱六"),
        (5L, "领导")
      ))
    val myEdges = sc.makeRDD(
      Array(
        Edge(1L,2L,"朋友"),
        Edge(2L,3L,"朋友"),
        Edge(3L,4L,"朋友"),
        Edge(4L,5L,"上下级"),
        Edge(3L,5L,"上下级")
      ))
    val myGraph = Graph(myVertices, myEdges)
    // mapVertices resets every vertex attribute to 0 — the initial distance.
    val initGraph = myGraph.mapVertices((_, _) => 0)
    sumEdgeCount(initGraph).vertices.collect.foreach(println(_))
  }
}
代码中每次递归遍历Graph都要做一次Join操作,性能消耗比较大。
章节作业优化代码:
import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object MapGraphX4 {
  // sendMsg for aggregateMessages: propose "source distance + 1" to the dst vertex.
  def sendMsg(ec: EdgeContext[Int, String, Int]): Unit =
    ec.sendToDst(ec.srcAttr + 1)

  // mergeMsg: of two competing proposals, a vertex keeps the larger distance.
  def mergeMsg(a: Int, b: Int): Int = math.max(a, b)

  def sumEdgeCount(g: Graph[Int, String], Num: Int): Graph[Int, String] = {
    val updated: VertexRDD[Int] = g.aggregateMessages[Int](sendMsg, mergeMsg)
    val next = Graph(updated, g.edges)
    /* Idea 1: collect the old and new vertex sets into lists and stop
       when they are equal (record == newRecord). */
    /* Idea 2: track the current maximum depth with Num and stop as soon
       as the maximum no longer grows. */
    // Idea 2 is used below; it replaces the per-round join of the previous
    // version with a single max() over the new vertex attributes.
    val deepest: Int = next.vertices.map(_._2).max()
    if (deepest > Num)
      sumEdgeCount(next, Num + 1)
    else
      g
  }

  def main(args: Array[String]): Unit = {
    // Set up a local Spark environment.
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // Build the sample graph.
    val myVertices: RDD[(VertexId, String)] =
      sc.parallelize(Array(
        (1L, "张三"),
        (2L, "李四"),
        (3L, "王五"),
        (4L, "钱六"),
        (5L, "领导")
      ))
    val myEdges: RDD[Edge[String]] =
      sc.parallelize(Array(
        Edge(1L, 2L, "朋友"),
        Edge(2L, 3L, "朋友"),
        Edge(3L, 4L, "朋友"),
        Edge(4L, 5L, "上下级"),
        Edge(3L, 5L, "上下级")
      ))
    val myGraph = Graph(myVertices, myEdges)
    // Initialise every vertex attribute to depth 0.
    val initGraph = myGraph.mapVertices((_, _) => 0)
    // Run the fixpoint starting from depth 0 and print the final distances.
    sumEdgeCount(initGraph, 0).vertices.collect.foreach(println(_))
  }
}
这里的两个思路,写起来都是比较容易的就直接写了注释,两个代码整合到一起了。