spark GraphX 常用算子介绍

ddf482

已于 2023-04-26 17:45:31 修改

阅读量171

点赞数

文章标签： spark scala

于 2023-04-26 17:44:16 首次发布

本文链接：https://blog.csdn.net/zzf135/article/details/130379548

版权

GraphX 常用算子介绍

参考官网文档：https://spark.apache.org/docs/latest/graphx-programming-guide.html

构建图，查看图信息

package zzf

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object graphx01_graphProperty {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession= SparkSession.builder().master("local[*]").appName("graphx01").getOrCreate()
    val sc = spark.sparkContext

    // Create an RDD for the vertices
    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Seq((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Seq(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)

    println("vertices..............")
    graph.vertices.collect.foreach(println)

    println("edges...............")
    graph.edges.collect.foreach(println)

    println("triplets............")
    graph.triplets.collect.foreach(println)
    
  }

}

执行效果：

vertices..............
(2,(istoica,prof))
(3,(rxin,student))
(5,(franklin,prof))
(7,(jgonzal,postdoc))
edges...............
Edge(3,7,collab)
Edge(5,3,advisor)
Edge(2,5,colleague)
Edge(5,7,pi)
triplets............
((3,(rxin,student)),(7,(jgonzal,postdoc)),collab)
((5,(franklin,prof)),(3,(rxin,student)),advisor)
((2,(istoica,prof)),(5,(franklin,prof)),colleague)
((5,(franklin,prof)),(7,(jgonzal,postdoc)),pi)

边过滤，点过滤

    println("vertices filter...........")
    graph.vertices.filter { case (id, (name, pos)) => pos == "postdoc" }.collect.foreach(println)
    println(graph.vertices.filter { case (id, (name, pos)) => pos == "postdoc" }.count)
    println("edge filter...........")
    graph.edges.filter(e => e.srcId > e.dstId).collect.foreach(println)
    println(graph.edges.filter(e => e.srcId > e.dstId).count)

运行效果

vertices filter...........
(7,(jgonzal,postdoc))
1
edge filter...........
Edge(5,3,advisor)
1

三元组属性值

    val facts: RDD[String] =
      graph.triplets.map(triplet =>
        triplet.srcAttr._1 + " is the " + triplet.attr + " of " + triplet.dstAttr._1)
    facts.collect.foreach(println(_))

运行效果

在这里插入图片描述

出度入度

    println("numedges: "+ graph.numEdges)
    println("numVertices: "+ graph.numVertices)
    println("入度，按目标顶点id分组就和")
    graph.inDegrees.collect.foreach(println)
    println("出度，按源顶点id分组就和")
    graph.outDegrees.collect.foreach(println)
    println("出度入度合并")
    graph.degrees.collect.foreach(println)

运行效果

numedges: 4
numVertices: 4
入度，按目标顶点id分组就和
(3,1)
(5,1)
(7,2)
出度，按源顶点id分组就和
(2,1)
(3,1)
(5,2)
度数，顶点id分组就和
(2,1)
(3,2)
(5,3)
(7,2)

outerjoinvertices

顶点属性有多个时，用元组表示，按照元组索引来取值

    val inputGraph: Graph[Int, String] =
      graph.outerJoinVertices(graph.outDegrees)((vid, _, degOpt) => degOpt.getOrElse(10))
    inputGraph.triplets.collect.foreach(println)

// 等价
    val inputGraph: Graph[Int, String] =
      graph.outerJoinVertices(graph.outDegrees)((vid, _, degOpt) =>
        degOpt match {
          case Some(opt) => opt
          case None => 10
        }
      )
// 加入其他的判断条件
    val inputGraph: Graph[Int, String] =
      graph.outerJoinVertices(graph.outDegrees)((vid, attr, degOpt) =>
        degOpt match {
          case Some(opt) => if (attr._1 == "rxin") {
            1000
          }else {
            opt
          }
          case None => 10
        }
      )

// 给定点新增属性
    val inputGraph = graph.outerJoinVertices(graph.outDegrees) {
      case (vid, (name, ident), Some(degree)) => (name, ident, degree)
      case (vid, (name, ident), None) => (name, ident, 0)
    }

运行效果：
在这里插入图片描述

subgraph

package zzf

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object graphx01_graphProperty {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession= SparkSession.builder().master("local[*]").appName("graphx01").getOrCreate()
    val sc = spark.sparkContext

    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Seq((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof")),
        (4L, ("peter", "student"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Seq(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"),
        Edge(4L, 0L, "student"),   Edge(5L, 0L, "colleague")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    // Notice that there is a user 0 (for which we have no information) connected to users
    // 4 (peter) and 5 (franklin).
    graph.triplets.collect.foreach(println(_))
    println("=================")
    // Remove missing vertices as well as the edges to connected to them
    val validGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing")
    // The valid subgraph will disconnect users 4 and 5 by removing user 0
    validGraph.triplets.collect.foreach(println(_))



  }

}

在这里插入图片描述

同时过滤点和边

连通图

package zzf

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object graphx01_graphProperty {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession= SparkSession.builder().master("local[*]").appName("graphx01").getOrCreate()
    val sc = spark.sparkContext

    val users: RDD[(VertexId, (String, String))] =
      sc.parallelize(Seq((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "prof")), (2L, ("istoica", "prof")),(10L, ("istoica10", "prof10")),(11L, ("istoica11", "prof11")),
        (4L, ("peter", "student"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Seq(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
        Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi"),Edge(10L, 11L, "pi11"),
        Edge(4L, 0L, "student"),   Edge(5L, 0L, "colleague")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    println("graph...........")
    graph.triplets.collect.foreach(println)
    // Notice that there is a user 0 (for which we have no information) connected to users
    // 4 (peter) and 5 (franklin).
    // Run Connected Components
    val ccGraph = graph.connectedComponents() // No longer contains missing field
    println("ccGraph..............")
    ccGraph.triplets.collect.foreach(println)
    // Remove missing vertices as well as the edges to connected to them
    val validGraph = graph.subgraph(vpred = (id, attr) => attr._2 != "Missing")
    // Restrict the answer to the valid subgraph
    val validCCGraph = ccGraph.mask(validGraph)

  }

}

运行结果：

((5,(franklin,prof)),(3,(rxin,student)),advisor)
((2,(istoica,prof)),(5,(franklin,prof)),colleague)
((5,(franklin,prof)),(7,(jgonzal,postdoc)),pi)
((10,(istoica10,prof10)),(11,(istoica11,prof11)),pi11)
((4,(peter,student)),(0,(John Doe,Missing)),student)
((5,(franklin,prof)),(0,(John Doe,Missing)),colleague)
ccGraph..............
((3,0),(7,0),collab)
((5,0),(3,0),advisor)
((2,0),(5,0),colleague)
((5,0),(7,0),pi)
((10,10),(11,10),pi11)
((4,0),(0,0),student)
((5,0),(0,0),colleague)

Graphx用图中顶点的id来标识节点所属的连通体，同一个连通体的编号是采用该联通体中最小的节点id来标识的。
在这里插入图片描述

aggregateMessages

用来对图中的顶点进行聚合计算，他会从每条边发出一条消息给变得源顶点和目标顶点，然后用一个函数来合并每个顶点收到的所有消息，最后返回一个新的VertexRDD，包含每个顶点的聚合结果。
入度

    graph.aggregateMessages[Int](
      _.sendToDst(1),
      _+_
    ).collect().foreach(println)

运行效果

graph...........
((3,(rxin,student)),(7,(jgonzal,postdoc)),collab)
((5,(franklin,prof)),(3,(rxin,student)),advisor)
((2,(istoica,prof)),(5,(franklin,prof)),colleague)
((5,(franklin,prof)),(7,(jgonzal,postdoc)),pi)
((10,(istoica10,prof10)),(11,(istoica11,prof11)),pi11)
((4,(peter,student)),(0,(John Doe,Missing)),student)
((5,(franklin,prof)),(0,(John Doe,Missing)),colleague)
(0,2)
(11,1)
(3,1)
(5,1)
(7,2)

计算每个顶点相邻的点，相邻即可，忽略方向

    graph.aggregateMessages[List[Long]](
      triplet => {
        triplet.sendToSrc(List(triplet.dstId))
        triplet.sendToDst(List(triplet.srcId))
      },
      _++_
    ).collect().foreach(println)

运行效果
在这里插入图片描述

发送给源顶点

    graph.aggregateMessages[List[Long]](
      triplet => {
        triplet.sendToSrc(List(triplet.dstId))
      },
      _++_
    ).collect().foreach(println)

运行效果：
在这里插入图片描述
发送目标顶点的属性值给源顶点

ddf482

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
spark GraphX 常用算子介绍

构建图，查看图信息边过滤，点过滤三元组属性值出度入度subgraph连通图参考官网文档：https://spark.apache.org/docs/latest/graphx-programming-guide.html。
复制链接

扫一扫