Graphx ~2:代码案例

三.我的案例

3.1 spark-shell 中的案例

注意:以下三个案例中,创建 Graph 的方式不同

3.1.1 导包

scala> import org.apache.spark.graphx._
scala> import org.apache.spark.graphx.GraphLoader
scala> import org.apache.spark.rdd.RDD

3.1.2 第一个案例

#使用 Seq 直接创建 RDD
scala> val vertices = sc.makeRDD(Seq((1L,1),(2L,1),(3L,1)))
vertices: org.apache.spark.rdd.RDD[(Long, Int)] = ParallelCollectionRDD[175] at makeRDD at <console>:42

scala> val edges = sc.makeRDD(Seq(Edge(1L,2L,1),Edge(2L,1L,3),Edge(2L,3L,2)))
edges: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[177] at makeRDD at <console>:44

#创建图 Graph
scala> val graph = Graph(vertices,edges)
graph: org.apache.spark.graphx.Graph[Int,Int] = org.apache.spark.graphx.impl.GraphImpl@1245174d

#查看点的数据
scala> graph.vertices.collect
res123: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((1,1), (2,1), (3,1))

#查看边的数据
scala> graph.edges.collect
res120: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,2,1), Edge(2,1,3), Edge(2,3,2))

#查看图的数据
scala> graph.triplets.collect
res121: Array[org.apache.spark.graphx.EdgeTriplet[Int,Int]] = Array(((1,1),(2,1),1), ((2,1),(1,1),3), ((2,1),(3,1),2))

3.1.3 第二个案例

#用添加类型的方式,直接用数组来创建 RDD
scala> val users:RDD[(VertexId,(String,String))] = sc.makeRDD(Array((3L,("caicai","boss")),(6L,("luozi","teacher")),(9L,("cailuo","professor"))))
users: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, (String, String))] = ParallelCollectionRDD[196] at makeRDD at <console>:44

scala> val relationships:RDD[Edge[String]]=sc.parallelize(Array(Edge(3L,6L,"collaboretor"),Edge(9L,3L,"couple"),Edge(3L,9L,"customers")))
relationships: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = ParallelCollectionRDD[197] at parallelize at <console>:44

#创建图 Graph
scala> val graphUser = Graph(users,relationships)
graphUser: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@61cdf480

#查看点的数据
scala> graphUser.vertices.collect
res124: Array[(org.apache.spark.graphx.VertexId, (String, String))] = Array((9,(cailuo,professor)), (6,(luozi,teacher)), (3,(caicai,boss)))

#查看边的数据
scala> graphUser.edges.collect
res125: Array[org.apache.spark.graphx.Edge[String]] = Array(Edge(3,6,collaboretor), Edge(9,3,couple), Edge(3,9,customers))

#查看图的数据
scala> graphUser.triplets.collect
res126: Array[org.apache.spark.graphx.EdgeTriplet[(String, String),String]] = Array(((3,(caicai,boss)),(6,(luozi,teacher)),collaboretor), ((9,(cailuo,professor)),(3,(caicai,boss)),couple), ((3,(caicai,boss)),(9,(cailuo,professor)),customers))

3.1.4 第三个案例

#设置顶点和边,注意顶点和边都是用元组定义的 Array
#顶点的数据类型是 VD:(String,Int)
scala> val verterArray=Array(
     | (1L,("ciacia1",26)),
     | (2L,("ciacia2",26)),
     | (3L,("ciacia3",26)),
     | (4L,("ciacia4",26)),
     | (6L,("ciacia6",26)),
     | (7L,("ciacia7",26)),
     | (8L,("ciacia8",26))
     | )
verterArray: Array[(Long, (String, Int))] = Array((1,(ciacia1,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (4,(ciacia4,26)), (6,(ciacia6,26)), (7,(ciacia7,26)), (8,(ciacia8,26)))

#边的数据类型是 ED:Int
scala> val edgeArray = Array(
     | Edge(2L,1L,70),
     | Edge(2L,1L,27),
     | Edge(2L,1L,38),
     | Edge(2L,1L,17),
     | Edge(2L,1L,6),
     | Edge(2L,1L,9),
     | Edge(2L,1L,87),
     | Edge(2L,1L,29)
     | )
edgeArray: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,70), Edge(2,1,27), Edge(2,1,38), Edge(2,1,17), Edge(2,1,6), Edge(2,1,9), Edge(2,1,87), Edge(2,1,29))

#构造 vertexRDD
scala> var vertexRDD = sc.parallelize(verterArray)
vertexRDD: org.apache.spark.rdd.RDD[(Long, (String, Int))] = ParallelCollectionRDD[20] at parallelize at <console>:31

#构造 edgeRDD
scala> val edgeRDD = sc.parallelize(edgeArray)
edgeRDD: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[21] at parallelize at <console>:31

#构造图 Graph(VD,ED)
scala> val graph3 = Graph(vertexRDD,edgeRDD)
graph3: org.apache.spark.graphx.Graph[(String, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@7ad56a58

#查看点的数据
scala> graph3.vertices.collect
res4: Array[(org.apache.spark.graphx.VertexId, (String, Int))] = Array((4,(ciacia4,26)), (8,(ciacia8,26)), (1,(ciacia1,26)), (6,(ciacia6,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (7,(ciacia7,26)))

#查看边的数据
scala> graph3.edges.collect
res5: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,70), Edge(2,1,27), Edge(2,1,38), Edge(2,1,17), Edge(2,1,6), Edge(2,1,9), Edge(2,1,87), Edge(2,1,29))

#查看图的数据
scala> graph3.triplets.collect
res6: Array[org.apache.spark.graphx.EdgeTriplet[(String, Int),Int]] = Array(((2,(ciacia2,26)),(1,(ciacia1,26)),70), ((2,(ciacia2,26)),(1,(ciacia1,26)),27), ((2,(ciacia2,26)),(1,(ciacia1,26)),38), ((2,(ciacia2,26)),(1,(ciacia1,26)),17), ((2,(ciacia2,26)),(1,(ciacia1,26)),6), ((2,(ciacia2,26)),(1,(ciacia1,26)),9), ((2,(ciacia2,26)),(1,(ciacia1,26)),87), ((2,(ciacia2,26)),(1,(ciacia1,26)),29))

#用 foreach 查看图的数据
scala> graph3.triplets.collect.foreach(println)
((2,(ciacia2,26)),(1,(ciacia1,26)),70)
((2,(ciacia2,26)),(1,(ciacia1,26)),27)
((2,(ciacia2,26)),(1,(ciacia1,26)),38)
((2,(ciacia2,26)),(1,(ciacia1,26)),17)
((2,(ciacia2,26)),(1,(ciacia1,26)),6)
((2,(ciacia2,26)),(1,(ciacia1,26)),9)
((2,(ciacia2,26)),(1,(ciacia1,26)),87)
((2,(ciacia2,26)),(1,(ciacia1,26)),29)

//点查询 找出age大于20的点
scala> graph3.vertices.filter(v=>v._2._2>20).collect
res24: Array[(org.apache.spark.graphx.VertexId, (String, Int))] = Array((4,(ciacia4,26)), (8,(ciacia8,26)), (1,(ciacia1,26)), (6,(ciacia6,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (7,(ciacia7,26)))

#找出年龄大于20的点
scala> for(
     | (id,(name,age)) <- graph3.vertices.filter
     | {
     | case(id,(name,age)) => age>20
     | }.collect){
     | println(id,name,age)
     | }
(4,ciacia4,26)
(8,ciacia8,26)
(1,ciacia1,26)
(6,ciacia6,26)
(2,ciacia2,26)
(3,ciacia3,26)
(7,ciacia7,26)

#找出年龄大于20的点,并打印出字符串信息(注意 println 中要加 s )
scala> for(
     |   (id,(name,age)) <- graph3.vertices.filter
     |           {
     |                   case(id,(name,age))=>age>20
     |           }
     |   ){
     |           println(s"$id name is: $name,niianling is $age")
     |   }
1 name is: ciacia1,niianling is 26
6 name is: ciacia6,niianling is 26
2 name is: ciacia2,niianling is 26
4 name is: ciacia4,niianling is 26
8 name is: ciacia8,niianling is 26
3 name is: ciacia3,niianling is 26
7 name is: ciacia7,niianling is 26


#源顶点属性 srcAttr	目标顶点属性 dstAttr	边属性(关系)attr
scala>graph3.triplets.collect.foreach(x=>println(x.srcAttr,x.dstAttr,x.attr))
((ciacia2,26),(ciacia1,26),70)
((ciacia2,26),(ciacia1,26),27)
((ciacia2,26),(ciacia1,26),38)
((ciacia2,26),(ciacia1,26),17)
((ciacia2,26),(ciacia1,26),6)
((ciacia2,26),(ciacia1,26),9)
((ciacia2,26),(ciacia1,26),87)
((ciacia2,26),(ciacia1,26),29)

scala> graph3.triplets.collect.foreach(x=>println(s"${x.srcAttr._1} likes ${x.dstAttr._1},他们的亲密度是${x.attr}"))
ciacia2 likes ciacia1,他们的亲密度是70
ciacia2 likes ciacia1,他们的亲密度是27
ciacia2 likes ciacia1,他们的亲密度是38
ciacia2 likes ciacia1,他们的亲密度是17
ciacia2 likes ciacia1,他们的亲密度是6
ciacia2 likes ciacia1,他们的亲密度是9
ciacia2 likes ciacia1,他们的亲密度是87
ciacia2 likes ciacia1,他们的亲密度是29


#用 for 打印边属性(关系系数)大于6的 triplet
scala> for(t<-graph3.triplets.filter(x=>x.attr>6).collect){println(t)}
((2,(ciacia2,26)),(1,(ciacia1,26)),70)
((2,(ciacia2,26)),(1,(ciacia1,26)),27)
((2,(ciacia2,26)),(1,(ciacia1,26)),38)
((2,(ciacia2,26)),(1,(ciacia1,26)),17)
((2,(ciacia2,26)),(1,(ciacia1,26)),9)
((2,(ciacia2,26)),(1,(ciacia1,26)),87)
((2,(ciacia2,26)),(1,(ciacia1,26)),29)

#用 for 打印字符串信息
scala> for(triplet <- graph3.triplets.filter(x => x.attr>6))
     | {
     |   println( s"${triplet.srcAttr._1} loves ${triplet.dstAttr._1}" )
     | }
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1

3.2 Scala API 中的代码案例

注意:这是一个 IDEA 的Scala工程

3.2.1 导包

import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkContext, SparkConf}

import org.apache.spark.graphx._

import org.apache.spark.rdd.RDD

3.2.2 Scala API 代码

屏蔽日志,设置运行环境

/** Standalone GraphX walkthrough: builds a small property graph and runs a series of queries on it. */
object GraphXExample {

  /** Program entry point; `args` is not read by the code that follows. */
  def main(args: Array[String]): Unit = {

    // Quieten logging: Spark logs at WARN and above, Jetty server logging is switched off.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Runtime environment: application name "SimpleGraphX", master set to "local".
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)

设置顶点和边,注意顶点和边都是用元组定义的Array

顶点的数据类型是VD:(String,Int)

// Vertex table: vertex id -> (name, age).
val vertexArray = Array(
      1L -> ("Alice", 28),
      2L -> ("Bob", 27),
      3L -> ("Charlie", 65),
      4L -> ("David", 42),
      5L -> ("Ed", 55),
      6L -> ("Fran", 50)
    )

边的数据类型ED:Int

// Edge table written as (source id, destination id, Int attribute) and lifted into Edge values.
val edgeArray: Array[Edge[Int]] = Array(
      (2L, 1L, 7),
      (2L, 4L, 2),
      (3L, 2L, 4),
      (3L, 6L, 3),
      (4L, 1L, 1),
      (5L, 2L, 2),
      (5L, 3L, 8),
      (5L, 6L, 3)
    ).map { case (src, dst, weight) => Edge(src, dst, weight) }

构造vertexRDD和edgeRDD

// Distribute the two local arrays as RDDs; element types are given explicitly on the call.
val vertexRDD = sc.parallelize[(Long, (String, Int))](vertexArray)
val edgeRDD = sc.parallelize[Edge[Int]](edgeArray)

构造图Graph[VD,ED]

// Assemble the property graph: vertex attribute is (name, age), edge attribute is an Int.
val graph: Graph[(String, Int), Int] =
  Graph(vertices = vertexRDD, edges = edgeRDD)

图的属性

找出图中年龄大于30的顶点

// Print "<name> is <age>" for every vertex whose age is above 30.
for ((_, (name, age)) <- graph.vertices.filter(v => v._2._2 > 30).collect)
  println(s"$name is $age")

边操作:找出图中属性大于5的边

// Print every edge whose attribute is above 5.
for (edge <- graph.edges.filter(candidate => candidate.attr > 5).collect)
  println(s"${edge.srcId} to ${edge.dstId} att ${edge.attr}")

triplets操作,((srcId, srcAttr), (dstId, dstAttr), attr)
列出边属性>5的triplets:

// For each triplet whose edge attribute is above 5, print "<source name> likes <destination name>".
graph.triplets
  .filter(candidate => candidate.attr > 5)
  .collect
  .foreach(t => println(s"${t.srcAttr._1} likes ${t.dstAttr._1}"))

Degrees操作

// Heading for the degree statistics (the original literal was missing its closing quote and parenthesis).
println("找出图中最大的出度、入度、度数:")

/**
 * Picks the (vertexId, degree) pair with the larger degree.
 *
 * @param a first (vertexId, degree) pair
 * @param b second (vertexId, degree) pair
 * @return `a` when its degree is strictly greater, otherwise `b` (so ties resolve to `b`)
 */
def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) =
  if (a._2 > b._2) a else b

// Reduce each degree RDD with `max` to get the vertex with the highest out-, in- and total degree.
println("max of outDegrees:" + graph.outDegrees.reduce(max) + " max of inDegrees:" + graph.inDegrees.reduce(max) + " max of Degrees:" + graph.degrees.reduce(max))

转换操作

顶点的转换操作,顶点age + 10

// mapVertices keeps each vertex id and replaces only the attribute, so the function must return
// the new attribute alone: (name, age + 10). Returning (id, (name, age + 10)) nested the id into
// the attribute and made the print below show "<id> is (<name>,<age>)".
graph.mapVertices { case (_, (name, age)) => (name, age + 10) }
  .vertices
  .collect
  .foreach { case (_, (name, age)) => println(s"$name is $age") }

边的转换操作,边的属性*2

// Double every edge attribute, then print the resulting edges.
for (edge <- graph.mapEdges(original => original.attr * 2).edges.collect)
  println(s"${edge.srcId} to ${edge.dstId} att ${edge.attr}")

结构操作

顶点年纪>=30的子图

// Subgraph restricted to vertices whose age (second field of the attribute) is at least 30.
val subGraph = graph.subgraph(vpred = (_, person) => person._2 >= 30)

子图所有顶点

// Print "<name> is <age>" for every vertex of the subgraph.
for ((_, (name, age)) <- subGraph.vertices.collect)
  println(s"${name} is ${age}")

子图所有边

// Print every edge of the subgraph.
for (edge <- subGraph.edges.collect)
  println(s"${edge.srcId} to ${edge.dstId} att ${edge.attr}")

连接操作

// In-degrees of `graph` as a VertexRDD keyed by vertex id.
val inDegrees: VertexRDD[Int] = graph.inDegrees

// Vertex payload for the join example: name and age plus in-degree and out-degree fields.
case class User(name: String, age: Int, inDeg: Int, outDeg: Int)

创建一个新图,顶点VD的数据类型为User,并从graph做类型转换

// Same graph with each (name, age) attribute converted to a User whose degree fields start at 0.
val initialUserGraph: Graph[User, Int] =
  graph.mapVertices((_, person) => User(person._1, person._2, inDeg = 0, outDeg = 0))

initialUserGraph与inDegrees、outDegrees(RDD)进行连接,并修改initialUserGraph中inDeg值、outDeg值

  // Fill in inDeg, then outDeg, by joining with the degree RDDs; a vertex missing from a
  // degree RDD gets 0 for that field.
  val userGraph = initialUserGraph
    .outerJoinVertices(initialUserGraph.inDegrees) { (_, user, inDegOpt) =>
      user.copy(inDeg = inDegOpt.getOrElse(0))
    }
    .outerJoinVertices(initialUserGraph.outDegrees) { (_, user, outDegOpt) =>
      user.copy(outDeg = outDegOpt.getOrElse(0))
    }

连接图的属性:

// Print each user's name with the joined in-degree and out-degree.
for ((_, user) <- userGraph.vertices.collect)
  println(s"${user.name} inDeg: ${user.inDeg}  outDeg: ${user.outDeg}")

出度和入度相同的人员

// Print the name of every user whose in-degree equals their out-degree.
for ((_, user) <- userGraph.vertices.filter(v => v._2.inDeg == v._2.outDeg).collect)
  println(user.name)

聚合操作

找出年纪最大的追求者:

// For every vertex that has incoming edges, find the oldest source vertex ("follower").
// Uses aggregateMessages in place of mapReduceTriplets, which was deprecated in Spark 1.2
// and removed in Spark 2.0.
val oldestFollower: VertexRDD[(String, Int)] = userGraph.aggregateMessages[(String, Int)](
      // Send phase: each edge sends its source vertex's (name, age) to the destination vertex.
      ctx => ctx.sendToDst((ctx.srcAttr.name, ctx.srcAttr.age)),
      // Merge phase: keep the message with the larger age (on a tie the second one wins).
      (a, b) => if (a._2 > b._2) a else b
    )

 

    // Left-join every user with their oldest follower (if any) and print one sentence per user.
    userGraph.vertices.leftJoin(oldestFollower) { (_, user, followerOpt) =>
      followerOpt
        .map { case (name, _) => s"${name} is the oldest follower of ${user.name}." }
        .getOrElse(s"${user.name} does not have any followers.")
    }.collect.foreach { case (_, sentence) => println(sentence) }

实用操作

Pregel 操作:找出顶点5到各顶点的最短距离

 val sourceId: VertexId = 5L // source vertex of the shortest-distance computation

    // Every vertex starts at +infinity except the source, which starts at 0.0.
    val initialGraph = graph.mapVertices { (vid, _) =>
      if (vid == sourceId) 0.0 else Double.PositiveInfinity
    }

    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      // Vertex program: keep the smaller of the stored distance and the incoming one.
      (_, current, incoming) => math.min(current, incoming),
      // Send phase: offer the destination a distance via this edge only if it improves on
      // what the destination already holds; the edge attribute is used as the weight.
      edge => {
        val candidate = edge.srcAttr + edge.attr
        if (candidate < edge.dstAttr) Iterator((edge.dstId, candidate)) else Iterator.empty
      },
      // Merge phase: of two offers to the same vertex, keep the shorter.
      (x, y) => math.min(x, y)
    )

    // One "(vertexId,distance)" line per vertex, then shut the context down.
    println(sssp.vertices.collect.mkString("\n"))
    sc.stop()

  }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值