转载：GraphX 之 ConnectedComponents（连通分量）
首先准备数据
people.csv内容如下
4,Dave,25
6,Faith,21
8,Harvey,47
2,Bob,18
1,Alice,20
3,Charlie,30
7,George,34
9,Ivy,21
5,Eve,30
10,Lily,35
11,Helen,35
12,Ann,35
links.csv内容如下
1,2,friend
1,3,sister
2,4,brother
3,2,boss
4,5,client
1,9,friend
6,7,cousin
7,9,coworker
8,9,father
10,11,colleague
10,12,colleague
11,12,colleague
代码
package nj.zb.kb09.graph
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Demonstrates GraphX `connectedComponents` on a small people/links graph.
 *
 * Reads vertices from `in/people.csv` (`id,name,age`) and edges from
 * `in/links.csv` (`srcId,dstId,relation`), labels every vertex with the
 * smallest vertex id found in its connected component, and prints the
 * triplets of each component as a separate subgraph.
 */
object ConnectComponentDemo {

  /** Vertex attribute: a person's name and age. */
  case class Person(name: String, age: Int)

  /**
   * Entry point: builds the graph, computes connected components and prints
   * first the `(vertexId, componentId)` pairs, then one line of triplets per
   * component.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("pregel")
      .master("local[*]")
      .getOrCreate()

    try {
      val sc: SparkContext = spark.sparkContext

      // Blank lines (e.g. a trailing newline) are skipped so they do not
      // blow up the numeric parsing below.
      val peopleRDD: RDD[(VertexId, Person)] = sc.textFile("in/people.csv")
        .filter(_.trim.nonEmpty)
        .map(_.split(","))
        .map(cols => (cols(0).toLong, Person(cols(1), cols(2).toInt)))

      // Edge endpoints are parsed as Long, the same way as the vertex ids,
      // so ids beyond Int.MaxValue behave consistently in both files.
      val linksRDD: RDD[Edge[String]] = sc.textFile("in/links.csv")
        .filter(_.trim.nonEmpty)
        .map { line =>
          val cols = line.split(",")
          Edge(cols(0).toLong, cols(1).toLong, cols(2))
        }

      val graph: Graph[Person, String] = Graph(peopleRDD, linksRDD)

      // Each vertex attribute becomes the smallest vertex id of its component.
      val mincc: Graph[VertexId, String] = graph.connectedComponents()
      mincc.cache() // reused by several actions below
      mincc.vertices.collect().foreach(println)

      // Attach name/age back onto the component-labelled vertices. A vertex
      // that only appears in links.csv has no Person record; it gets a
      // placeholder instead of failing on Option.get.
      val newGraph: Graph[(VertexId, String, Int), String] =
        mincc.outerJoinVertices(peopleRDD) { (_, componentId, personOpt) =>
          personOpt
            .map(person => (componentId, person.name, person.age))
            .getOrElse((componentId, "unknown", -1))
        }
      newGraph.cache() // filtered once per component below

      // De-duplicate on the cluster before collecting; sort so the output
      // order of the components is deterministic.
      val componentIds: Array[VertexId] =
        mincc.vertices.map(_._2).distinct().collect().sorted

      componentIds.foreach { componentId =>
        val sub: Graph[(VertexId, String, Int), String] =
          newGraph.subgraph(vpred = (_, attr) => attr._1 == componentId)
        println(sub.triplets.collect().mkString(","))
      }
    } finally {
      // Release the local Spark resources even if the job fails.
      spark.stop()
    }
  }
}
//输出
(4,1)
(6,1)
(8,1)
(12,10)
(10,10)
(2,1)
(11,10)
(1,1)
(3,1)
(7,1)
(9,1)
(5,1)
((1,(1,Alice,20)),(2,(1,Bob,18)),friend),((1,(1,Alice,20)),(3,(1,Charlie,30)),sister),((1,(1,Alice,20)),(9,(1,Ivy,21)),friend),((2,(1,Bob,18)),(4,(1,Dave,25)),brother),((3,(1,Charlie,30)),(2,(1,Bob,18)),boss),((4,(1,Dave,25)),(5,(1,Eve,30)),client),((6,(1,Faith,21)),(7,(1,George,34)),cousin),((7,(1,George,34)),(9,(1,Ivy,21)),coworker),((8,(1,Harvey,47)),(9,(1,Ivy,21)),father)
((10,(10,Lily,35)),(11,(10,Helen,35)),colleague),((10,(10,Lily,35)),(12,(10,Ann,35)),colleague),((11,(10,Helen,35)),(12,(10,Ann,35)),colleague)
从结果中可以看到通过计算之后的图,每个顶点多了一个属性,这个属性表示的就是这个顶点所在的连通图中的最小顶点id。例如顶点11所在的连通图中的最小顶点id是10,顶点4所在的连通图中的最小顶点id是1
经过connectedComponents得到的结果,可以知道哪些顶点在一个连通图中,这样就可以将一个大图拆分成若干个连通子图,结果为2个子图