图计算入门级程序
先看这个图,3和7之间有关系,5和3之间有关系,5和7之间有关系,1和7之间有关系,2和5之间有关系,所以这就像是一个关系网,他们两两之间可能都有关系
绿色的线,8和9有关系,10和9有关系,那么可能是8和10之间也存在关系
一个简单的例子
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object graphXDemo03 {

  /**
   * Introductory GraphX demo: build a small property graph, compute its
   * connected components, and group every vertex under the smallest
   * vertex id of its component.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    val conf = new SparkConf().setAppName(s"${this.getClass.getName}").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Vertices: (vertexId, (name, occupation))
      val user = sc.parallelize(Array((3L,("huahua","student")),
        (7L,("nini","postdoc")),(5L,("fafa","prof")),
        (2L,("lili","prof")),(1L,("minmi","prof")),(9L,("kiki","prof")),(8L,("bobo","prof")),(10L,("kuku","prof"))) )

      // Edges: who is related to whom, labelled with the kind of relationship.
      val relationships = sc.parallelize(Array(Edge(3L,7L,"collab"),
        Edge(5L,3L,"advisor"),Edge(2L,5L,"colleague"),Edge(5L,7L,"teacher"),Edge(1L,7L,"teacher"),Edge(8L,9L,"colleague"),Edge(10L,9L,"advisor")))

      // Assemble the property graph from the vertex and edge RDDs.
      val graph = Graph(user, relationships)

      // connectedComponents() labels every vertex with the lowest vertex id
      // in its connected component, yielding pairs (vertexId, componentId).
      val conn: VertexRDD[VertexId] = graph.connectedComponents().vertices
      conn.foreach(println)
      /*
      (1,1)
      (3,1)
      (7,1)
      (9,8)
      (8,8)
      (10,8)
      (5,1)
      (2,1)
      */

      // Join the component labels back onto the vertex attributes:
      // (vertexId, (componentId, (name, occupation))),
      // e.g. (3,(1,(huahua,student))).
      val joined: RDD[(VertexId, (VertexId, (String, String)))] = conn.join(user)
      joined.foreach(println)
      /* expected output of the join:
      (1,(1,(minmi,prof)))
      (3,(1,(huahua,student)))
      (7,(1,(nini,postdoc)))
      (9,(8,(kiki,prof)))
      (8,(8,(bobo,prof)))
      (10,(8,(kuku,prof)))
      (5,(1,(fafa,prof)))
      (2,(1,(lili,prof)))
      */

      // Re-key by component id and concatenate, collecting all members of
      // each component into a single List.
      joined.map {
        case (userId, (commId, value)) => (commId, List((userId, value)))
      }.reduceByKey(_ ++ _).foreach(println)
      /*
      (1,List((1,(minmi,prof)), (3,(huahua,student)), (7,(nini,postdoc)), (5,(fafa,prof)), (2,(lili,prof))))
      (8,List((9,(kiki,prof)), (8,(bobo,prof)), (10,(kuku,prof))))
      */
    } finally {
      // Always release the SparkContext, even if a job above fails.
      sc.stop()
    }
  }
}