更多代码请见:https://github.com/xubo245/SparkLearning
1.解释
计算图的连通分量(connected components)
源码:
object ConnectedComponents {

  /**
   * Labels every vertex with the smallest vertex id found in its connected component.
   *
   * Each vertex starts out labelled with its own id. Labels are then propagated with
   * Pregel: across every edge the smaller of the two endpoint labels is sent to the
   * endpoint holding the larger one, and a vertex keeps the minimum of its current
   * label and the messages it receives.
   *
   * @tparam VD original vertex attribute type (replaced by the component label)
   * @tparam ED edge attribute type (kept as the edge type of the result)
   *
   * @param graph the graph whose connected components are computed
   *
   * @return a graph whose vertex attribute is the smallest vertex id of the
   *         connected component containing that vertex
   */
  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexId, ED] = {
    // Seed: every vertex is labelled with its own id; the original attribute is dropped.
    val labelled = graph.mapVertices((vertexId, _) => vertexId)

    // Push the smaller label towards the endpoint holding the larger one;
    // an edge whose endpoints already agree sends nothing.
    def propagateSmaller(triplet: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, VertexId)] = {
      val srcLabel = triplet.srcAttr
      val dstLabel = triplet.dstAttr
      if (srcLabel == dstLabel) Iterator.empty
      else if (srcLabel < dstLabel) Iterator((triplet.dstId, srcLabel))
      else Iterator((triplet.srcId, dstLabel))
    }

    // Long.MaxValue is the initial message: taking the minimum with it leaves
    // every starting label unchanged.
    Pregel(labelled, Long.MaxValue, activeDirection = EdgeDirection.Either)(
      vprog = (_, current, incoming) => math.min(current, incoming),
      sendMsg = propagateSmaller,
      mergeMsg = (left, right) => math.min(left, right))
  } // end of connectedComponents
}
返回的图中,每个顶点的属性值是它所在连通分量中最小的顶点id
2.代码:
/**
* @author xubo
* ref http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
* time 20160503
*/
package org.apache.spark.graphx.learning
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.graphx.VertexId
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.GraphLoader
object ConnectedComponents {

  /**
   * Demo driver: loads the follower graph, computes its connected components and
   * prints the graph, the users, and each user's component id.
   *
   * Reads two local files:
   *  - `file/data/graphx/input/followers.txt`: edge list loaded by `GraphLoader.edgeListFile`
   *  - `file/data/graphx/input/users.txt`: comma-separated lines whose first field is the
   *    numeric vertex id and whose second field is the username
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ConnectedComponents").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // Always release the SparkContext, even when loading or the job itself fails.
    try {
      // Load the graph as in the PageRank example
      val graph = GraphLoader.edgeListFile(sc, "file/data/graphx/input/followers.txt")

      // Find the connected components: (vertexId, smallest vertex id in its component)
      val cc = graph.connectedComponents().vertices

      // Parse the users as (vertexId, username)
      val users = sc.textFile("file/data/graphx/input/users.txt").map { line =>
        val fields = line.split(",")
        (fields(0).toLong, fields(1))
      }

      // Join the connected components with the usernames. The inner join drops users
      // without a matching vertex in the graph.
      val ccByUsername = users.join(cc).map {
        case (_, (username, componentId)) => (username, componentId)
      }

      // Print the result
      println("\ngraph edges")
      println("edges:")
      graph.edges.collect().foreach(println)
      println("vertices:")
      graph.vertices.collect().foreach(println)
      println("triplets:")
      graph.triplets.collect().foreach(println)
      println("\nusers")
      users.collect().foreach(println)
      println("\ncc:")
      cc.collect().foreach(println)
      println("\nccByUsername")
      println(ccByUsername.collect().mkString("\n"))
    } finally {
      sc.stop()
    }
  }
}
文件同【3】
3.结果:
graph edges
edges:
Edge(1,2,1)
Edge(2,1,1)
Edge(4,1,1)
Edge(6,3,1)
Edge(7,3,1)
Edge(3,7,1)
Edge(6,7,1)
Edge(7,6,1)
Edge(1,2,1)
Edge(2,1,1)
Edge(4,1,1)
Edge(6,3,1)
Edge(7,3,1)
Edge(3,7,1)
Edge(6,7,1)
Edge(7,6,1)
vertices:
(4,1)
(6,1)
(2,1)
(1,1)
(3,1)
(7,1)
triplets:
((1,1),(2,1),1)
((2,1),(1,1),1)
((4,1),(1,1),1)
((6,1),(3,1),1)
((7,1),(3,1),1)
((3,1),(7,1),1)
((6,1),(7,1),1)
((7,1),(6,1),1)
users
(1,BarackObama)
(2,ladygaga)
(3,jeresig)
(4,justinbieber)
(6,matei_zaharia)
(7,odersky)
(8,anonsys)
cc:
(4,1)
(6,3)
(2,1)
(1,1)
(3,3)
(7,3)
ccByUsername
(justinbieber,1)
(matei_zaharia,3)
(ladygaga,1)
(BarackObama,1)
(jeresig,3)
(odersky,3)
在followers.txt中加上一条边"3 2"(把原来的两个连通分量连接起来)后的结果:
graph edges
edges:
Edge(1,2,1)
Edge(2,1,1)
Edge(4,1,1)
Edge(6,3,1)
Edge(7,3,1)
Edge(3,2,1)
Edge(3,7,1)
Edge(6,7,1)
Edge(7,6,1)
vertices:
(4,1)
(6,1)
(2,1)
(1,1)
(3,1)
(7,1)
triplets:
((1,1),(2,1),1)
((2,1),(1,1),1)
((4,1),(1,1),1)
((6,1),(3,1),1)
((7,1),(3,1),1)
((3,1),(2,1),1)
((3,1),(7,1),1)
((6,1),(7,1),1)
((7,1),(6,1),1)
users
(1,BarackObama)
(2,ladygaga)
(3,jeresig)
(4,justinbieber)
(6,matei_zaharia)
(7,odersky)
(8,anonsys)
cc:
(4,1)
(6,1)
(2,1)
(1,1)
(3,1)
(7,1)
ccByUsername
(justinbieber,1)
(matei_zaharia,1)
(ladygaga,1)
(BarackObama,1)
(jeresig,1)
(odersky,1)
不同cc的值在:
def sendMessage(edge: EdgeTriplet[VertexId, ED]): Iterator[(VertexId, VertexId)] = {
if (edge.srcAttr < edge.dstAttr) {
Iterator((edge.dstId, edge.srcAttr))
} else if (edge.srcAttr > edge.dstAttr) {
Iterator((edge.srcId, edge.dstAttr))
} else {
Iterator.empty
}
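中由于属性值都是1,不应该返回empty吗?不懂
解答:不会。run 方法的第一步 graph.mapVertices { case (vid, _) => vid } 已经把每个顶点的属性替换成它自己的顶点id(得到 ccGraph),Pregel 是在 ccGraph 上运行的。所以 sendMessage 中比较的 srcAttr/dstAttr 是顶点id(迭代过程中是当前已知的最小id),而不是上面 vertices 输出中的属性值1。只有当一条边两端的值已经相等时才返回 Iterator.empty。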
参考
【1】 http://spark.apache.org/docs/1.5.2/graphx-programming-guide.html
【2】https://github.com/xubo245/SparkLearning
【3】http://blog.csdn.net/xubo245/article/details/51315240