GraphX
三.我的案例
3.1 spark-shell 中的案例
注意:以下三个案例中,创建 Graph 的方式不同
3.1.1 导包
scala> import org.apache.spark.graphx._
scala> import org.apache.spark.graphx.GraphLoader
scala> import org.apache.spark.rdd.RDD
3.1.2 第一个案例
#使用 Seq 直接创建 RDD
scala> val vertices = sc.makeRDD(Seq((1L,1),(2L,1),(3L,1)))
vertices: org.apache.spark.rdd.RDD[(Long, Int)] = ParallelCollectionRDD[175] at makeRDD at <console>:42
scala> val edges = sc.makeRDD(Seq(Edge(1L,2L,1),Edge(2L,1L,3),Edge(2L,3L,2)))
edges: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[177] at makeRDD at <console>:44
#创建图 Graph
scala> val graph = Graph(vertices,edges)
graph: org.apache.spark.graphx.Graph[Int,Int] = org.apache.spark.graphx.impl.GraphImpl@1245174d
#查看点的数据
scala> graph.vertices.collect
res123: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((1,1), (2,1), (3,1))
#查看边的数据
scala> graph.edges.collect
res120: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,2,1), Edge(2,1,3), Edge(2,3,2))
#查看图的数据
scala> graph.triplets.collect
res121: Array[org.apache.spark.graphx.EdgeTriplet[Int,Int]] = Array(((1,1),(2,1),1), ((2,1),(1,1),3), ((2,1),(3,1),2))
3.1.3 第二个案例
#用添加类型的方式,直接用数组来创建 RDD
scala> val users:RDD[(VertexId,(String,String))] = sc.makeRDD(Array((3L,("caicai","boss")),(6L,("luozi","teacher")),(9L,("cailuo","professor"))))
users: org.apache.spark.rdd.RDD[(org.apache.spark.graphx.VertexId, (String, String))] = ParallelCollectionRDD[196] at makeRDD at <console>:44
scala> val relationships:RDD[Edge[String]]=sc.parallelize(Array(Edge(3L,6L,"collaboretor"),Edge(9L,3L,"couple"),Edge(3L,9L,"customers")))
relationships: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = ParallelCollectionRDD[197] at parallelize at <console>:44
#创建图 Graph
scala> val graphUser = Graph(users,relationships)
graphUser: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@61cdf480
#查看点的数据
scala> graphUser.vertices.collect
res124: Array[(org.apache.spark.graphx.VertexId, (String, String))] = Array((9,(cailuo,professor)), (6,(luozi,teacher)), (3,(caicai,boss)))
#查看边的数据
scala> graphUser.edges.collect
res125: Array[org.apache.spark.graphx.Edge[String]] = Array(Edge(3,6,collaboretor), Edge(9,3,couple), Edge(3,9,customers))
#查看图的数据
scala> graphUser.triplets.collect
res126: Array[org.apache.spark.graphx.EdgeTriplet[(String, String),String]] = Array(((3,(caicai,boss)),(6,(luozi,teacher)),collaboretor), ((9,(cailuo,professor)),(3,(caicai,boss)),couple), ((3,(caicai,boss)),(9,(cailuo,professor)),customers))
3.1.4 第三个案例
#设置顶点和边,注意顶点和边都是用元组定义的 Array
#顶点的数据类型是 VD:(String,Int)
scala> val verterArray=Array(
| (1L,("ciacia1",26)),
| (2L,("ciacia2",26)),
| (3L,("ciacia3",26)),
| (4L,("ciacia4",26)),
| (6L,("ciacia6",26)),
| (7L,("ciacia7",26)),
| (8L,("ciacia8",26))
| )
verterArray: Array[(Long, (String, Int))] = Array((1,(ciacia1,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (4,(ciacia4,26)), (6,(ciacia6,26)), (7,(ciacia7,26)), (8,(ciacia8,26)))
#边的数据类型是 ED:Int
scala> val edgeArray = Array(
| Edge(2L,1L,70),
| Edge(2L,1L,27),
| Edge(2L,1L,38),
| Edge(2L,1L,17),
| Edge(2L,1L,6),
| Edge(2L,1L,9),
| Edge(2L,1L,87),
| Edge(2L,1L,29)
| )
edgeArray: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,70), Edge(2,1,27), Edge(2,1,38), Edge(2,1,17), Edge(2,1,6), Edge(2,1,9), Edge(2,1,87), Edge(2,1,29))
#构造 vertexRDD
scala> var vertexRDD = sc.parallelize(verterArray)
vertexRDD: org.apache.spark.rdd.RDD[(Long, (String, Int))] = ParallelCollectionRDD[20] at parallelize at <console>:31
#构造 edgeRDD
scala> val edgeRDD = sc.parallelize(edgeArray)
edgeRDD: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[21] at parallelize at <console>:31
#构造图 Graph(VD,ED)
scala> val graph3 = Graph(vertexRDD,edgeRDD)
graph3: org.apache.spark.graphx.Graph[(String, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@7ad56a58
#查看点的数据
scala> graph3.vertices.collect
res4: Array[(org.apache.spark.graphx.VertexId, (String, Int))] = Array((4,(ciacia4,26)), (8,(ciacia8,26)), (1,(ciacia1,26)), (6,(ciacia6,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (7,(ciacia7,26)))
#查看边的数据
scala> graph3.edges.collect
res5: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(2,1,70), Edge(2,1,27), Edge(2,1,38), Edge(2,1,17), Edge(2,1,6), Edge(2,1,9), Edge(2,1,87), Edge(2,1,29))
#查看图的数据
scala> graph3.triplets.collect
res6: Array[org.apache.spark.graphx.EdgeTriplet[(String, Int),Int]] = Array(((2,(ciacia2,26)),(1,(ciacia1,26)),70), ((2,(ciacia2,26)),(1,(ciacia1,26)),27), ((2,(ciacia2,26)),(1,(ciacia1,26)),38), ((2,(ciacia2,26)),(1,(ciacia1,26)),17), ((2,(ciacia2,26)),(1,(ciacia1,26)),6), ((2,(ciacia2,26)),(1,(ciacia1,26)),9), ((2,(ciacia2,26)),(1,(ciacia1,26)),87), ((2,(ciacia2,26)),(1,(ciacia1,26)),29))
#用 foreach 查看图的数据
scala> graph3.triplets.collect.foreach(println)
((2,(ciacia2,26)),(1,(ciacia1,26)),70)
((2,(ciacia2,26)),(1,(ciacia1,26)),27)
((2,(ciacia2,26)),(1,(ciacia1,26)),38)
((2,(ciacia2,26)),(1,(ciacia1,26)),17)
((2,(ciacia2,26)),(1,(ciacia1,26)),6)
((2,(ciacia2,26)),(1,(ciacia1,26)),9)
((2,(ciacia2,26)),(1,(ciacia1,26)),87)
((2,(ciacia2,26)),(1,(ciacia1,26)),29)
#点查询:找出 age 大于 20 的点
scala> graph3.vertices.filter(v=>v._2._2>20).collect
res24: Array[(org.apache.spark.graphx.VertexId, (String, Int))] = Array((4,(ciacia4,26)), (8,(ciacia8,26)), (1,(ciacia1,26)), (6,(ciacia6,26)), (2,(ciacia2,26)), (3,(ciacia3,26)), (7,(ciacia7,26)))
#找出年龄大于20的点
scala> for(
| (id,(name,age)) <- graph3.vertices.filter
| {
| case(id,(name,age)) => age>20
| }.collect){
| println(id,name,age)
| }
(4,ciacia4,26)
(8,ciacia8,26)
(1,ciacia1,26)
(6,ciacia6,26)
(2,ciacia2,26)
(3,ciacia3,26)
(7,ciacia7,26)
#找出年龄大于20的点,并打印出字符串信息(注意:字符串字面量前要加 s 前缀,才能用 $变量 做字符串插值)
scala> for(
| (id,(name,age)) <- graph3.vertices.filter
| {
| case(id,(name,age))=>age>20
| }
| ){
| println(s"$id name is: $name,niianling is $age")
| }
1 name is: ciacia1,niianling is 26
6 name is: ciacia6,niianling is 26
2 name is: ciacia2,niianling is 26
4 name is: ciacia4,niianling is 26
8 name is: ciacia8,niianling is 26
3 name is: ciacia3,niianling is 26
7 name is: ciacia7,niianling is 26
#srcAttr 是源顶点(边的起点)的属性,dstAttr 是目标顶点(边的终点)的属性,attr 是边的属性(注意:srcAttr/dstAttr 不是出度/入度)
scala>graph3.triplets.collect.foreach(x=>println(x.srcAttr,x.dstAttr,x.attr))
((ciacia2,26),(ciacia1,26),70)
((ciacia2,26),(ciacia1,26),27)
((ciacia2,26),(ciacia1,26),38)
((ciacia2,26),(ciacia1,26),17)
((ciacia2,26),(ciacia1,26),6)
((ciacia2,26),(ciacia1,26),9)
((ciacia2,26),(ciacia1,26),87)
((ciacia2,26),(ciacia1,26),29)
scala> graph3.triplets.collect.foreach(x=>println(s"${x.srcAttr._1} likes ${x.dstAttr._1},他们的亲密度是${x.attr}"))
ciacia2 likes ciacia1,他们的亲密度是70
ciacia2 likes ciacia1,他们的亲密度是27
ciacia2 likes ciacia1,他们的亲密度是38
ciacia2 likes ciacia1,他们的亲密度是17
ciacia2 likes ciacia1,他们的亲密度是6
ciacia2 likes ciacia1,他们的亲密度是9
ciacia2 likes ciacia1,他们的亲密度是87
ciacia2 likes ciacia1,他们的亲密度是29
#用 for 打印关系系数(边属性 attr)大于 6 的 triplet
scala> for(t<-graph3.triplets.filter(x=>x.attr>6).collect){println(t)}
((2,(ciacia2,26)),(1,(ciacia1,26)),70)
((2,(ciacia2,26)),(1,(ciacia1,26)),27)
((2,(ciacia2,26)),(1,(ciacia1,26)),38)
((2,(ciacia2,26)),(1,(ciacia1,26)),17)
((2,(ciacia2,26)),(1,(ciacia1,26)),9)
((2,(ciacia2,26)),(1,(ciacia1,26)),87)
((2,(ciacia2,26)),(1,(ciacia1,26)),29)
#用 for 打印字符串信息
scala> for(triplet <- graph3.triplets.filter(x => x.attr>6))
| {
| println( s"${triplet.srcAttr._1} loves ${triplet.dstAttr._1}" )
| }
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
ciacia2 loves ciacia1
3.2 Scala API 中的代码案例(IDEA 工程)
注意:这是一个 IDEA 的Scala工程
3.2.1 导包
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
3.2.2 Scala API 代码
屏蔽日志,设置运行环境
/**
 * End-to-end GraphX walkthrough on a small, hard-coded social graph.
 *
 * Vertices carry a `(name, age)` attribute and edges carry an `Int` weight. The program
 * prints, in order: property queries on vertices/edges/triplets, degree statistics,
 * vertex and edge transformations, a subgraph, the result of joining degrees onto the
 * vertices, a per-vertex aggregation (oldest follower) and the shortest distances from
 * vertex 5 computed with Pregel.
 */
object GraphXExample {

  /** Vertex attribute used after the in/out degrees have been joined onto each person. */
  private final case class User(name: String, age: Int, inDeg: Int, outDeg: Int)

  /**
   * Program entry point: configures logging, creates a local SparkContext, runs the
   * walkthrough and always stops the context afterwards.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Silence verbose framework logging so the example output stays readable.
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Runtime environment: single local master.
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)

    // Stop the context even when one of the steps throws.
    try {
      run(sc)
    } finally {
      sc.stop()
    }
  }

  /** Builds the sample graph on `sc` and prints the result of every demonstrated operation. */
  private def run(sc: SparkContext): Unit = {
    // Vertices and edges are both defined as arrays.
    // Vertex attribute type VD: (String, Int) = (name, age).
    val vertexArray = Array(
      (1L, ("Alice", 28)),
      (2L, ("Bob", 27)),
      (3L, ("Charlie", 65)),
      (4L, ("David", 42)),
      (5L, ("Ed", 55)),
      (6L, ("Fran", 50))
    )
    // Edge attribute type ED: Int.
    val edgeArray = Array(
      Edge(2L, 1L, 7),
      Edge(2L, 4L, 2),
      Edge(3L, 2L, 4),
      Edge(3L, 6L, 3),
      Edge(4L, 1L, 1),
      Edge(5L, 2L, 2),
      Edge(5L, 3L, 8),
      Edge(5L, 6L, 3)
    )

    // Build the vertex and edge RDDs, then the graph Graph[VD, ED].
    val vertexRDD: RDD[(Long, (String, Int))] = sc.parallelize(vertexArray)
    val edgeRDD: RDD[Edge[Int]] = sc.parallelize(edgeArray)
    val graph: Graph[(String, Int), Int] = Graph(vertexRDD, edgeRDD)

    // ---- Graph properties ----

    // Vertices whose age is greater than 30.
    graph.vertices
      .filter { case (_, (_, age)) => age > 30 }
      .collect
      .foreach { case (_, (name, age)) => println(s"$name is $age") }

    // Edges whose attribute is greater than 5.
    graph.edges
      .filter(e => e.attr > 5)
      .collect
      .foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))

    // Triplets have the shape ((srcId, srcAttr), (dstId, dstAttr), attr).
    // List the triplets whose edge attribute is greater than 5.
    for (triplet <- graph.triplets.filter(t => t.attr > 5).collect) {
      println(s"${triplet.srcAttr._1} likes ${triplet.dstAttr._1}")
    }

    // ---- Degree operations ----

    // The original line was missing the closing quote and parenthesis.
    println("找出图中最大的出度、入度、度数:")

    // Keeps whichever (vertexId, degree) pair has the larger degree; ties keep `b`.
    def maxByDegree(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) =
      if (a._2 > b._2) a else b

    val maxOut = graph.outDegrees.reduce(maxByDegree)
    val maxIn = graph.inDegrees.reduce(maxByDegree)
    val maxTotal = graph.degrees.reduce(maxByDegree)
    println(s"max of outDegrees:$maxOut max of inDegrees:$maxIn max of Degrees:$maxTotal")

    // ---- Transformations ----

    // Vertex transformation: age + 10. mapVertices must return only the NEW attribute;
    // returning (id, (name, age)) would nest the id inside the attribute and make the
    // println below print the id and a tuple instead of the name and the age.
    graph
      .mapVertices { case (_, (name, age)) => (name, age + 10) }
      .vertices
      .collect
      .foreach(v => println(s"${v._2._1} is ${v._2._2}"))

    // Edge transformation: attribute * 2.
    graph
      .mapEdges(e => e.attr * 2)
      .edges
      .collect
      .foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))

    // ---- Structural operations ----

    // Subgraph restricted to vertices whose age is at least 30.
    val subGraph = graph.subgraph(vpred = (id, vd) => vd._2 >= 30)
    // All vertices of the subgraph.
    subGraph.vertices.collect.foreach(v => println(s"${v._2._1} is ${v._2._2}"))
    // All edges of the subgraph.
    subGraph.edges.collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))

    // ---- Join operations ----

    // New graph whose vertex attribute is a User, converted from `graph`; degrees start at 0.
    val initialUserGraph: Graph[User, Int] =
      graph.mapVertices { case (_, (name, age)) => User(name, age, 0, 0) }

    // Join the in-degrees, then the out-degrees, onto the users. Vertices that are
    // missing from a degree RDD (degree zero) get 0 through getOrElse.
    val userGraph = initialUserGraph
      .outerJoinVertices(initialUserGraph.inDegrees) {
        case (_, u, inDegOpt) => User(u.name, u.age, inDegOpt.getOrElse(0), u.outDeg)
      }
      .outerJoinVertices(initialUserGraph.outDegrees) {
        case (_, u, outDegOpt) => User(u.name, u.age, u.inDeg, outDegOpt.getOrElse(0))
      }

    // Attributes of the joined graph.
    userGraph.vertices.collect.foreach { v =>
      println(s"${v._2.name} inDeg: ${v._2.inDeg} outDeg: ${v._2.outDeg}")
    }

    // People whose in-degree equals their out-degree.
    userGraph.vertices
      .filter { case (_, u) => u.inDeg == u.outDeg }
      .collect
      .foreach { case (_, property) => println(property.name) }

    // ---- Aggregation ----

    // Oldest follower of every vertex. aggregateMessages replaces mapReduceTriplets,
    // which has been deprecated since Spark 1.2 and is not available on Graph in Spark 2.x.
    val oldestFollower: VertexRDD[(String, Int)] = userGraph.aggregateMessages[(String, Int)](
      // Send step: pass the source vertex's (name, age) to the destination vertex.
      ctx => ctx.sendToDst((ctx.srcAttr.name, ctx.srcAttr.age)),
      // Merge step: keep the older of two candidate followers.
      (a, b) => if (a._2 > b._2) a else b
    )

    userGraph.vertices
      .leftJoin(oldestFollower) { (id, user, optOldestFollower) =>
        optOldestFollower match {
          case None => s"${user.name} does not have any followers."
          case Some((name, age)) => s"${name} is the oldest follower of ${user.name}."
        }
      }
      .collect
      .foreach { case (_, str) => println(str) }

    // ---- Pregel: shortest distance from vertex 5 to every vertex ----

    val sourceId: VertexId = 5L // source vertex
    // Distance 0 for the source, +infinity for every other vertex.
    val initialGraph =
      graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity)

    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      // Vertex program: keep the smaller of the current and the received distance.
      (id, dist, newDist) => math.min(dist, newDist),
      // Send step: only notify the destination when going through this edge is shorter.
      triplet => {
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      // Merge step: of several incoming distances keep the shortest.
      (a, b) => math.min(a, b)
    )
    println(sssp.vertices.collect.mkString("\n"))
  }
}