spark-shell版本
// 导入Spark Graph包
scala> import org.apache.spark.graphx._
// 创建 vertices 顶点rdd
scala> val vertices = sc.makeRDD(Seq((1L,1),(2L,2),(3L,3)))
// 创建 edges 边rdd
scala> val edges = sc.makeRDD(Seq(Edge(1L,2L,1),Edge(2L,3L,2)))
// 创建 graph对象
scala> val graph = Graph(vertices,edges)
scala> graph.
aggregateMessages edges mapVertices persist stronglyConnectedComponents
cache filter mask personalizedPageRank subgraph
checkpoint getCheckpointFiles numEdges pickRandomVertex triangleCount
collectEdges groupEdges numVertices pregel triplets
collectNeighborIds inDegrees ops removeSelfEdges unpersist
collectNeighbors isCheckpointed outDegrees reverse unpersistVertices
connectedComponents joinVertices outerJoinVertices staticPageRank vertices
// 获取graph图对象的vertices信息
scala> graph.vertices.collect
res1: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((1,1), (2,2), (3,3))
// 获取graph图对象的edges信息
scala> graph.edges.collect
res2: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,2,1), Edge(2,3,2))
// 通过文件加载
followers.txt
2 3
3 4
1 4
2 4
scala> val graphLoad = GraphLoader.edgeListFile(sc, "file:///opt/kb09file/followers.txt")
graphLoad: org.apache.spark.graphx.Graph[Int,Int] = org.apache.spark.graphx.impl.GraphImpl@568202bd
scala> graphLoad.vertices.collect
res5: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((4,1), (2,1), (1,1), (3,1))
scala> graphLoad.edges.collect
res6: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,4,1), Edge(2,3,1), Edge(3,4,1), Edge(2,4,1))
scala> graphLoad.triplets.collect
res7: Array[org.apache.spark.graphx.EdgeTriplet[Int,Int]] = Array(((1,1),(4,1),1), ((2,1),(3,1),1), ((3,1),(4,1),1), ((2,1),(4,1),1))
// 案例 user relation
scala> val users = sc.parallelize(Array((3L,("rxin","student")),(7L,("jgonzal","postdoc")),(5L,("franklin","professor")),(2L,("istoica","professor"))))
users: org.apache.spark.rdd.RDD[(Long, (String, String))] = ParallelCollectionRDD[49] at parallelize at <console>:27
scala> val relationship = sc.parallelize(Array(Edge(3L,7L,"Colla"),Edge(5L,3L,"Advisor"),Edge(2L,5L,"Colleague"),Edge(5L,7L,"Pi")))
relationship: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = ParallelCollectionRDD[50] at parallelize at <console>:27
scala> val graphUser = Graph(users, relationship)
graphUser: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@40bfabed
scala> graphUser.vertices.collect
res8: Array[(org.apache.spark.graphx.VertexId, (String, String))] = Array((2,(istoica,professor)), (3,(rxin,student)), (5,(franklin,professor)), (7,(jgonzal,postdoc)))
scala> graphUser.edges.collect
res9: Array[org.apache.spark.graphx.Edge[String]] = Array(Edge(3,7,Colla), Edge(5,3,Advisor), Edge(2,5,Colleague), Edge(5,7,Pi))
scala> graphUser.triplets.collect
res10: Array[org.apache.spark.graphx.EdgeTriplet[(String, String),String]] = Array(((3,(rxin,student)),(7,(jgonzal,postdoc)),Colla), ((5,(franklin,professor)),(3,(rxin,student)),Advisor), ((2,(istoica,professor)),(5,(franklin,professor)),Colleague), ((5,(franklin,professor)),(7,(jgonzal,postdoc)),Pi))
///
案例二
scala> val userRdd = sc.makeRDD(
| Array(
| (1L,("Alice",28)),
| (2L,("Bob",27)),
| (3L,("Charlie",65)),
| (4L,("David",42)),
| (5L,("Ed",55)),
| (6L,("Fran",50))
| )
| )
userRdd: org.apache.spark.rdd.RDD[(Long, (String, Int))] = ParallelCollectionRDD[69] at makeRDD at <console>:27
scala> val usercallRdd = sc.makeRDD(
| Array(
| Edge(2L,1L,7),
| Edge(3L,2L,4),
| Edge(4L,1L,1),
| Edge(2L,4L,2),
| Edge(5L,2L,2),
| Edge(5L,3L,8),
| Edge(3L,6L,3),
| Edge(5L,6L,3)
| )
| )
usercallRdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[70] at makeRDD at <console>:27
scala> val userCallGraph = Graph(userRdd,usercallRdd)
userCallGraph: org.apache.spark.graphx.Graph[(String, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@db8615d
scala> userCallGraph.vertices.filter{ case(id,(name,age)) => age>30}
res21: org.apache.spark.graphx.VertexRDD[(String, Int)] = VertexRDDImpl[100] at RDD at VertexRDD.scala:57
scala> userCallGraph.vertices.filter(v=>v._2._2>30).collect.foreach(x=>{println("name:"+x._2._1+" age:"+x._2._2)})
scala> userCallGraph.vertices.filter{case(id,(name,age)) => age>30}
res19: org.apache.spark.graphx.VertexRDD[(String, Int)] = VertexRDDImpl[96] at RDD at VertexRDD.scala:57
scala> userCallGraph.vertices.filter{ case(id,(name,age)) => age>30}.collect.foreach(println)
(3,(Charlie,65))
(4,(David,42))
(5,(Ed,55))
(6,(Fran,50))
scala> userCallGraph.triplets
res22: org.apache.spark.rdd.RDD[org.apache.spark.graphx.EdgeTriplet[(String, Int),Int]] = MapPartitionsRDD[88] at mapPartitions at GraphImpl.scala:48
scala> userCallGraph.triplets.collect.foreach(println)
((2,(Bob,27)),(1,(Alice,28)),7)
((3,(Charlie,65)),(2,(Bob,27)),4)
((4,(David,42)),(1,(Alice,28)),1)
((2,(Bob,27)),(4,(David,42)),2)
((5,(Ed,55)),(2,(Bob,27)),2)
((5,(Ed,55)),(3,(Charlie,65)),8)
((3,(Charlie,65)),(6,(Fran,50)),3)
((5,(Ed,55)),(6,(Fran,50)),3)
scala> userCallGraph.triplets.collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dst))
dstAttr dstId
// 注：EdgeTriplet 没有 dst 成员，shell 在此提示了可用的成员 dstAttr 和 dstId，下一条命令改用 dstAttr
scala> userCallGraph.triplets.collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dstAttr._1+" stage:"+x.attr))
Bob like Alice stage:7
Charlie like Bob stage:4
David like Alice stage:1
Bob like David stage:2
Ed like Bob stage:2
Ed like Charlie stage:8
Charlie like Fran stage:3
Ed like Fran stage:3
scala> userCallGraph.triplets.filter(x=>x.attr>5).collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dstAttr._1+" stage:"+x.attr))
Bob like Alice stage:7
Ed like Charlie stage:8
IDEA Scala版本
maven依赖
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.1.1</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.36</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.21</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
<version>2.1.1</version>
</dependency>
</dependencies>
user.txt 文件内容:
2 1
2 4
4 1
5 2
5 3
5 6
3 6
3 2
package Graph
import org.apache.spark.sql.SparkSession
// 导入Spark Graph包
import org.apache.spark.graphx._
object GraphStu {
  /**
   * GraphX walkthrough mirroring the spark-shell session above:
   *  1. build a tiny Graph from vertex/edge RDDs,
   *  2. load a graph from an edge-list file,
   *  3. a user-relationship graph (case 1),
   *  4. a user-call graph (case 2) with vertex filtering and triplet printing.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("GraphStu").master("local[*]").getOrCreate()
    val sc = spark.sparkContext
    // Create the vertices RDD: (VertexId, attribute)
    val vertices = sc.makeRDD(Seq((1L, 1), (2L, 2), (3L, 3)))
    // Create the edges RDD: Edge(srcId, dstId, attribute)
    val edges = sc.makeRDD(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 2)))
    // Build the Graph object
    val graph = Graph(vertices, edges)
    // Inspect the graph's vertices
    // graph.vertices.collect.foreach(println)
    // Inspect the graph's edges
    // graph.edges.collect.foreach(println)
    // Inspect the graph's triplets (src attr, dst attr, edge attr)
    // graph.triplets.collect.foreach(println)
    // Load a graph from an edge-list file; every vertex/edge attribute defaults to 1
    val graphLoad = GraphLoader.edgeListFile(sc, "in/user.txt")
    // graphLoad.vertices.collect.foreach(println)
    // graphLoad.edges.collect.foreach(println)
    // graphLoad.triplets.collect.foreach(println)
    // Case 1: user relationship graph
    val users = sc.parallelize(Array(
      (3L, ("rxin", "student")),
      (7L, ("jgonzal", "postdoc")),
      (5L, ("franklin", "professor")), // fixed typo: was "prodessor"
      (2L, ("istoica", "professor"))
    ))
    val relationship = sc.parallelize(Array(
      Edge(3L, 7L, "Colla"),
      Edge(5L, 3L, "Advisor"),
      Edge(2L, 5L, "Colleague"),
      Edge(5L, 7L, "Pi")
    ))
    val graphUser = Graph(users, relationship)
    // graphUser.vertices.collect.foreach(println)
    // graphUser.edges.collect.foreach(println)
    // graphUser.triplets.collect.foreach(println)
    // Case 2: user-call graph
    println("--------------案例二-------------------")
    val userRDD = sc.makeRDD(Array(
      (1L, ("Alice", 28)),
      (2L, ("Bob", 27)),
      (3L, ("Charlie", 65)),
      (4L, ("David", 42)),
      (5L, ("Ed", 55)),
      (6L, ("Fran", 50))
    ))
    // Edge set matching the spark-shell session: restored the missing
    // Edge(5L,2L,2) and corrected Edge(5L,3L,*) from 5 back to 8.
    val userCallRDD = sc.makeRDD(Array(
      Edge(2L, 1L, 7),
      Edge(3L, 2L, 4),
      Edge(4L, 1L, 1),
      Edge(2L, 4L, 2),
      Edge(5L, 2L, 2),
      Edge(5L, 3L, 8),
      Edge(3L, 6L, 3),
      Edge(5L, 6L, 3)
    ))
    val userCallGraph = Graph(userRDD, userCallRDD)
    // userCallGraph.vertices.collect.foreach(println)
    // userCallGraph.edges.collect.foreach(println)
    // userCallGraph.triplets.collect.foreach(println)
    // Vertices whose age attribute is over 30
    userCallGraph.vertices.filter { case (id, (name, age)) => age > 30 }.collect.foreach(println)
    // Extract name and age only
    // userCallGraph.vertices.filter{case(id,(name,age))=>age>30}.collect.foreach(x=>{println("name: "+x._2._1+" age: "+x._2._2)})
    userCallGraph.triplets.collect.foreach(println)
    // Use srcAttr/dstAttr/attr to format triplets, e.g. "Bob like Alice stage:7"
    userCallGraph.triplets.collect.foreach(x => println(x.srcAttr._1 + " like " + x.dstAttr._1 + " stage:" + x.attr))
    // Release the SparkSession (and its SparkContext) before exiting
    spark.stop()
  }
}