Spark GraphX 图处理人际亲密度 笔记(spark-shell 版本和 IDEA Scala 版本,含 Maven 依赖)

spark-shell版本

// 导入 Spark GraphX 包
scala> import org.apache.spark.graphx._

// 创建 vertices 顶点rdd
scala> val vertices = sc.makeRDD(Seq((1L,1),(2L,2),(3L,3)))

// 创建 edges 边rdd
scala> val edges = sc.makeRDD(Seq(Edge(1L,2L,1),Edge(2L,3L,2)))

// 创建 graph对象
scala> val graph = Graph(vertices,edges)

scala> graph.
aggregateMessages         edges                mapVertices         persist                              stronglyConnectedComponents   
cache                     filter               mask                personalizedPageRank                 subgraph                      
checkpoint                getCheckpointFiles   numEdges            pickRandomVertex                     triangleCount                 
collectEdges              groupEdges           numVertices         pregel                               triplets                      
collectNeighborIds        inDegrees            ops                 removeSelfEdges                      unpersist                     
collectNeighbors          isCheckpointed       outDegrees          reverse                              unpersistVertices             
connectedComponents       joinVertices         outerJoinVertices   staticPageRank                       vertices 

// 获取graph图对象的vertices信息
scala> graph.vertices.collect
res1: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((1,1), (2,2), (3,3))

// 获取graph图对象的edges信息
scala> graph.edges.collect
res2: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,2,1), Edge(2,3,2))

// 通过文件加载
followers.txt
2 3
3 4
1 4
2 4

scala> val graphLoad =  GraphLoader.edgeListFile(sc, "file:///opt/kb09file/followers.txt")
graphLoad: org.apache.spark.graphx.Graph[Int,Int] = org.apache.spark.graphx.impl.GraphImpl@568202bd

scala> graphLoad.vertices.collect
res5: Array[(org.apache.spark.graphx.VertexId, Int)] = Array((4,1), (2,1), (1,1), (3,1))

scala> graphLoad.edges.collect
res6: Array[org.apache.spark.graphx.Edge[Int]] = Array(Edge(1,4,1), Edge(2,3,1), Edge(3,4,1), Edge(2,4,1))

scala> graphLoad.triplets.collect
res7: Array[org.apache.spark.graphx.EdgeTriplet[Int,Int]] = Array(((1,1),(4,1),1), ((2,1),(3,1),1), ((3,1),(4,1),1), ((2,1),(4,1),1))


// 案例一:user(用户顶点)与 relation(关系边)
scala> val users = sc.parallelize(Array((3L,("rxin","student")),(7L,("jgonzal","postdoc")),(5L,("franklin","professor")),(2L,("istoica","professor"))))
users: org.apache.spark.rdd.RDD[(Long, (String, String))] = ParallelCollectionRDD[49] at parallelize at <console>:27


scala> val relationship = sc.parallelize(Array(Edge(3L,7L,"Colla"),Edge(5L,3L,"Advisor"),Edge(2L,5L,"Colleague"),Edge(5L,7L,"Pi")))
relationship: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[String]] = ParallelCollectionRDD[50] at parallelize at <console>:27

scala> val graphUser = Graph(users, relationship)
graphUser: org.apache.spark.graphx.Graph[(String, String),String] = org.apache.spark.graphx.impl.GraphImpl@40bfabed

scala> graphUser.vertices.collect
res8: Array[(org.apache.spark.graphx.VertexId, (String, String))] = Array((2,(istoica,professor)), (3,(rxin,student)), (5,(franklin,professor)), (7,(jgonzal,postdoc)))

scala> graphUser.edges.collect
res9: Array[org.apache.spark.graphx.Edge[String]] = Array(Edge(3,7,Colla), Edge(5,3,Advisor), Edge(2,5,Colleague), Edge(5,7,Pi))

scala> graphUser.triplets.collect
res10: Array[org.apache.spark.graphx.EdgeTriplet[(String, String),String]] = Array(((3,(rxin,student)),(7,(jgonzal,postdoc)),Colla), ((5,(franklin,professor)),(3,(rxin,student)),Advisor), ((2,(istoica,professor)),(5,(franklin,professor)),Colleague), ((5,(franklin,professor)),(7,(jgonzal,postdoc)),Pi))

// 案例二:顶点属性为 (姓名, 年龄),边属性为 Int 权重
scala> val userRdd = sc.makeRDD(
     |   Array(
     |   (1L,("Alice",28)),
     |   (2L,("Bob",27)),
     |   (3L,("Charlie",65)),
     |   (4L,("David",42)),
     |   (5L,("Ed",55)),
     |   (6L,("Fran",50))
     |   )
     | )
userRdd: org.apache.spark.rdd.RDD[(Long, (String, Int))] = ParallelCollectionRDD[69] at makeRDD at <console>:27

scala> val usercallRdd = sc.makeRDD(
     |   Array(
     |   Edge(2L,1L,7),
     |   Edge(3L,2L,4),
     |   Edge(4L,1L,1),
     |   Edge(2L,4L,2),
     |   Edge(5L,2L,2),
     |   Edge(5L,3L,8),
     |   Edge(3L,6L,3),
     |   Edge(5L,6L,3)
     |   )
     | )
usercallRdd: org.apache.spark.rdd.RDD[org.apache.spark.graphx.Edge[Int]] = ParallelCollectionRDD[70] at makeRDD at <console>:27

scala> val userCallGraph = Graph(userRdd,usercallRdd)
userCallGraph: org.apache.spark.graphx.Graph[(String, Int),Int] = org.apache.spark.graphx.impl.GraphImpl@db8615d


scala> userCallGraph.vertices.filter{ case(id,(name,age)) => age>30}
res21: org.apache.spark.graphx.VertexRDD[(String, Int)] = VertexRDDImpl[100] at RDD at VertexRDD.scala:57


scala> userCallGraph.vertices.filter(v=>v._2._2>30).collect.foreach(x=>{println("name:"+x._2._1+" age:"+x._2._2)})

scala> userCallGraph.vertices.filter{case(id,(name,age)) => age>30}
res19: org.apache.spark.graphx.VertexRDD[(String, Int)] = VertexRDDImpl[96] at RDD at VertexRDD.scala:57

scala> userCallGraph.vertices.filter{ case(id,(name,age)) => age>30}.collect.foreach(println)
(3,(Charlie,65))
(4,(David,42))
(5,(Ed,55))
(6,(Fran,50))

scala> userCallGraph.triplets
res22: org.apache.spark.rdd.RDD[org.apache.spark.graphx.EdgeTriplet[(String, Int),Int]] = MapPartitionsRDD[88] at mapPartitions at GraphImpl.scala:48

scala> userCallGraph.triplets.collect.foreach(println)
((2,(Bob,27)),(1,(Alice,28)),7)
((3,(Charlie,65)),(2,(Bob,27)),4)
((4,(David,42)),(1,(Alice,28)),1)
((2,(Bob,27)),(4,(David,42)),2)
((5,(Ed,55)),(2,(Bob,27)),2)
((5,(Ed,55)),(3,(Charlie,65)),8)
((3,(Charlie,65)),(6,(Fran,50)),3)
((5,(Ed,55)),(6,(Fran,50)),3)



// 输入到 x.dst 后按 Tab 键,spark-shell 显示的补全候选如下(dstAttr:目标顶点属性;dstId:目标顶点 id)
scala> userCallGraph.triplets.collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dst))
dstAttr   dstId

scala> userCallGraph.triplets.collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dstAttr._1+" stage:"+x.attr))
Bob like Alice stage:7
Charlie like Bob stage:4
David like Alice stage:1
Bob like David stage:2
Ed like Bob stage:2
Ed like Charlie stage:8
Charlie like Fran stage:3
Ed like Fran stage:3

scala> userCallGraph.triplets.filter(x=>x.attr>5).collect.foreach(x=>println(x.srcAttr._1+" like "+ x.dstAttr._1+" stage:"+x.attr))
Bob like Alice stage:7
Ed like Charlie stage:8

idea scala版本

maven依赖

 <dependencies>
    <!-- Dependencies of the IDEA/Scala example project. Every Spark artifact below uses the
         _2.11 suffix and version 2.1.1; scala-library is pinned to 2.11.8. -->
    <!-- JUnit, test scope only -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.11.8</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <!-- Spark SQL: the example imports org.apache.spark.sql.SparkSession -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.1.1</version>
    </dependency>
    <!-- NOTE(review): spark-hive and mysql-connector-java are not referenced by the example
         code shown in this note; presumably kept for other exercises in the same project. -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.11</artifactId>
      <version>2.1.1</version>
    </dependency>
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.36</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.7.21</version>
    </dependency>
    <!-- GraphX: the example imports org.apache.spark.graphx._ (Graph, Edge, GraphLoader) -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_2.11</artifactId>
      <version>2.1.1</version>
    </dependency>
  </dependencies>

user.txt 文件内容(放在项目的 in/ 目录下,代码中通过 "in/user.txt" 读取;每行格式为:源顶点id 目标顶点id):

2 1
2 4
4 1
5 2
5 3
5 6
3 6
3 2
package Graph

import org.apache.spark.sql.SparkSession
// 导入Spark Graph包
import org.apache.spark.graphx._


/**
 * GraphX walkthrough that builds four small graphs and prints details of the last one.
 *
 *  1. A minimal graph from in-memory vertex and edge RDDs.
 *  2. A graph loaded from the edge-list file `in/user.txt`.
 *  3. Case 1: users with (name, role) attributes joined by labelled relationships.
 *  4. Case 2: users with (name, age) attributes joined by Int-weighted edges; this is the
 *     only graph whose contents are printed.
 */
object GraphStu {

  /**
   * Runs the walkthrough on a local Spark session (`local[*]`).
   *
   * @param args command-line arguments; not used
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("GraphStu").master("local[*]").getOrCreate()

    // Stop the session even when one of the steps throws, so the local context is released.
    try {
      val sc = spark.sparkContext

      // Vertex RDD: (vertexId, attribute)
      val vertices = sc.makeRDD(Seq((1L, 1), (2L, 2), (3L, 3)))
      // Edge RDD: Edge(srcId, dstId, attribute)
      val edges = sc.makeRDD(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 2)))
      // Graph built from the two RDDs above
      val graph = Graph(vertices, edges)
      // Uncomment to print the vertices, edges and triplets of the graph:
      //    graph.vertices.collect.foreach(println)
      //    graph.edges.collect.foreach(println)
      //    graph.triplets.collect.foreach(println)

      // Load a graph from an edge-list file.
      // NOTE(review): relative path; presumably resolved against the working directory
      // when launched from the IDE — confirm.
      val graphLoad = GraphLoader.edgeListFile(sc, "in/user.txt")
      //    graphLoad.vertices.collect.foreach(println)
      //    graphLoad.edges.collect.foreach(println)
      //    graphLoad.triplets.collect.foreach(println)

      // Case 1: users and their relationships
      val users = sc.parallelize(Array(
        (3L, ("rxin", "student")),
        (7L, ("jgonzal", "postdoc")),
        (5L, ("franklin", "professor")),
        (2L, ("istoica", "professor"))
      ))
      val relationship = sc.parallelize(Array(
        Edge(3L, 7L, "Colla"),
        Edge(5L, 3L, "Advisor"),
        Edge(2L, 5L, "Colleague"),
        Edge(5L, 7L, "Pi")
      ))

      val graphUser = Graph(users, relationship)

      //    graphUser.vertices.collect.foreach(println)
      //    graphUser.edges.collect.foreach(println)
      //    graphUser.triplets.collect.foreach(println)

      // Case 2
      println("--------------案例二-------------------")
      // Vertices: (id, (name, age))
      val userRDD = sc.makeRDD(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      ))

      // Eight directed edges Edge(src, dst, weight); the (src, dst) pairs mirror the rows
      // of user.txt.
      val userCallRDD = sc.makeRDD(Array(
        Edge(2L, 1L, 7),
        Edge(3L, 2L, 4),
        Edge(4L, 1L, 1),
        Edge(2L, 4L, 2),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(3L, 6L, 3),
        Edge(5L, 6L, 3)
      ))

      val userCallGraph = Graph(userRDD, userCallRDD)

      //    userCallGraph.vertices.collect.foreach(println)
      //    userCallGraph.edges.collect.foreach(println)
      //    userCallGraph.triplets.collect.foreach(println)

      // Print every vertex whose age is above 30.
      userCallGraph.vertices
        .filter { case (_, (_, age)) => age > 30 }
        .collect
        .foreach(println)

      // Variant that prints only name and age:
      //userCallGraph.vertices.filter{case(id,(name,age))=>age>30}.collect.foreach(x=>{println("name: "+x._2._1+" age: "+x._2._2)})

      // Print every triplet as ((srcId, srcAttr), (dstId, dstAttr), edgeAttr).
      userCallGraph.triplets.collect.foreach(println)

      // Use srcAttr, dstAttr and attr to format each triplet, e.g. "Bob like Alice stage:7".
      userCallGraph.triplets.collect.foreach { t =>
        println(s"${t.srcAttr._1} like ${t.dstAttr._1} stage:${t.attr}")
      }
    } finally {
      spark.stop()
    }
  }
}
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值