// PageRank算法 (PageRank algorithm)
// Reference: PageRank算法原理剖析及Spark实现 - 简书 (jianshu.com)
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Vertex attribute used by the degree demo: a person plus the edge counts of their vertex.
 *
 * @param name   display name of the person
 * @param age    age of the person
 * @param inDeg  number of incoming edges of the vertex
 * @param outDeg number of outgoing edges of the vertex
 */
case class User(
  name: String,
  age: Int,
  inDeg: Int,
  outDeg: Int
)
/**
 * GraphX walkthrough on a small hand-built graph: vertex mapping, in/out degrees,
 * PageRank, connected components and the vertex join operators.
 *
 * Everything is printed to stdout; nothing is returned or written to disk.
 */
object GraphDemo4 {

  /**
   * Starts a local SparkSession, runs the demo, and always stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("sparkgraph")
      .master("local[*]")
      .getOrCreate()
    // Stop the session even when a job fails so the local Spark threads are released.
    try {
      runDemo(spark.sparkContext)
    } finally {
      spark.stop()
    }
  }

  /**
   * Builds the sample graph and prints the result of each demonstrated operation.
   *
   * Results are always brought to the driver with `collect()` before printing, so the
   * lines are emitted by the driver rather than by task threads.
   *
   * @param sc context used to create the sample RDDs
   */
  private def runDemo(sc: SparkContext): Unit = {
    // Vertices: (id, (name, age)). The tuple payload may hold any types.
    val users: RDD[(Long, (String, Int))] = sc.makeRDD(
      Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50)),
        (7L, ("KB11", 8)),
        (8L, ("KB12", 7)),
        (9L, ("KB13", 9))
      )
    )

    // Directed edges (src, dst, attribute). Vertices 7, 8 and 9 only link to each other,
    // so the edge list describes two disjoint groups.
    val edges: RDD[Edge[Int]] = sc.makeRDD(
      Array(
        Edge(2L, 1L, 7),
        Edge(3L, 2L, 4),
        Edge(4L, 1L, 1),
        Edge(2L, 4L, 2),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(3L, 6L, 3),
        Edge(5L, 6L, 3),
        Edge(7L, 8L, 12),
        Edge(8L, 9L, 32),
        Edge(9L, 7L, 35)
      )
    )

    val graph: Graph[(String, Int), Int] = Graph(users, edges)

    // Swap the (name, age) tuple for a User whose degree fields start at 0.
    val graph1: Graph[User, Int] = graph.mapVertices { case (_, (name, age)) =>
      User(name, age, 0, 0)
    }
    graph1.vertices.collect().foreach(println)
    println("---------------------------------")

    val inDegrees: VertexRDD[Int] = graph.inDegrees
    inDegrees.collect().foreach(println)
    // A vertex that is missing from inDegrees arrives as None and falls back to 0.
    val graph2: Graph[User, Int] = graph1.outerJoinVertices(inDegrees) { (_, user, inDeg) =>
      user.copy(inDeg = inDeg.getOrElse(0))
    }
    graph2.vertices.collect().foreach(println)

    val outDegrees: VertexRDD[Int] = graph.outDegrees
    println("-------------------------")
    outDegrees.collect().foreach(println)
    // Same fallback for vertices that are missing from outDegrees.
    val graph3: Graph[User, Int] = graph2.outerJoinVertices(outDegrees) { (_, user, outDeg) =>
      user.copy(outDeg = outDeg.getOrElse(0))
    }
    println("-------------------------")
    // Output reads "<name> likes <outDeg> people, is liked by <inDeg> people".
    graph3.vertices.collect().foreach { case (_, user) =>
      println(s"${user.name}喜欢 ${user.outDeg}人,被 ${user.inDeg}人喜欢。")
    }

    println("---------------pageRank-----------------------")
    // 0.0001 is the convergence tolerance passed to PageRank.
    val graph44: Graph[Double, Double] = graph.pageRank(0.0001)
    graph44.vertices.collect().foreach(println)
    // Sample output recorded by the original author for tolerance 0.0001. It lists only
    // vertices 1-6, so it was presumably captured before vertices 7-9 were added:
    //   (6,0.9969646507526427)
    //   (2,0.9969646507526427)
    //   (1,1.7924127957615184)
    //   (5,0.5451618049228395)
    //   (3,0.6996243163176441)
    //   (4,0.9688717814927127)

    println("---------connectedComponents------------")
    val graphConn: Graph[VertexId, Int] = graph.connectedComponents()
    graphConn.triplets.collect().foreach(println)

    // Extra per-vertex data used by the join examples; it covers only some of the vertices.
    val emailRDD: RDD[(Long, String)] = sc.parallelize(
      Array(
        (1L, "qq.com"),
        (3L, "163.com"),
        (6L, "souhu.com"),
        (7L, "fox.com")
      )
    )
    val phoneRDD: RDD[(Long, String)] = sc.parallelize(
      Array(
        (1L, "12345678922"),
        (3L, "22232342433"),
        (6L, "23543652577")
      )
    )

    // NOTE: the three joined graphs below are lazy and never materialized, exactly as in the
    // original demo. Call `<graph>.vertices.collect().foreach(println)` to inspect one.

    // joinVertices: the function receives the matching email domain directly (no Option).
    val graphjoin: Graph[(String, Int), Int] = graph.joinVertices(emailRDD) { (_, attr, domain) =>
      (attr._1 + "@" + domain, attr._2)
    }
    println("----------------------------------------------")
    println("-----------------------------------")

    // outerJoinVertices: vertices without an email get the default domain "tao.com".
    val graphjoin2: Graph[(String, Int), Int] = graph.outerJoinVertices(emailRDD) { (_, attr, domain) =>
      val resolved: String = domain.getOrElse("tao.com")
      (attr._1 + "@" + resolved, attr._2)
    }
    // Sample output recorded by the original author (vertices 1-6 only):
    //   (4,(David@tao.com,42))
    //   (1,(Alice@qq.com,28))
    //   (5,(Ed@tao.com,55))
    //   (6,(Fran@souhu.com,50))
    //   (2,(Bob@tao.com,27))
    //   (3,(Charlie@163.com,65))
    println("----------------------------------------------")
    println("--------------------------------------------------")
    // The joins return new graphs; the original `graph` vertices stay unchanged:
    //   (4,(David,42))
    //   (1,(Alice,28))
    //   (5,(Ed,55))
    //   (6,(Fran,50))
    //   (2,(Bob,27))
    //   (3,(Charlie,65))

    // outerJoinVertices with phone numbers: missing entries get a default number.
    val graphjoin3: Graph[(String, Int), Int] = graph.outerJoinVertices(phoneRDD) { (_, attr, phone) =>
      val resolved: String = phone.getOrElse("13611112222")
      (attr._1 + ":" + resolved, attr._2)
    }
    println("----------------------------------------------")
    // Sample output recorded by the original author (vertices 1-6 only):
    //   (4,(David:13611112222,42))
    //   (1,(Alice:12345678922,28))
    //   (5,(Ed:13611112222,55))
    //   (6,(Fran:23543652577,50))
    //   (2,(Bob:13611112222,27))
    //   (3,(Charlie:22232342433,65))
  }
}
// pregel函数 (the Pregel function)
// Reference: Spark GraphX 中的 pregel函数_Bamdli-CSDN博客
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Single-source shortest-path demo built on the GraphX Pregel operator.
 *
 * Every vertex ends up holding the smallest sum of edge attributes along any directed
 * path from the start vertex. Each message exchange is logged to stdout.
 */
object GraphDemo5 {

  /**
   * Starts a local SparkSession, runs the demo, and always stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("sparkgraph")
      .master("local[*]")
      .getOrCreate()
    // Stop the session even when a job fails so the local Spark threads are released.
    try {
      runDemo(spark.sparkContext)
    } finally {
      spark.stop()
    }
  }

  /**
   * Builds the sample graph, runs Pregel from the start vertex and prints the distances.
   *
   * @param sc context used to create the sample RDDs
   */
  private def runDemo(sc: SparkContext): Unit = {
    // Vertices: (id, (name, age)). The tuple payload may hold any types.
    val users: RDD[(Long, (String, Int))] = sc.makeRDD(
      Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      )
    )

    // Directed edges (src, dst, attribute); the attribute is used as the edge length below.
    val edges: RDD[Edge[Int]] = sc.makeRDD(
      Array(
        Edge(2L, 1L, 7),
        Edge(3L, 2L, 4),
        Edge(4L, 1L, 1),
        Edge(2L, 4L, 2),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(3L, 6L, 3),
        Edge(5L, 6L, 3)
      )
    )

    val graph: Graph[(String, Int), Int] = Graph(users, edges)

    // Source vertex of the search. The initial distances are derived from this value
    // instead of repeating the literal id.
    val startVertexId = 5L
    // Distance 0 for the source, "unreached" (Double.MaxValue) for every other vertex.
    val initGraph: Graph[Double, Int] = graph.mapVertices { (vid, _) =>
      if (vid == startVertexId) 0.0 else Double.MaxValue
    }
    initGraph.vertices.collect().foreach(println)
    // Sample output recorded by the original author (line order may differ):
    //   (4,1.7976931348623157E308)
    //   (6,1.7976931348623157E308)
    //   (1,1.7976931348623157E308)
    //   (2,1.7976931348623157E308)
    //   (3,1.7976931348623157E308)
    //   (5,0.0)
    println("------------------------------------------------")

    // The println calls inside the Pregel functions run inside Spark tasks; they show up on
    // this console because the master is local[*].
    val pregelGraph: Graph[Double, Int] = initGraph.pregel(
      Double.PositiveInfinity, // initial message delivered to every vertex in the first superstep
      10,                      // maximum number of supersteps
      EdgeDirection.Out        // active direction passed to Pregel
    )(
      // vprog: keep the smaller of the current distance and the received distance.
      (vid: VertexId, vd: Double, disMsg: Double) => {
        val min: Double = math.min(vd, disMsg)
        println(s"顶点$vid, 属性$vd 收到的消息$disMsg 属性值与收到的消息比较后结果取最小值 $min")
        min
      },
      // sendMsg: offer "source distance + edge length" to the destination, but only when it
      // improves on the distance the destination already holds.
      (edgeTriplet: EdgeTriplet[Double, Int]) => {
        val candidate: Double = edgeTriplet.srcAttr + edgeTriplet.attr
        println(s"顶点${edgeTriplet.srcId} 给${edgeTriplet.dstId} 发送消息:${edgeTriplet.srcAttr} " +
          s"与${edgeTriplet.attr} 相加结果: ${candidate}")
        if (candidate < edgeTriplet.dstAttr)
          Iterator[(VertexId, Double)]((edgeTriplet.dstId, candidate))
        else
          Iterator.empty
      },
      // mergeMsg: several offers for the same vertex collapse to the smallest one.
      (msg1: Double, msg2: Double) => math.min(msg1, msg2)
    )
    pregelGraph.vertices.collect().foreach(println)
    // Sample trace recorded by the original author (line order may differ between runs):
    //   顶点4, 属性1.7976931348623157E308 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 1.7976931348623157E308
    //   顶点3, 属性1.7976931348623157E308 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 1.7976931348623157E308
    //   顶点1, 属性1.7976931348623157E308 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 1.7976931348623157E308
    //   顶点5, 属性0.0 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 0.0
    //   顶点6, 属性1.7976931348623157E308 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 1.7976931348623157E308
    //   顶点2, 属性1.7976931348623157E308 收到的消息Infinity 属性值与收到的消息比较后结果取最小值 1.7976931348623157E308
    //   顶点5 给2 发送消息:0.0 与2 相加结果: 2.0
    //   顶点5 给3 发送消息:0.0 与8 相加结果: 8.0
    //   顶点2 给4 发送消息:1.7976931348623157E308 与2 相加结果: 1.7976931348623157E308
    //   顶点4 给1 发送消息:1.7976931348623157E308 与1 相加结果: 1.7976931348623157E308
    //   顶点2 给1 发送消息:1.7976931348623157E308 与7 相加结果: 1.7976931348623157E308
    //   顶点3 给2 发送消息:1.7976931348623157E308 与4 相加结果: 1.7976931348623157E308
    //   顶点3 给6 发送消息:1.7976931348623157E308 与3 相加结果: 1.7976931348623157E308
    //   顶点5 给6 发送消息:0.0 与3 相加结果: 3.0
    //   顶点6, 属性1.7976931348623157E308 收到的消息3.0 属性值与收到的消息比较后结果取最小值 3.0
    //   顶点2, 属性1.7976931348623157E308 收到的消息2.0 属性值与收到的消息比较后结果取最小值 2.0
    //   顶点3, 属性1.7976931348623157E308 收到的消息8.0 属性值与收到的消息比较后结果取最小值 8.0
    //   顶点2 给4 发送消息:2.0 与2 相加结果: 4.0
    //   顶点3 给6 发送消息:8.0 与3 相加结果: 11.0
    //   顶点2 给1 发送消息:2.0 与7 相加结果: 9.0
    //   顶点3 给2 发送消息:8.0 与4 相加结果: 12.0
    //   顶点4, 属性1.7976931348623157E308 收到的消息4.0 属性值与收到的消息比较后结果取最小值 4.0
    //   顶点1, 属性1.7976931348623157E308 收到的消息9.0 属性值与收到的消息比较后结果取最小值 9.0
    //   顶点4 给1 发送消息:4.0 与1 相加结果: 5.0
    //   顶点1, 属性9.0 收到的消息5.0 属性值与收到的消息比较后结果取最小值 5.0
    //   (1,5.0)
    //   (5,0.0)
    //   (4,4.0)
    //   (6,3.0)
    //   (2,2.0)
    //   (3,8.0)
  }
}
// 航班飞行网图分析 (flight network graph analysis)
import org.apache.spark.SparkContext
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
/**
 * Flight-network analysis with GraphX: airports are vertices, routes are edges.
 *
 * Reads `in/flight.csv` and prints the graph size, the longest and shortest routes, the
 * airports with the most inbound/outbound routes, the top PageRank airports, and the
 * cheapest fare from a fixed start airport to a fixed target airport.
 */
object FlightDemo {

  /**
   * Starts a local SparkSession, runs the analysis, and always stops the session afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().appName("flightdemo")
      .master("local[*]")
      .getOrCreate()
    // Stop the session even when a job fails so the local Spark threads are released.
    try {
      runAnalysis(spark.sparkContext)
    } finally {
      spark.stop()
    }
  }

  /**
   * Builds the airport graph from the CSV file and prints every statistic.
   *
   * @param sc context used to read the input file
   */
  private def runAnalysis(sc: SparkContext): Unit = {
    val flightRDD: RDD[String] = sc.textFile("in/flight.csv")
    // Split each line once; both the vertex and the edge extraction reuse the fields.
    val fields: RDD[Array[String]] = flightRDD.map(_.split(","))

    // All airport ids and names. Columns 5/6 feed the edge source and columns 7/8 the edge
    // destination, e.g. (12953,LGA).
    val airPort: RDD[(VertexId, String)] = fields
      .flatMap(x => Array((x(5).toLong, x(6)), (x(7).toLong, x(8))))
      .distinct()

    // All distinct routes with column 16 as the edge attribute (printed as the distance).
    val lines: RDD[Edge[Int]] = fields
      .map(x => (x(5).toLong, x(7).toLong, x(16).toInt))
      .distinct()
      .map(x => Edge(x._1, x._2, x._3))

    // The graph feeds many separate actions below, so keep it cached between them.
    val graph: Graph[String, Int] = Graph(airPort, lines).cache()

    // Number of airports (vertices) and routes (edges); both are plain counts.
    val numAirports: Long = graph.numVertices
    val numRoutes: Long = graph.numEdges
    println(s"机场数量:${numAirports}航线数量:${numRoutes}")

    // Three longest routes (largest edge attribute first).
    graph.triplets.sortBy(t => t.attr, ascending = false).take(3)
      .foreach(t => println(s"${t.srcAttr}到达${t.dstAttr}距离是${t.attr}"))

    // Three shortest routes (smallest edge attribute first).
    graph.triplets.sortBy(t => t.attr, ascending = true).take(3)
      .foreach(t => println(s"${t.srcAttr}到达${t.dstAttr}距离是${t.attr}"))

    // Airport with the most inbound routes. headOption keeps an empty graph from throwing.
    val byInDegree: RDD[(VertexId, Int)] = graph.inDegrees.sortBy(d => d._2, ascending = false)
    byInDegree.take(1).headOption.foreach(top => println(s"进场航班最多的机场${top}"))

    // Airport with the most outbound routes.
    val byOutDegree: RDD[(VertexId, Int)] = graph.outDegrees.sortBy(d => d._2, ascending = false)
    byOutDegree.take(1).headOption.foreach(top => println(s"出场航班最多的机场${top}"))

    // Three most important airports by PageRank (convergence tolerance 0.05).
    graph.pageRank(0.05).vertices.sortBy(r => r._2, ascending = false).take(3).foreach(println)
    println("--------------------------------------")

    // Cheapest fare from airport 10397 to the other airports.
    val startAirPort = 10397L
    // Airport whose cheapest fare is printed at the end.
    val targetAirPort = 12892L
    // Fare 0 at the start airport and "unreached" elsewhere; every edge is re-weighted from
    // its distance to a price of 180 + 0.15 * distance.
    val initGraph: Graph[Double, Double] = graph.mapVertices { (id, _) =>
      if (id == startAirPort) 0.0 else Double.MaxValue
    }.mapEdges(e => 180 + e.attr.toDouble * 0.15)

    val pregelGraph: Graph[Double, Double] = initGraph.pregel(
      Double.MaxValue,   // initial message: leaves every starting value unchanged under min
      Int.MaxValue,      // no practical cap on the number of supersteps
      EdgeDirection.Out  // active direction passed to Pregel
    )(
      // vprog: keep the cheaper of the current fare and the received fare.
      (_, dist, newDist) => math.min(dist, newDist),
      // sendMsg: offer "fare at source + edge price" when it beats the destination's fare.
      triple => {
        val candidate: Double = triple.attr + triple.srcAttr
        if (candidate < triple.dstAttr) Iterator((triple.dstId, candidate))
        else Iterator.empty
      },
      // mergeMsg: several offers for the same airport collapse to the cheapest one.
      (newDist1, newDist2) => math.min(newDist1, newDist2)
    )
    pregelGraph.vertices.filter(v => v._1 == targetAirPort).collect().foreach(println)
    // Sample output recorded by the original author:
    //   机场数量:301航线数量:4088
    //   HNL到达JFK距离是4983
    //   JFK到达HNL距离是4983
    //   HNL到达EWR距离是4963
    //   PSG到达WRG距离是31
    //   WRG到达PSG距离是31
    //   ACV到达CEC距离是56
    //   进场航班最多的机场(10397,152)
    //   出场航班最多的机场(10397,153)
    //   (10397,11.060247708032241)
    //   (13930,10.805558753161533)
    //   (11298,10.652656481033038)
    //   --------------------------------------
    //   (12892,472.05)
  }
}