数据格式
文件格式为CSV,字段之间分隔符为“,”
依次为:日、周、航空公司、飞机注册号、航班号、起飞机场编号、起飞机场、到达机场编号、到达机场、预计起飞时间(时分)、起飞时间、起飞延迟(分钟)、预计到达时间、到达时间、到达延迟(分钟)、预计飞行时间、飞行距离
构建航班飞行网图
装载CSV为RDD,每个机场作为顶点。关键字段:起飞机场编号、起飞机场、到达机场编号、到达机场、飞行距离
初始化顶点集airports:RDD[(VertexId,String)],顶点属性为机场名称
初始化边集lines:RDD[Edge],边属性为飞行距离
package flight
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
/**
 * Demo job that loads `in/flight.csv`, builds a GraphX flight network
 * (vertices = airports, edges = routes weighted by flight distance) and prints:
 *  - the airport with the highest in-degree and the one with the highest out-degree,
 *  - every airport ordered by PageRank,
 *  - the three cheapest destinations from a sampled source airport, where the
 *    price of one leg is `180.0 + distance * 0.15`.
 */
object jFlightDemo {

  /**
   * Runs the whole demo on a local Spark master.
   *
   * @param args command-line arguments (unused)
   * @throws IllegalArgumentException if the input yields no airports
   */
  def main(args: Array[String]): Unit = {
    // Single source of truth for master / app name (previously also set, with a
    // different app name, on the session builder).
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    try {
      val sc: SparkContext = spark.sparkContext

      // Parse each CSV line once and reuse it for both vertices and edges.
      // NOTE(review): assumes there is no header row and that every row has at
      // least 17 comma-separated fields with numeric values at indices 5, 7 and 16
      // - confirm against the data file.
      val rows: RDD[Array[String]] = sc.textFile("in/flight.csv").map(_.split(",")).cache()

      // Vertices: (origin airport id, origin airport name) and
      // (destination airport id, destination airport name), de-duplicated.
      val airPort: RDD[(VertexId, String)] = rows
        .flatMap(f => Seq((f(5).toLong, f(6)), (f(7).toLong, f(8))))
        .distinct()
        .cache()

      // Edges: origin airport id -> destination airport id, attribute = flight distance.
      val lines: RDD[Edge[Int]] = rows
        .map(f => (f(5).toLong, f(7).toLong, f(16).toInt))
        .distinct()
        .map { case (src, dst, distance) => Edge(src, dst, distance) }

      // Build the graph; it is reused by several actions below, hence the cache.
      val graph: Graph[String, Int] = Graph(airPort, lines).cache()

      // Number of airports and number of routes (computed but not printed,
      // matching the original demo output).
      val numVertices: Long = graph.numVertices
      val numEdges: Long = graph.numEdges

      // The three longest routes as "origin destination distance" (not printed).
      val longestRoutes: Array[String] = graph.triplets
        .sortBy(_.attr, ascending = false)
        .map(t => t.srcAttr + " " + t.dstAttr + " " + t.attr)
        .take(3)

      // Busiest airports: highest in-degree, then highest out-degree.
      // headOption avoids an ArrayIndexOutOfBoundsException on an edgeless graph.
      val inDegrees: VertexRDD[Int] = graph.inDegrees
      inDegrees.sortBy(_._2, ascending = false).take(1).headOption.foreach(println)

      val outDegrees: VertexRDD[Int] = graph.outDegrees
      outDegrees.sortBy(_._2, ascending = false).take(1).headOption.foreach(println)

      // Most important airports: PageRank with tolerance 0.05, highest rank first.
      val ranks: VertexRDD[Double] = graph.pageRank(tol = 0.05).vertices
      ranks.sortBy(_._2, ascending = false).collect().foreach(println)

      // Cheapest routes: single-source shortest path from a sampled source airport
      // to every other airport, with edge cost = 180.0 + distance * 0.15.
      val airportCount: Long = airPort.count()
      require(airportCount > 0, "in/flight.csv produced no airports; cannot pick a source airport")

      // A sample with fraction 1/count can legitimately be empty, so evaluate it once
      // and fall back to the first airport instead of calling first() on an empty RDD.
      val (srcAirportId, srcAirportName): (VertexId, String) = airPort
        .sample(withReplacement = false, fraction = 1.0 / airportCount, seed = 1)
        .take(1)
        .headOption
        .getOrElse(airPort.first())

      // Printed for reference only; it does not take part in the computation.
      val referenceId: Long = 10268L
      println((referenceId, srcAirportId, srcAirportName))

      // Source vertex starts at cost 0, every other vertex at +infinity;
      // edge attributes become the price of flying that leg.
      val initGraph: Graph[Double, Double] = graph
        .mapVertices((id, _) => if (id == srcAirportId) 0.0 else Double.PositiveInfinity)
        .mapEdges(e => e.attr.toDouble * 0.15 + 180.0)

      val cheapest: Graph[Double, Double] = initGraph.pregel(
        Double.PositiveInfinity, // initial message
        Int.MaxValue,            // maximum number of iterations
        EdgeDirection.Out        // direction in which messages are sent
      )(
        // vprog: keep the smaller of the current cost and the merged incoming cost.
        (id, dist, newDist) => math.min(dist, newDist),
        // sendMsg: offer a cheaper cost to the destination only when going through
        // the source vertex improves on the destination's current cost.
        triplet => {
          val candidate = triplet.srcAttr + triplet.attr
          if (candidate < triplet.dstAttr) Iterator((triplet.dstId, candidate))
          else Iterator.empty
        },
        // mergeMsg: when several messages reach one vertex, keep the cheapest.
        (a, b) => math.min(a, b)
      )

      // Sample results noted by the original author:
      //   13296 -> 15919 cheapest fare is 558.75
      //   13296 -> 11618 cheapest fare is 211.35
      //   15919 -> 11618 cheapest fare is 349.65
      //   ((15919,558.75),(11618,211.35),349.65)
      val tuples: Array[(VertexId, Double)] = cheapest.vertices.sortBy(_._2).take(3)
      println(tuples.toList)
    } finally {
      // Always release the local Spark resources, even when a job fails.
      spark.stop()
    }
  }
}