[Spark]根据父子节点构建树

数据集中有child和parent两列,每一行表示一条父子关系。需要根据这些父子关系构建树,为每个节点找到根节点以及从该节点到根节点的完整调用链。

 一、递归遍历

import org.apache.spark.sql.DataFrame;
// SQLContext built on `sc`, which is not defined in this snippet —
// presumably the SparkContext that spark-shell provides (TODO confirm).
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Sample edges forming the single chain a -> b -> c -> d -> e.
// Each row carries: child, parent, grandparent, the chain text so far, and a date string.
val df = sqlContext
  .createDataFrame(Seq(
    ("a", "b", "b", "a-b", "20190201"),
    ("b", "c", "c", "b-c", "20190201"),
    ("c", "d", "d", "c-d", "20190201"),
    ("d", "e", "e", "d-e", "20190201")
  ))
  .toDF("child", "parent", "grandparent", "chain", "dt")

// Working copy of `df` with every column suffixed by "1", so it can be joined
// back against `df` without column-name clashes. `findroot` reassigns this var.
var result = df.selectExpr(
  "child as child1",
  "parent as parent1",
  "grandparent as grandparent1",
  "chain as chain1",
  "dt as dt1"
)

/**
 * Recursively re-joins `df` against the working DataFrame until no row of the
 * shared `result` var has a non-null `grandparent1`.
 *
 * Note that the stop condition is evaluated on the shared `result` var, not on
 * the `dataframe` argument; the argument only replaces `result` once another
 * round is needed.
 *
 * @param dataframe DataFrame with columns child1, parent1, grandparent1, chain1, dt1
 * @return the value of `result` at the moment the stop condition holds
 */
def findroot(dataframe:DataFrame) : DataFrame = {
    // Probe a single row: is there still anything with a grandparent1 value?
    val unresolved = result.where("grandparent1 is not null").take(1)
    if (unresolved.isEmpty) {
        result
    } else {
        result = dataframe
        // Left outer join keeps every row of `df`; rows whose parent has no
        // matching child1 get nulls for all of the "1"-suffixed columns.
        val joined = df.join(result, df("parent") === result("child1"), "left_outer")
        result = joined.selectExpr(
            "child as child1",
            "NVL(parent1, parent) as parent1",
            "grandparent1",
            "concat_ws('-',child,chain1) as chain1",
            "dt as dt1"
        )
        findroot(result)
    }
}

// Run the recursive expansion starting from the renamed working copy,
// then print the resulting rows.
val output = findroot(result)

output.show()

运行结果如下:

二、优化

  import org.apache.spark.sql.functions._
  import org.apache.spark.sql.DataFrame
  /**
   * Walks the parent links of every row upwards, one level per round, recording
   * the path as a "->"-separated `chain` and the last parent reached as `root`.
   *
   * A helper column `tmp` holds the node to look up next. Each round left-joins
   * the current rows to `df` on `source.child === result.tmp`; the loop stops
   * once no row has a non-null `tmp`.
   *
   * `df` is cached on entry and unpersisted before returning. Every intermediate
   * result is cached and is not unpersisted here.
   *
   * NOTE(review): a cycle in the parent links would appear to keep `tmp`
   * non-null in every round, so this would not terminate — confirm inputs are acyclic.
   *
   * @param df DataFrame with at least the columns `child` and `parent`
   * @return `child`, `parent`, `chain`, `root` when at least one round ran;
   *         otherwise all columns of `df` plus `chain` and `root`
   */
  def findRoot(df:DataFrame):DataFrame = {
    import df.sparkSession.implicits._
    df.cache()

    // Round zero: the chain is "child->parent", and the parent is both the
    // provisional root and the next node to look up.
    val seeded = df
      .withColumn("chain", concat_ws("->", df("child"), df("parent")))
      .withColumn("root", df("parent"))
      .withColumn("tmp", df("parent"))

    @scala.annotation.tailrec
    def climb(current: DataFrame): DataFrame = {
      // Fetch at most one row to decide whether any lookup is still pending.
      val pending = current.where("tmp is not null").take(1)
      if (pending.isEmpty) {
        current
      } else {
        val frontierMatchesChild =
          ($"source.child" === $"result.tmp").and($"result.tmp".isNotNull)
        val advanced = current.as("result")
          .join(df.as("source"), frontierMatchesChild, "left_outer")
          .select(
            col("result.child"),
            col("result.parent"),
            // concat_ws skips nulls, so rows without a match keep their chain.
            concat_ws("->", col("result.chain"), col("source.parent")).as("chain"),
            // Keep the previous root when there is no further parent.
            coalesce(col("source.parent"), col("result.root")).as("root"),
            col("source.parent").as("tmp")
          )
        advanced.cache()
        climb(advanced)
      }
    }

    val resolved = climb(seeded)

    df.unpersist()
    resolved.drop("tmp")
  }

// SQLContext built on `sc`, which is not defined in this snippet —
// presumably the SparkContext that spark-shell provides (TODO confirm).
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

// Sample child/parent edges forming the single chain a -> b -> c -> d -> e.
val df = sqlContext
  .createDataFrame(Seq(
    ("a", "b"),
    ("b", "c"),
    ("c", "d"),
    ("d", "e")
  ))
  .toDF("child", "parent")

// Resolve the chain and root for every row, then print the result.
val result = findRoot(df)
result.show()

运行结果如下:

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值